diff --git a/.circleci/config.yml b/.circleci/config.yml index 7a566f8787..6e130a8144 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,511 +1,33 @@ version: 2.1 -orbs: - win: circleci/windows@2.2.0 - -executors: - py37: - docker: - - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.7 - py38: - docker: - - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.8 - py39: - docker: - - image: 350138855857.dkr.ecr.eu-west-2.amazonaws.com/kedro-builder:3.9 - -commands: - setup_conda: - description: Activate conda environment - steps: - - run: - name: Run conda.sh - command: echo ". /home/circleci/miniconda/etc/profile.d/conda.sh" >> $BASH_ENV - - run: - name: Activate conda environment - command: echo "conda deactivate; conda activate kedro_builder" >> $BASH_ENV - - run: - # pytables does not work properly with python 3.9 to handle our HDFDataSet - # if pip-installed, so we install this dependency via conda - name: Install conda packages - command: echo "conda install -c conda-forge pytables -y" >> $BASH_ENV - - setup_requirements: - description: Install PIP dependencies - steps: - - run: - name: Install pip setuptools - command: make install-pip-setuptools - - run: - name: Install requirements - command: pip install -r requirements.txt -U - - run: - # Virtualenv 20.0.20 broke pre-commit, capped for now - name: Install venv for some pre-commit hooks - command: conda install -y "virtualenv<20.0" - - run: - name: Install test requirements - command: pip install -r test_requirements.txt -U - - run: - # Since recently Spark installation for some reason does not have enough permissions to execute - # /home/circleci/miniconda/envs/kedro_builder/lib/python3.X/site-packages/pyspark/bin/spark-class. - # So fixing it manually here. 
- name: Fix Spark permissions - command: sudo chmod -R u+x /home/circleci/miniconda/envs/kedro_builder/lib/ - - run: - name: Print Python environment - command: make print-python-env - - run: - name: Pip freeze - command: pip freeze - - setup_pre_commit: - description: Install pre-commit hooks - steps: - - run: - name: Install pre-commit hooks - command: pre-commit install --install-hooks - - run: - name: Run pre-commit hooks - command: pre-commit install --hook-type pre-push - - unit_tests: - description: Run unit tests - steps: - - checkout - - setup_conda - - setup_requirements - - run: - name: Run unit tests - command: make test - - lint: - description: Run linters - steps: - - checkout - - setup_conda - - setup_requirements - - setup_pre_commit - - run: - name: Run linters - command: make lint - - run: - name: Check legal headers - command: make legal - - e2e_tests: - description: Run all end to end tests - steps: - - checkout - - setup_conda - - setup_requirements - - run: - name: Run e2e tests - command: make e2e-tests - - build_docs: - description: Build docs - steps: - - checkout - - setup_conda - - setup_requirements - - run: - name: Build docs - command: make build-docs - - docs_linkcheck: - description: Build docs and check for broken links - steps: - - checkout - - setup_conda - - setup_requirements - - run: - name: Check for broken links - command: make linkcheck - - pip_compile: - description: Pip-compile requirements file - steps: - - checkout - - setup_conda - - setup_requirements - - run: - name: Pip-compile requirements file - command: make pip-compile - - # Windows-related commands - win_setup_conda: - description: Setup conda - parameters: - python_version: - type: string - steps: - - run: - name: Initialize conda - command: conda init powershell - - run: - name: Create 'kedro_builder' conda environment - command: | - conda create --name kedro_builder python=<< parameters.python_version >> -y - - win_setup_env: - description: Setup environment - steps: - - run: - # Required for Tensorflow tests - name: Install Microsoft Visual C++ Redistributable - command: | - $ProgressPreference = "SilentlyContinue" - Invoke-WebRequest https://aka.ms/vs/16/release/vc_redist.x64.exe -OutFile vc_redist.x64.exe - .\vc_redist.x64.exe /S /v/qn - - run: - name: Install Java 8 - command: | - $ProgressPreference = "SilentlyContinue" - Invoke-WebRequest https://github.com/AdoptOpenJDK/openjdk8-upstream-binaries/releases/download/jdk8u252-b09/OpenJDK8U-jdk_x64_windows_8u252b09.zip -OutFile OpenJDK8U.zip - Expand-Archive .\OpenJDK8U.zip -DestinationPath C:\OpenJDK8U - - run: - name: Create Inbound rules for Java - command: | - New-NetFirewallRule -DisplayName "Allow JDK UDP" -Profile "Public" -Protocol "UDP" -Direction Inbound -Program "C:\OpenJDK8U\openjdk-8u252-b09\bin\java.exe" -Action Allow - New-NetFirewallRule -DisplayName "Allow JDK TCP" -Profile "Public" -Protocol "TCP" -Direction Inbound -Program "C:\OpenJDK8U\openjdk-8u252-b09\bin\java.exe" -Action Allow - - run: - name: Set Java environment variables - command: | - [Environment]::SetEnvironmentVariable("Path", [Environment]::GetEnvironmentVariable('Path', 'Machine') + ";C:\OpenJDK8U\openjdk-8u252-b09\bin", "Machine") - setx /m JAVA_HOME "C:\OpenJDK8U\openjdk-8u252-b09" - - run: - name: Setup Hadoop binary - command: | - $ProgressPreference = "SilentlyContinue" - Invoke-WebRequest https://github.com/steveloughran/winutils/raw/master/hadoop-2.6.3/bin/winutils.exe -OutFile winutils.exe - New-Item -ItemType directory -Path C:\hadoop\bin 
- mv .\winutils.exe C:\hadoop\bin - setx /m HADOOP_HOME "C:\hadoop\" - - run: - name: Install 'make' command - command: choco install make - - win_setup_requirements: - description: Install Kedro dependencies - steps: - # pytables and Fiona have a series of binary dependencies under Windows that - # are best handled by conda-installing instead of pip-installing them. - - run: - name: Install pytables - command: conda activate kedro_builder; conda install -c conda-forge pytables -y - - run: - name: Install Fiona - command: conda activate kedro_builder; conda install -c conda-forge fiona -y - - run: - name: Install all requirements - command: conda activate kedro_builder; pip install -r test_requirements.txt -U - - run: - name: Print Python environment - command: conda activate kedro_builder; make print-python-env - - run: - name: Pip freeze - command: conda activate kedro_builder; pip freeze - - win_unit_tests: - description: Run unit tests - steps: - - checkout - - win_setup_env - - restore_cache: - key: kedro-deps-v1-win-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }} - - win_setup_requirements - - run: - # geopandas and tensorflow conflicts when imported simultaneously. - # The HDF5 header files used to compile this application do not match - # the version used by the HDF5 library to which this application is linked. - # Data corruption or segmentation faults may occur if the application continues. - # This can happen when an application was compiled by one version of HDF5 but - # linked with a different version of static or shared HDF5 library. - # You should recompile the application or check your shared library related - # settings such as 'LD_LIBRARY_PATH'. - # You can, at your own risk, disable this warning by setting the environment - # variable 'HDF5_DISABLE_VERSION_CHECK' to a value of '1'. - # Setting it to 2 or higher will suppress the warning messages totally. 
- name: Set HDF5_DISABLE_VERSION_CHECK environment variable - command: setx /m HDF5_DISABLE_VERSION_CHECK 1 - - run: - name: Run unit tests - command: | - conda activate kedro_builder - pytest .\tests --ignore .\tests\extras\datasets\spark --ignore .\tests\extras\datasets\tensorflow --ignore tests\framework\session\test_session_hooks.py --no-cov - - win_e2e_tests: - description: Run all end to end tests - steps: - - checkout - - run: - name: Install dependencies - command: | - conda activate kedro_builder - pip install -r features/windows_reqs.txt - choco install make - - run: - name: Run e2e tests - command: conda activate kedro_builder; make e2e-tests - - win_pip_compile: - description: Pip-compile requirements file - parameters: - cache_save: - type: boolean - default: false - steps: - - checkout - - win_setup_env - - restore_cache: - key: kedro-deps-v1-win-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }} - - win_setup_requirements - - when: - # Cache when `parameters.cache_save` is True - condition: << parameters.cache_save >> - steps: - - save_cache: - key: kedro-deps-v1-win-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }} - paths: - # Cache pip cache and conda packages directories - - c:\tools\miniconda3\pkgs - - c:\users\circleci\appdata\local\pip\cache - - run: - name: Pip-compile requirements file - command: conda activate kedro_builder; make pip-compile - -jobs: - unit_tests_37: - executor: py37 - steps: [unit_tests] - - linters_37: - executor: py37 - steps: [lint] - - e2e_tests_37: - executor: py37 - steps: [e2e_tests] - - docs_37: - executor: py37 - steps: [build_docs] +parameters: + release_kedro: + type: boolean + default: false + # The parameters below are set in CircleCI UI. + # https://app.circleci.com/settings/project/github/kedro-org/kedro/triggers?return-to=https%3A%2F%2Fapp.circleci.com%2Fpipelines%2Fgithub%2Fkedro-org%2Fkedro&triggerSource=&scheduledTriggerId=61f7226f-f092-4449-98d9-40420f8c46b2&success=true + run_hourly: + type: boolean + default: false + run_nightly: + type: boolean + default: false + +setup: true - docs_linkcheck_37: - executor: py37 - steps: [docs_linkcheck] - - unit_tests_38: - executor: py38 - steps: - - checkout - - setup_conda - - setup_requirements - - run: - name: Run unit tests without Spark - command: make test-no-spark - - linters_38: - executor: py38 - steps: [lint] - - e2e_tests_38: - executor: py38 - steps: [e2e_tests] - - unit_tests_39: - executor: py39 - steps: - - checkout - - setup_conda - - setup_requirements - - run: - name: Run unit tests without Spark - command: make test-no-spark - - linters_39: - executor: py39 - steps: [lint] - - e2e_tests_39: - executor: py39 - steps: [e2e_tests] - - pip_compile_37: - executor: py37 - steps: [pip_compile] - - pip_compile_38: - executor: py38 - steps: [pip_compile] - - pip_compile_39: - executor: py39 - steps: [pip_compile] - - all_circleci_checks_succeeded: - docker: - - image: circleci/python # any light-weight image - - steps: - - run: - name: Success! 
- command: echo "All checks passed" - - # Windows-related jobs - win_unit_tests_37: - executor: - name: win/default - steps: - - win_setup_conda: - python_version: "3.7" - - win_unit_tests - - win_unit_tests_38: - executor: - name: win/default - steps: - - win_setup_conda: - python_version: "3.8" - - checkout - - win_setup_env - - restore_cache: - key: kedro-deps-v1-win-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }} - - win_setup_requirements - - run: - name: Set HDF5_DISABLE_VERSION_CHECK environment variable - command: setx /m HDF5_DISABLE_VERSION_CHECK 1 - - run: - name: Run unit tests without Spark and TensorFlow - # Run `test_parallel_runner.py` separately because of `Windows fatal exception: stack overflow` - command: | - conda activate kedro_builder - pytest .\tests --ignore .\tests\extras\datasets\spark --ignore .\tests\extras\datasets\tensorflow --ignore tests\framework\session\test_session_hooks.py --ignore .\tests\runner\test_parallel_runner.py --no-cov - if ($?) { pytest .\tests\runner\test_parallel_runner.py --no-cov } - - win_unit_tests_39: - executor: - name: win/default - steps: - - win_setup_conda: - python_version: "3.9" - - checkout - - win_setup_env - - restore_cache: - key: kedro-deps-v1-win-{{ checksum "requirements.txt" }}-{{ checksum "test_requirements.txt" }} - - win_setup_requirements - - run: - name: Set HDF5_DISABLE_VERSION_CHECK environment variable - command: setx /m HDF5_DISABLE_VERSION_CHECK 1 - - run: - name: Run unit tests without Spark and TensorFlow - # Run `test_parallel_runner.py` separately because of `Windows fatal exception: stack overflow` - command: | - conda activate kedro_builder - pytest .\tests --ignore .\tests\extras\datasets\spark --ignore .\tests\extras\datasets\tensorflow --ignore tests\framework\session\test_session_hooks.py --ignore .\tests\runner\test_parallel_runner.py --no-cov - if ($?) { pytest .\tests\runner\test_parallel_runner.py --no-cov } - - win_e2e_tests_37: - executor: - name: win/default - steps: - - win_setup_conda: - python_version: "3.7" - - win_e2e_tests - - win_e2e_tests_38: - executor: - name: win/default - steps: - - win_setup_conda: - python_version: "3.8" - - win_e2e_tests - - win_e2e_tests_39: - executor: - name: win/default - steps: - - win_setup_conda: - python_version: "3.9" - - win_e2e_tests - - win_pip_compile_37: - executor: - name: win/default - steps: - - win_setup_conda: - python_version: "3.7" - - win_pip_compile: - # Save cache only for Python 3.7. There is no need to save it for each Python. 
- cache_save: true - - win_pip_compile_38: - executor: - name: win/default - steps: - - win_setup_conda: - python_version: "3.8" - - win_pip_compile - - win_pip_compile_39: - executor: - name: win/default - steps: - - win_setup_conda: - python_version: "3.9" - - win_pip_compile +orbs: + path-filtering: circleci/path-filtering@0.1.1 workflows: - version: 2 regular: jobs: - - docs_linkcheck_37 - - unit_tests_37 - - linters_37 - - e2e_tests_37 - - docs_37 - - unit_tests_38 - - linters_38 - - e2e_tests_38 - - unit_tests_39 - - linters_39 - - e2e_tests_39 - - pip_compile_37 - - pip_compile_38 - - pip_compile_39 - - win_unit_tests_37 - - win_unit_tests_38 - - win_unit_tests_39 - - win_pip_compile_37 - - win_pip_compile_38 - - win_pip_compile_39 - - win_e2e_tests_37 - - win_e2e_tests_38 - - win_e2e_tests_39 - - all_circleci_checks_succeeded: - requires: - - unit_tests_37 - - linters_37 - - e2e_tests_37 - - docs_37 - - docs_linkcheck_37 - - unit_tests_38 - - linters_38 - - e2e_tests_38 - - unit_tests_39 - - linters_39 - - e2e_tests_39 - - pip_compile_37 - - pip_compile_38 - - pip_compile_39 - - win_pip_compile_37 - - win_pip_compile_38 - - win_pip_compile_39 - - win_unit_tests_37 - # Skipped due to Windows fatal exception: stack overflow - # - win_unit_tests_38 - # - win_unit_tests_39 - - win_e2e_tests_37 - - win_e2e_tests_38 - - win_e2e_tests_39 + # the path-filtering/filter job determines which pipeline + # parameters to update, i.e. which builds to run. + - path-filtering/filter: + name: check-updated-files + base-revision: main + config-path: .circleci/continue_config.yml + # + mapping: | + docs/.* docs_change true + ^((?!docs/).)*$ code_change true diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml new file mode 100644 index 0000000000..c83d6615cb --- /dev/null +++ b/.circleci/continue_config.yml @@ -0,0 +1,681 @@ +version: 2.1 + + +# the default pipeline parameters, which will be updated according to +# the results of the path-filtering orb +parameters: + code_change: + type: boolean + default: false + docs_change: + type: boolean + default: false + release_kedro: + type: boolean + default: false + # The parameters below are set in CircleCI UI. + # https://app.circleci.com/settings/project/github/kedro-org/kedro/triggers?return-to=https%3A%2F%2Fapp.circleci.com%2Fpipelines%2Fgithub%2Fkedro-org%2Fkedro&triggerSource=&scheduledTriggerId=61f7226f-f092-4449-98d9-40420f8c46b2&success=true + run_hourly: + type: boolean + default: false + run_nightly: + type: boolean + default: false + +orbs: + win: circleci/windows@2.4.1 + +# No windows executor is listed here since windows builds use win/default and modify +# the Python version through the conda environment. +executors: + docker: + parameters: + python_version: + type: string + docker: + - image: public.ecr.aws/g0x0s3o2/kedro-builder:<> + resource_class: medium+ + +commands: + setup_conda: + steps: + - run: + name: Run conda.sh + command: echo ". 
/home/circleci/miniconda/etc/profile.d/conda.sh" >> $BASH_ENV + - run: + name: Activate conda environment + command: echo "conda deactivate; conda activate kedro_builder" >> $BASH_ENV + + setup_requirements: + steps: + - run: + name: Install pip setuptools + command: make install-pip-setuptools + - run: + name: Install venv for some pre-commit hooks + command: conda install -y virtualenv + - run: + # pytables does not work properly with python 3.9 to handle our HDFDataSet + # if pip-installed, so we install this dependency via conda + name: Install pytables + command: conda install -c conda-forge pytables -y + - run: + name: Install requirements and test requirements + command: pip install --upgrade .[test] + - run: + # this is needed to fix java cacerts so + # spark can automatically download packages from mvn + # https://stackoverflow.com/a/50103533/1684058 + name: Fix cacerts + command: | + sudo rm /etc/ssl/certs/java/cacerts + sudo update-ca-certificates -f + - run: + # Since recently Spark installation for some reason does not have enough permissions to execute + # /home/circleci/miniconda/envs/kedro_builder/lib/python3.X/site-packages/pyspark/bin/spark-class. + # So fixing it manually here. + name: Fix Spark permissions + command: sudo chmod -R u+x /home/circleci/miniconda/envs/kedro_builder/lib/ + - run: + name: Print Python environment + command: make print-python-env + - run: + name: Pip freeze + command: pip freeze + + setup: + steps: + - checkout + - setup_conda + - setup_requirements + + # Windows specific commands + win_setup_conda: + parameters: + python_version: + type: string + steps: + - run: + name: Initialize conda + command: conda init powershell + - run: + name: Create 'kedro_builder' conda environment + command: conda create -n kedro_builder python=<> -y + + win_setup_env: + steps: + - run: + # Required for Tensorflow tests + name: Install Microsoft Visual C++ Redistributable + command: | + $ProgressPreference = "SilentlyContinue" + Invoke-WebRequest https://aka.ms/vs/16/release/vc_redist.x64.exe -OutFile vc_redist.x64.exe + .\vc_redist.x64.exe /S /v/qn + - run: + name: Install Java 8 + command: | + $ProgressPreference = "SilentlyContinue" + Invoke-WebRequest https://github.com/AdoptOpenJDK/openjdk8-upstream-binaries/releases/download/jdk8u252-b09/OpenJDK8U-jdk_x64_windows_8u252b09.zip -OutFile OpenJDK8U.zip + Expand-Archive .\OpenJDK8U.zip -DestinationPath C:\OpenJDK8U + - run: + name: Create Inbound rules for Java + command: | + New-NetFirewallRule -DisplayName "Allow JDK UDP" -Profile "Public" -Protocol "UDP" -Direction Inbound -Program "C:\OpenJDK8U\openjdk-8u252-b09\bin\java.exe" -Action Allow + New-NetFirewallRule -DisplayName "Allow JDK TCP" -Profile "Public" -Protocol "TCP" -Direction Inbound -Program "C:\OpenJDK8U\openjdk-8u252-b09\bin\java.exe" -Action Allow + - run: + name: Set Java environment variables + command: | + [Environment]::SetEnvironmentVariable("Path", [Environment]::GetEnvironmentVariable('Path', 'Machine') + ";C:\OpenJDK8U\openjdk-8u252-b09\bin", "Machine") + setx /m JAVA_HOME "C:\OpenJDK8U\openjdk-8u252-b09" + - run: + name: Setup Hadoop binary + command: | + $ProgressPreference = "SilentlyContinue" + Invoke-WebRequest https://github.com/steveloughran/winutils/raw/master/hadoop-2.6.3/bin/winutils.exe -OutFile winutils.exe + New-Item -ItemType directory -Path C:\hadoop\bin + mv .\winutils.exe C:\hadoop\bin + setx /m HADOOP_HOME "C:\hadoop\" + - run: + name: Install 'make' command + command: choco install make + + win_setup_requirements: + 
parameters: + python_version: + type: string + steps: + - restore_cache: + name: Restore package cache + key: kedro-deps-v1-win-{{ checksum "pyproject.toml" }}-{{ checksum "setup.py" }} + # We don't restore the conda environment cache for python 3.10 as it conflicts with the + # 'Install GDAL, Fiona and pytables' step breaking the conda environment (missing zlib.dll). + - unless: + condition: + equal: [ "3.10", <> ] + steps: + - restore_cache: + name: Restore conda environment cache + key: kedro-deps-v1-win-<>-{{ checksum "pyproject.toml" }}-{{ checksum "setup.py" }} + # pytables and Fiona have a series of binary dependencies under Windows that + # are best handled by conda-installing instead of pip-installing them. + # Dependency resolution works best when installing these altogether in one + # `conda install` command rather than one at a time in several sequential `conda install`s. + - run: + name: Install GDAL, Fiona and pytables + command: conda activate kedro_builder; conda install gdal fiona pytables -c conda-forge -y + - run: + name: Show pip information + command: conda activate kedro_builder; pip debug --verbose + - run: + name: Install all requirements + command: conda activate kedro_builder; pip install -v -U .[test] + - run: + name: Print Python environment + command: conda activate kedro_builder; make print-python-env + - run: + name: Pip freeze + command: conda activate kedro_builder; pip freeze + + win_setup: + parameters: + python_version: + type: string + steps: + - checkout + - win_setup_conda: + python_version: <> + - win_setup_env + - win_setup_requirements: + python_version: <> + +jobs: + e2e_tests: + parameters: + python_version: + type: string + executor: + name: docker + python_version: <> + environment: + COLUMNS: 120 + LINES: 25 + steps: + - setup + - run: + name: Run e2e tests + command: make e2e-tests + + win_e2e_tests: + parameters: + python_version: + type: string + executor: win/default + environment: + PIP_DISABLE_PIP_VERSION_CHECK: 1 + COLUMNS: 120 + LINES: 25 + PYTHONIOENCODING: utf-8 + steps: + - checkout + - win_setup_conda: + python_version: <> + - run: + name: Install 'make' command + command: choco install make + # We don't use the `win_setup` command here, which would install the full set + # of requirements used by unit tests. Even when those packages are cached + # it is faster to just install the minimal set of dependencies needed for e2e + # tests in a new empty environment rather than restore the cache. + - run: + name: Install dependencies + command: conda activate kedro_builder; pip install -r features/windows_reqs.txt + - run: + name: Run e2e tests + command: conda activate kedro_builder; make e2e-tests + + unit_tests: + parameters: + python_version: + type: string + executor: + name: docker + python_version: <> + steps: + - setup + - unless: + condition: + equal: ["3.10", <>] + steps: + - run: + name: Run unit tests in parallel + command: PYTEST_ADDOPTS="-v" make test + - when: + condition: + equal: [ "3.10", <> ] + steps: + - run: + name: Run unit tests sequentially + command: pytest -v tests --cov-config pyproject.toml + + + win_unit_tests: + parameters: + python_version: + type: string + executor: win/default + steps: + - win_setup: + python_version: <> + - run: + # geopandas and tensorflow conflicts when imported simultaneously. + # The HDF5 header files used to compile this application do not match + # the version used by the HDF5 library to which this application is linked. 
+ # Data corruption or segmentation faults may occur if the application continues. + # This can happen when an application was compiled by one version of HDF5 but + # linked with a different version of static or shared HDF5 library. + # You should recompile the application or check your shared library related + # settings such as 'LD_LIBRARY_PATH'. + # You can, at your own risk, disable this warning by setting the environment + # variable 'HDF5_DISABLE_VERSION_CHECK' to a value of '1'. + # Setting it to 2 or higher will suppress the warning messages totally. + name: Set HDF5_DISABLE_VERSION_CHECK environment variable + command: setx /m HDF5_DISABLE_VERSION_CHECK 1 + - unless: + condition: + equal: [ "3.10", <> ] + steps: + - run: + name: Run unit tests without spark in parallel + command: conda activate kedro_builder; make test-no-spark + - when: + condition: + equal: [ "3.10", <> ] + steps: + - run: + name: Run unit tests without spark sequentially + command: conda activate kedro_builder; pytest tests --no-cov --ignore tests/extras/datasets/spark + + lint: + parameters: + python_version: + type: string + executor: + name: docker + python_version: <> + steps: + - setup + - run: + name: Run linters + command: make lint + + pip_compile: + parameters: + python_version: + type: string + executor: + name: docker + python_version: <> + steps: + - setup + - run: + name: Pip-compile requirements file + command: make pip-compile + + win_pip_compile: + parameters: + python_version: + type: string + executor: win/default + steps: + - win_setup: + python_version: <> + - when: + # Save Python package cache only for Python 3.7. The conda environment itself + # is specific to a Python version and is cached separately for each. + condition: + equal: ["3.7", <>] + steps: + - save_cache: + name: Save Python package cache + key: kedro-deps-v1-win-{{ checksum "pyproject.toml" }}-{{ checksum "setup.py" }} + paths: + # Cache pip cache and conda packages directories + - c:\tools\miniconda3\pkgs + - c:\users\circleci\appdata\local\pip\cache + # We don't save the conda environment cache for python 3.10 due to conflicts with the + # 'Install GDAL, Fiona and pytables' and 'Restore conda environment cache' steps. + - unless: + condition: + equal: [ "3.10", <> ] + steps: + - save_cache: + name: Save conda environment cache + key: kedro-deps-v1-win-<>-{{ checksum "pyproject.toml" }}-{{ checksum "setup.py" }} + paths: + - c:\tools\miniconda3\envs\kedro_builder + - run: + name: Pip-compile requirements file + command: conda activate kedro_builder; make pip-compile + + sync: + docker: + # https://circleci.com/docs/2.0/circleci-images/#circleci-base-image + - image: cimg/base:2020.01 + steps: + - checkout + - add_ssh_keys + - run: + name: Set git email and name + command: | + git config --global user.email "kedro@kedro.com" + git config --global user.name "Kedro" + - run: + name: Trigger Read The Docs build + command: ./tools/circleci/rtd-build.sh ${RTD_TOKEN} latest + - run: + name: Maybe merge main into develop or raise a PR + command: ./tools/circleci/github_scripts/merge.sh . "main" "develop" "${GITHUB_TAGGING_TOKEN}" + - run: + name: Maybe trigger the release workflow + command: | + KEDRO_VERSION=$(./tools/circleci/github_scripts/kedro_version.py ./kedro) + if ./tools/circleci/check-no-version-pypi.sh "${KEDRO_VERSION}" + then + echo "Starting the release of Kedro ${KEDRO_VERSION}!" 
+ ./tools/circleci/circle-release.sh github/kedro-org/kedro + else + echo "Kedro version ${KEDRO_VERSION} already exists on PyPI, skipping..." + fi + + merge_pr_to_develop: + docker: + # https://circleci.com/docs/2.0/circleci-images/#circleci-base-image + - image: cimg/base:2020.01 + steps: + - checkout + - add_ssh_keys + - run: + name: Maybe merge an automatic PR into develop + command: ./tools/circleci/github_scripts/attempt_merge_pr.sh "merge-main-to-develop" "develop" "${GITHUB_TAGGING_TOKEN}" + + build_docker_image: + parameters: + python_version: + type: string + docker: + - image: cimg/python:3.8 + steps: + - setup_remote_docker: + docker_layer_caching: true + - checkout + - run: + name: Setup AWS CLI + command: pip install -U awscli + - run: + name: Login to AWS ECR + command: aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws + - run: + name: Build docker images + command: ./tools/circleci/docker_build_img/build.sh "." "public.ecr.aws/g0x0s3o2/kedro-builder" "<>" + no_output_timeout: 20m + - run: + name: Logout from AWS ECR + command: docker logout public.ecr.aws + when: always # logout even if the previous step has failed + + # This is effectively just a combination of the lint, unit_tests and e2e_tests jobs. + # It's used to check that the nightly docker image is working ok and before publishing a release. + build_kedro: + parameters: + python_version: + type: string + executor: + name: docker + python_version: <> + environment: + COLUMNS: 120 + LINES: 25 + steps: + - setup + - run: + name: Run linters + command: make lint + - unless: + condition: + equal: ["3.10", <>] + steps: + - run: + name: Run unit tests in parallel + command: make test + - when: + condition: + equal: [ "3.10", <> ] + steps: + - run: + name: Run unit tests sequentially + command: pytest tests --cov-config pyproject.toml + - run: + name: Run e2e tests + command: make e2e-tests + + publish_kedro: + executor: + name: docker + python_version: "3.7" + steps: + - setup + - add_ssh_keys + - run: + name: Check Kedro version + command: | + KEDRO_VERSION=$(./tools/circleci/github_scripts/kedro_version.py ./kedro) + if ./tools/circleci/check-no-version-pypi.sh "${KEDRO_VERSION}" + then + echo "export KEDRO_VERSION=\"${KEDRO_VERSION}\"" >> $BASH_ENV + else + echo "Error: Kedro version ${KEDRO_VERSION} already exists on PyPI" + exit 1 + fi + - run: + name: Tag and publish release on Github + command: ./tools/circleci/github_scripts/release.sh kedro-org kedro ${GITHUB_TAGGING_TOKEN} ${KEDRO_VERSION} + - run: + name: Publish to PyPI + command: | + make package + python -m pip install twine -U + python -m twine upload --repository-url ${TWINE_REPOSITORY_URL} dist/* + - run: + name: Trigger Read The Docs build + command: | + ./tools/circleci/rtd-build.sh ${RTD_TOKEN} stable + # give some time for GitHub release to propagate + # otherwise RTD fails to build a new tag + sleep 120 + ./tools/circleci/rtd-build.sh ${RTD_TOKEN} ${KEDRO_VERSION} + + # Trigger kedro-viz build to ensure tests in that project pass + viz_build: + docker: + - image: spotify/alpine # for bash and curl + steps: + - run: + name: Trigger kedro-viz build + command: | + curl --location --request POST \ + --url https://circleci.com/api/v2/project/github/kedro-org/kedro-viz/pipeline \ + --header "Circle-Token: $CIRCLE_VIZ_BUILD_TOKEN" \ + --header 'content-type: application/json' \ + --data '{"branch":"main"}' + + all_circleci_checks_succeeded: + docker: + - image: circleci/python # any 
light-weight image + steps: + - run: + name: Success! + command: echo "All checks passed" + +workflows: + version: 2.1 + + lint_only: + when: + and: + - <<pipeline.parameters.docs_change>> + - not: <<pipeline.parameters.code_change>> + - not: <<pipeline.parameters.release_kedro>> + - not: <<pipeline.parameters.run_hourly>> + - not: <<pipeline.parameters.run_nightly>> + jobs: + - lint: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + - all_circleci_checks_succeeded: + requires: + - lint + + build_code: + when: + and: + - <<pipeline.parameters.code_change>> + - not: <<pipeline.parameters.release_kedro>> + - not: <<pipeline.parameters.run_hourly>> + - not: <<pipeline.parameters.run_nightly>> + jobs: + - e2e_tests: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + - win_e2e_tests: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + - unit_tests: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + - win_unit_tests: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + - lint: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + - pip_compile: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + - win_pip_compile: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + - all_circleci_checks_succeeded: + requires: + - e2e_tests + - win_e2e_tests + - unit_tests + - win_unit_tests + - lint + - pip_compile + - win_pip_compile + + main_updated: + when: + and: + - not: <<pipeline.parameters.release_kedro>> + - not: <<pipeline.parameters.run_hourly>> + - not: <<pipeline.parameters.run_nightly>> + jobs: + - sync: + filters: + branches: + only: main + - viz_build: + filters: + branches: + only: main + + hourly_pr_merge: + when: + and: + - <<pipeline.parameters.run_hourly>> + - not: <<pipeline.parameters.release_kedro>> + - not: <<pipeline.parameters.run_nightly>> + jobs: + - merge_pr_to_develop: + filters: + branches: + only: main + + # Python versions that are supported on `main`. + nightly_build_main: + when: + and: + - <<pipeline.parameters.run_nightly>> + - not: <<pipeline.parameters.release_kedro>> + - not: <<pipeline.parameters.run_hourly>> + jobs: + - build_docker_image: + context: + - kedro-ecr-publish + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + filters: + branches: + only: main + - build_kedro: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + requires: + - build_docker_image-<<matrix.python_version>> + filters: + branches: + only: main + + # Python versions that are *only* supported on `develop`. + # If `develop` supports the same versions as `main`, comment this out. + # nightly_build_develop: + # when: + # and: + # - <<pipeline.parameters.run_nightly>> + # - not: <<pipeline.parameters.release_kedro>> + # - not: <<pipeline.parameters.run_hourly>> + # jobs: + # - build_docker_image: + # context: + # - kedro-ecr-publish + # matrix: + # parameters: + # python_version: [] + # filters: + # branches: + # only: develop + # - build_kedro: + # matrix: + # parameters: + # python_version: [] + # requires: + # - build_docker_image-<<matrix.python_version>> + # filters: + # branches: + # only: develop + + kedro_release: + when: + and: + - <<pipeline.parameters.release_kedro>> + - not: <<pipeline.parameters.run_hourly>> + - not: <<pipeline.parameters.run_nightly>> + jobs: + - build_kedro: + matrix: + parameters: + python_version: ["3.7", "3.8", "3.9", "3.10"] + - publish_kedro: + requires: + - build_kedro diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index 645204ef7a..d7a2a02611 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -8,21 +8,21 @@ assignees: '' --- ## Description -Short description of the problem here. + ## Context -How has this bug affected you? What were you trying to accomplish? + ## Steps to Reproduce -1. [First Step] + ## Expected Result -Tell us what should happen. + ## Actual Result -Tell us what happens instead. + ``` -- If you received an error, place it here. @@ -33,7 +33,7 @@ Tell us what happens instead.
``` ## Your Environment -Include as many relevant details about the environment in which you experienced the bug: + * Kedro version used (`pip show kedro` or `kedro -V`): * Python version used (`python -V`): diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 349cce48a4..37923fa5b0 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,11 +1,11 @@ blank_issues_enabled: false contact_links: - - name: Discord server + - name: Slack organisation about: Come chat with the community! - url: https://discord.gg/akJDeVaxnB + url: https://slack.kedro.org/ - name: Documentation url: https://kedro.readthedocs.io/en/stable/ about: To learn more about how Kedro works - - name: Case studies, articles and video tutorials - url: https://github.com/quantumblacklabs/kedro-community - about: Community-generated content, project examples and video tutorials + - name: Website + url: https://kedro.org/ + about: Learn about features, project examples and our demo diff --git a/.github/ISSUE_TEMPLATE/design-doc.md b/.github/ISSUE_TEMPLATE/design-doc.md index e656bf7b7f..43f70cb12e 100644 --- a/.github/ISSUE_TEMPLATE/design-doc.md +++ b/.github/ISSUE_TEMPLATE/design-doc.md @@ -8,37 +8,29 @@ assignees: '' --- ## Introduction - -A high-level, short overview of the problem(s) you are designing a solution for. + ## Background - -Provide the reader with the context surrounding the problem(s) you are trying to solve. + ## Problem - -Be as concrete as you can about: + ### What's in scope ### What's not in scope ## Design - -Explain your design to the solution here. Diagrams could help. + ### Alternatives considered - -Explain the trade off between different alternatives to your solution. + ## Testing - -Explain the testing strategies to verify your design correctness (if possible). + ## Rollout strategy - -Is the change backward compatible? If not, what is the migration strategy? + ## Future iterations - -Will there be future iterations of this design? + diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md index a7911c2f11..b52de098bd 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.md +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -8,13 +8,13 @@ assignees: '' --- ## Description -Is your feature request related to a problem? A clear and concise description of what the problem is: "I'm always frustrated when ..." + ## Context -Why is this change important to you? How would you use it? How can it benefit other users? + ## Possible Implementation -(Optional) Suggest an idea for implementing the addition or change. + ## Possible Alternatives -(Optional) Describe any alternative solutions or features you've considered. + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index fead997a81..d8da8611d3 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,6 @@ +> **NOTE:** Kedro datasets are moving from `kedro.extras.datasets` to a separate `kedro-datasets` package in +> [`kedro-plugins` repository](https://github.com/kedro-org/kedro-plugins). Any changes to the dataset implementations +> should be done by opening a pull request in that repository. 
## Description @@ -6,16 +9,8 @@ ## Checklist -- [ ] Read the [contributing](https://github.com/quantumblacklabs/kedro/blob/master/CONTRIBUTING.md) guidelines +- [ ] Read the [contributing](https://github.com/kedro-org/kedro/blob/main/CONTRIBUTING.md) guidelines - [ ] Opened this PR as a 'Draft Pull Request' if it is work-in-progress - [ ] Updated the documentation to reflect the code changes -- [ ] Added a description of this change and added my name to the list of supporting contributions in the [`RELEASE.md`](https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md) file +- [ ] Added a description of this change in the [`RELEASE.md`](https://github.com/kedro-org/kedro/blob/main/RELEASE.md) file - [ ] Added tests to cover my changes - -## Notice - -- [ ] I acknowledge and agree that, by checking this box and clicking "Submit Pull Request": - -- I submit this contribution under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0.txt) and represent that I am entitled to do so on behalf of myself, my employer, or relevant third parties, as applicable. -- I certify that (a) this contribution is my original creation and / or (b) to the extent it is not my original creation, I am authorised to submit this contribution on behalf of the original creator(s) or their licensees. -- I certify that the use of this contribution as authorised by the Apache 2.0 license does not violate the intellectual property rights of anyone else. diff --git a/.github/dco.yml b/.github/dco.yml new file mode 100644 index 0000000000..0c4b142e9a --- /dev/null +++ b/.github/dco.yml @@ -0,0 +1,2 @@ +require: + members: false diff --git a/.github/demo-dark.png b/.github/demo-dark.png new file mode 100644 index 0000000000..5f6a2bf125 Binary files /dev/null and b/.github/demo-dark.png differ diff --git a/.github/demo-light.png b/.github/demo-light.png new file mode 100644 index 0000000000..bdda6c9b03 Binary files /dev/null and b/.github/demo-light.png differ diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..65736fe738 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,13 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" + labels: + - "dependencies" diff --git a/.github/stale.yml b/.github/stale.yml index e556fa9854..2e0a4c333c 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -2,6 +2,9 @@ daysUntilStale: 60 # Number of days of inactivity before a stale issue is closed daysUntilClose: 7 +# Only issues or pull requests with all of these labels are checked if stale. 
Defaults to `[]` (disabled) +onlyLabels: + - Community # Issues with these labels will never be considered stale exemptLabels: - pinned diff --git a/.github/workflows/all-checks.yml b/.github/workflows/all-checks.yml new file mode 100644 index 0000000000..d023b0dfd2 --- /dev/null +++ b/.github/workflows/all-checks.yml @@ -0,0 +1,46 @@ +name: Run all checks on Kedro + +on: + push: + branches: + - main + - develop + paths-ignore: + - "docs/**" + pull_request: + branches: + - main + - develop + paths-ignore: + - "docs/**" + +jobs: + unit-tests: + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10" ] + uses: ./.github/workflows/unit-tests.yml + with: + os: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} + + lint: + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.10" ] + uses: ./.github/workflows/lint.yml + with: + os: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} + + e2e-tests: + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10" ] + uses: ./.github/workflows/e2e-tests.yml + with: + os: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/docs-only-checks.yml b/.github/workflows/docs-only-checks.yml new file mode 100644 index 0000000000..4d6870d627 --- /dev/null +++ b/.github/workflows/docs-only-checks.yml @@ -0,0 +1,26 @@ +name: Run linter on Kedro Docs + +on: + push: + branches: + - main + - develop + paths: + - "docs/**" + pull_request: + branches: + - main + - develop + paths: + - "docs/**" + +jobs: + lint-tests: + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10" ] + uses: ./.github/workflows/lint.yml + with: + os: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml new file mode 100644 index 0000000000..6c317c7c40 --- /dev/null +++ b/.github/workflows/e2e-tests.yml @@ -0,0 +1,46 @@ +name: Run e2e-tests on Kedro + +on: + workflow_call: + inputs: + os: + type: string + python-version: + type: string + +env: + COLUMNS: 120 + LINES: 25 + +jobs: + e2e-tests: + runs-on: ${{ inputs.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{inputs.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{inputs.python-version}} + - run: make install-pip-setuptools + - name: Cache python packages for Linux + if: inputs.os == 'ubuntu-latest' + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Cache python packages for Windows + if: inputs.os == 'windows-latest' + uses: actions/cache@v3 + with: + path: ~\AppData\Local\pip\Cache + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Install dependencies + run: | + pip --version + make install-test-requirements + make install-pre-commit + - name: pip freeze + run: pip freeze + - name: Run e2e tests + run: make e2e-tests diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000..81712415fd --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,32 @@ +name: Run linters on Kedro + +on: + workflow_call: + inputs: + os: + type: string + python-version: + type: string + +jobs: + lint: + runs-on: ${{ inputs.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{ inputs.python-version }} + uses: 
actions/setup-python@v3 + with: + python-version: ${{ inputs.python-version }} + - name: Cache python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Install dependencies + run: | + make install-test-requirements + make install-pre-commit + pip freeze + - name: Run linter + run: make lint diff --git a/.github/workflows/merge-gatekeeper.yml b/.github/workflows/merge-gatekeeper.yml new file mode 100644 index 0000000000..4f5393a1e6 --- /dev/null +++ b/.github/workflows/merge-gatekeeper.yml @@ -0,0 +1,27 @@ +name: Merge Gatekeeper + +on: + pull_request: + branches: + - main + - develop + +jobs: + merge-gatekeeper: + runs-on: ubuntu-latest + # Restrict permissions of the GITHUB_TOKEN. + # Docs: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs + permissions: + checks: read + statuses: read + steps: + - name: Run Merge Gatekeeper + # NOTE: v1 is updated to reflect the latest v1.x.y. Please use any tag/branch that suits your needs: + # https://github.com/upsidr/merge-gatekeeper/tags + # https://github.com/upsidr/merge-gatekeeper/branches + uses: upsidr/merge-gatekeeper@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + timeout: 1800 + interval: 30 + ignored: 'ci/circleci: win_e2e_tests-3.7,ci/circleci: win_pip_compile-3.9,ci/circleci: win_e2e_tests-3.9,ci/circleci: win_pip_compile-3.8,ci/circleci: lint-3.7,ci/circleci: win_pip_compile-3.7,ci/circleci: pip_compile-3.7,ci/circleci: e2e_tests-3.7,ci/circleci: win_unit_tests-3.7,ci/circleci: win_unit_tests-3.9,ci/circleci: e2e_tests-3.8,ci/circleci: win_unit_tests-3.10,ci/circleci: win_pip_compile-3.10,ci/circleci: win_unit_tests-3.8,ci/circleci: e2e_tests-3.9,ci/circleci: unit_tests-3.10,ci/circleci: unit_tests-3.8,ci/circleci: e2e_tests-3.10,ci/circleci: lint-3.8,ci/circleci: unit_tests-3.9,ci/circleci: unit_tests-3.7,ci/circleci: win_e2e_tests-3.10,ci/circleci: pip_compile-3.8,ci/circleci: pip_compile-3.10,ci/circleci: win_e2e_tests-3.8,ci/circleci: lint-3.9,ci/circleci: pip_compile-3.9,ci/circleci: lint-3.10,build_code,ci/circlecici: check-updated-files,regular' diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000000..87ca8e0eed --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,53 @@ +name: Run unit-tests on Kedro + +on: + workflow_call: + inputs: + os: + type: string + python-version: + type: string +jobs: + unit-tests: + runs-on: ${{ inputs.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{inputs.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{inputs.python-version}} + - run: make install-pip-setuptools + - name: Cache python packages for Linux + if: inputs.os == 'ubuntu-latest' + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Cache python packages for Windows + if: inputs.os == 'windows-latest' + uses: actions/cache@v3 + with: + path: ~\AppData\Local\pip\Cache + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Install dependencies + run: | + make install-test-requirements + make install-pre-commit + - name: Install pytables (only for windows) + if: inputs.os == 'windows-latest' + run: pip install tables + - name: pip freeze + run: pip freeze + - name: Run unit tests + if: inputs.os == 'ubuntu-latest' && inputs.python-version == '3.10' + run: make test-sequential + - name: Run unit tests + if: inputs.os == 
'ubuntu-latest' && inputs.python-version != '3.10' + run: make test + - name: Run unit tests (Windows) + if: inputs.os == 'windows-latest' && inputs.python-version == '3.10' + run: make test-no-spark-sequential + - name: Run unit tests (Windows) + if: inputs.os == 'windows-latest' && inputs.python-version != '3.10' + run: make test-no-spark diff --git a/.gitignore b/.gitignore index b24dc09988..2fdac4dbdb 100644 --- a/.gitignore +++ b/.gitignore @@ -114,6 +114,7 @@ celerybeat-schedule # Environments .env +.envrc .venv env/ venv/ @@ -133,6 +134,12 @@ venv.bak/ /site /kedro/framework/html +# Sphinx documentation +# Additional files created by sphinx.ext.autosummary +# Some of them are actually tracked to control the output +/docs/source/kedro.* +/docs/source/kedro_datasets.* + # mypy .mypy_cache/ @@ -153,8 +160,13 @@ kedro.db kedro/html docs/tmp-build-artifacts docs/build +docs/temp docs/node_modules docs/source/04_user_guide/source/.ipynb tests/template/fake_project/ default.profraw +package-lock.json + +# Kedro-Datasets plugin +kedro/datasets/* diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile new file mode 100644 index 0000000000..e03635f196 --- /dev/null +++ b/.gitpod.Dockerfile @@ -0,0 +1,13 @@ +FROM gitpod/workspace-full:2023-05-08-21-16-55 + +# Some datasets work on 3.8 only +RUN pyenv install 3.8.15\ + && pyenv global 3.8.15 + +# VideoDataSet +RUN sudo apt-get update && sudo apt-get install -y --no-install-recommends libgl1 +RUN sudo apt-get install make +RUN npm install -g @mermaid-js/mermaid-cli +# https://stackoverflow.com/questions/69564238/puppeteer-error-failed-to-launch-the-browser-process +# https://github.com/puppeteer/puppeteer/blob/main/docs/troubleshooting.md#chrome-doesnt-launch-on-linux +RUN sudo apt-get install -y --no-install-recommends libatk-bridge2.0-0 libcups2 ca-certificates fonts-liberation libasound2 libatk-bridge2.0-0 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 lsb-release wget xdg-utils diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 0000000000..6fe5e8e825 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,31 @@ +image: + file: .gitpod.Dockerfile +tasks: + - name: kedro + + init: | + make sign-off + pip install -e /workspace/kedro[test] + cd /workspace + yes project | kedro new -s pandas-iris --checkout main + cd /workspace/kedro + pre-commit install --install-hooks + + command: | + clear + kedro info + +github: + prebuilds: + # enable for the master/default branch (defaults to true) + master: true + # enable for all branches in this repo (defaults to false) + branches: true + # enable for pull requests coming from this repo (defaults to true) + pullRequests: true + # enable for pull requests coming from forks (defaults to false) + pullRequestsFromForks: true + # add a "Review in Gitpod" button as a comment to pull requests (defaults to true) + addComment: false + # add a "Review in Gitpod" button to pull requests (defaults to false) + addBadge: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b24a608ce1..57bafd2416 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,11 +19,9 @@ repos: exclude: "^kedro/templates/|^features/steps/test_starter/" - id: requirements-txt-fixer # Sorts entries in requirements.txt exclude: 
"^kedro/templates/|^features/steps/test_starter/" - - id: flake8 - exclude: "^kedro/templates/|^features/steps/test_starter/" - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.812 + rev: v0.961 hooks: - id: mypy args: [--allow-redefinition, --ignore-missing-imports] @@ -33,22 +31,45 @@ repos: ^docs/| ^features/steps/test_starter/ ) + additional_dependencies: + - types-cachetools + - types-filelock + - types-PyYAML + - types-redis + - types-requests + - types-setuptools + - types-toml + - attrs - repo: https://github.com/asottile/blacken-docs - rev: v1.9.2 + rev: v1.12.1 hooks: - - id: blacken-docs - additional_dependencies: [black==21.5b1] - entry: blacken-docs --skip-errors + - id: blacken-docs + additional_dependencies: [black~=22.0] + entry: blacken-docs --skip-errors + + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.0.277 + hooks: + - id: ruff + name: "ruff on kedro/" + args: ["--fix", "--show-fixes", "--exit-non-zero-on-fix"] + exclude: "^kedro/templates/|^features/steps/test_starter/|tests|docs" + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.0.277 + hooks: + - id: ruff + name: "ruff on tests/ and docs/" + # PLR2004: Magic value used + # PLR0913: Too many arguments + args: ["--fix", "--show-fixes", "--exit-non-zero-on-fix", "--ignore=PLR2004,PLR0913"] + # include: "tests" + exclude: "^kedro/templates/|^features/steps/test_starter/|kedro" - repo: local hooks: - - id: isort - name: "Sort imports" - language: system - types: [file, python] - exclude: ^kedro/templates/|^features/steps/test_starter - entry: isort - id: black name: "Black" language: system @@ -56,16 +77,35 @@ repos: types: [file, python] exclude: ^features/steps/test_starter|^kedro/templates/ entry: black - - id: legal - name: "Licence check" - language: system - pass_filenames: false - entry: make legal - id: imports name: "Import Linter" language: system pass_filenames: false entry: lint-imports + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.961 + hooks: + - id: mypy + args: [--allow-redefinition, --ignore-missing-imports] + exclude: | + (?x)( + ^kedro/templates/| + ^docs/| + ^features/steps/test_starter/ + ) + additional_dependencies: + - types-cachetools + - types-filelock + - types-PyYAML + - types-redis + - types-requests + - types-setuptools + - types-toml + - attrs + - repo: local + hooks: + # Slow lintint - id: secret_scan name: "Secret scan" language: system @@ -79,50 +119,4 @@ repos: exclude: ^kedro/templates/|^tests/|^features/steps/test_starter entry: bandit -ll - # It's impossible to specify per-directory configuration, so we just run it many times. - # https://github.com/PyCQA/pylint/issues/618 - # The first set of pylint checks if for local pre-commit, it only runs on the files changed. 
- - id: pylint-quick-kedro - name: "Quick PyLint on kedro/*" - language: system - types: [file, python] - files: ^kedro/ - exclude: ^kedro/templates/ - entry: pylint -j 4 --disable=unnecessary-pass - stages: [commit] - - id: pylint-quick-features - name: "Quick PyLint on features/*" - language: system - types: [file, python] - files: ^features/ - exclude: ^features/steps/test_starter - entry: pylint -j 4 --disable=missing-docstring,no-name-in-module - stages: [commit] - - id: pylint-quick-tests - name: "Quick PyLint on tests/*" - language: system - types: [file, python] - files: ^tests/ - entry: pylint -j 4 --disable=missing-docstring,redefined-outer-name,no-self-use,invalid-name,protected-access,too-many-arguments,too-many-public-methods - stages: [commit] - - # The same pylint checks, but running on all files. It's for manual run with `make lint` - - id: pylint-kedro - name: "PyLint on kedro/*" - language: system - pass_filenames: false - stages: [manual] - entry: pylint -j 4 --disable=unnecessary-pass --init-hook="import sys; sys.setrecursionlimit(2000)" kedro - - id: pylint-features - name: "PyLint on features/*" - language: system - pass_filenames: false - stages: [manual] - exclude: ^features/steps/test_starter - entry: pylint -j 4 --disable=missing-docstring,no-name-in-module features - - id: pylint-tests - name: "PyLint on tests/*" - language: system - pass_filenames: false - stages: [manual] - entry: pylint -j 4 --disable=missing-docstring,redefined-outer-name,no-self-use,invalid-name,protected-access,too-many-arguments,too-many-public-methods tests +# Manual only diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index a6aa8fde01..0000000000 --- a/.pylintrc +++ /dev/null @@ -1,425 +0,0 @@ -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code -extension-pkg-whitelist= - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns=kedro/templates/* - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. -jobs=1 - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins=pylint.extensions.docparams - -# Pickle collected data for later comparisons. -persistent=yes - -# Specify a configuration file. -#rcfile= - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". 
If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" -disable=ungrouped-imports,bad-continuation,duplicate-code - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=useless-suppression - - -[REPORTS] - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio).You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages -reports=no - -# Activate the evaluation score. -score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - - -[BASIC] - -# Naming hint for argument names -argument-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct argument names -argument-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Naming hint for attribute names -attr-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct attribute names -attr-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Bad variable names which should always be refused, separated by a comma -bad-names=foo,bar,baz,toto,tutu,tata - -# Naming hint for class attribute names -class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Regular expression matching correct class attribute names -class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Naming hint for class names -class-name-hint=[A-Z_][a-zA-Z0-9]+$ - -# Regular expression matching correct class names -class-rgx=[A-Z_][a-zA-Z0-9]+$ - -# Naming hint for constant names -const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Regular expression matching correct constant names -const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. 
-docstring-min-length=-1 - -# Naming hint for function names -function-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct function names -function-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Good variable names which should always be accepted, separated by a comma -good-names=i,j,k,ex,Run,_ - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# Naming hint for inline iteration names -inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ - -# Regular expression matching correct inline iteration names -inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ - -# Naming hint for method names -method-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct method names -method-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Naming hint for module names -module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Regular expression matching correct module names -module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -property-classes=abc.abstractproperty - -# Naming hint for variable names -variable-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct variable names -variable-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=100 - -# Maximum number of lines in a module -max-module-lines=1000 - -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma,dict-separator - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME,XXX - - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Minimum lines number of a similarity. -min-similarity-lines=20 - - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. 
-spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. 
-redefining-builtins-modules=six.moves,future.builtins - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__,__new__,setUp - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict,_fields,_replace,_source,_make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - - -[DESIGN] - -# Maximum number of arguments for function / method -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in a if statement -max-bool-expr=5 - -# Maximum number of branch for function / method body -max-branches=12 - -# Maximum number of locals for function / method body -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body -max-returns=6 - -# Maximum number of statements in function / method body -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=1 - - -[IMPORTS] - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception diff --git a/.readthedocs.yml b/.readthedocs.yml index 507f8d271a..2435aac483 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -5,26 +5,35 @@ # Required version: 2 +build: + os: ubuntu-22.04 + tools: + python: "3.8" + nodejs: "19" + apt_packages: + - libasound2 + jobs: + post_create_environment: + - npm install -g @mermaid-js/mermaid-cli + pre_build: + - pip freeze + - python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck + # Build documentation in the docs/ directory with Sphinx sphinx: builder: html - configuration: docs/conf.py + configuration: docs/source/conf.py fail_on_warning: true # Build documentation with MkDocs # mkdocs: # configuration: mkdocs.yml -# Optionally build your docs in additional formats such as PDF and ePub -formats: - - pdf - # Optionally set the version of Python and requirements required to build your docs python: - version: 3.7 install: - method: pip path: . 
extra_requirements: - docs - - requirements: test_requirements.txt + - test diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000..256c577eb8 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,50 @@ +cff-version: 1.2.0 +message: If you use this software, please cite it as below. +authors: +- family-names: Alam + given-names: Sajid +- family-names: Chan + given-names: Nok Lam +- family-names: Dada + given-names: Yetunde +- family-names: Danov + given-names: Ivan +- family-names: Datta + given-names: Deepyaman +- family-names: DeBold + given-names: Tynan +- family-names: Holzer + given-names: Jannic +- family-names: Kaiser + given-names: Stephanie +- family-names: Kanchwala + given-names: Rashida +- family-names: Katiyar + given-names: Ankita +- family-names: Kumar Pilla + given-names: Ravi +- family-names: Koh + given-names: Amanda +- family-names: Mackay + given-names: Andrew +- family-names: Merali + given-names: Ahdra +- family-names: Milne + given-names: Antony +- family-names: Nguyen + given-names: Huong +- family-names: Okwa + given-names: Nero +- family-names: Cano Rodríguez + given-names: Juan Luis + orcid: https://orcid.org/0000-0002-2187-161X +- family-names: Schwarzmann + given-names: Joel +- family-names: Stichbury + given-names: Jo +- family-names: Theisen + given-names: Merel +title: Kedro +version: 0.18.11 +date-released: 2023-07-03 +url: https://github.com/kedro-org/kedro diff --git a/CODEOWNERS b/CODEOWNERS index 1186be1c3d..a4c421df40 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,2 +1,2 @@ -* @idanov -docs/ @yetudada +* @merelcht +docs/ @yetudada @astrojuanlu @stichbury diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 3fd1c9a5c0..b5ffc37d05 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -55,7 +55,7 @@ further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behaviour may be -reported by contacting the project team at kedro@quantumblack.com. All +reported by contacting the project team on [Slack](https://slack.kedro.org). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4767dfd423..40da48ccdf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,12 +3,8 @@ We welcome any and all contributions to Kedro, at whatever level you can manage. For example, you could: - [Join the community](#join-the-community-and-help-it-grow) -- [Troubleshoot other users' questions or get answers to your own queries](#troubleshoot-for-other-users-and-get-help-from-the-community) -- [Report a bug](#report-a-bug) -- [Propose a new feature](#propose-a-new-feature) -- [Review other contributors' PRs](#review-community-pull-requests) -- [Contribute a bug fix or a new feature](#contribute-a-fix-or-feature) -- [Contribute to the documentation](#contribute-to-the-documentation) +- [Contribute to the project](#contribute-to-the-project) +- [Join our Technical Steering Committee](#join-our-technical-steering-committee) You can also suggest anything else that you think improves the community for us all! @@ -17,38 +13,29 @@ You can also suggest anything else that you think improves the community for us The Kedro team pledges to foster and maintain a friendly community. 
We enforce a [Code of Conduct](./CODE_OF_CONDUCT.md) to ensure every Kedroid is welcomed and treated with respect. -## Join the community and help it grow! +## Join the community -You can find the Kedro community on our [Discord server](https://discord.gg/akJDeVaxnB), which is where we share news and announcements, and general chat. You're also welcome to post links here to any articles or videos about Kedro that you create, or find, such as how-tos, showcases, demos, blog posts or tutorials. +You can find the Kedro community on our [Slack organisation](https://slack.kedro.org/), which is where we share news and announcements, and general chat. You're also welcome to post links here to any articles or videos about Kedro that you create, or find, such as how-tos, showcases, demos, blog posts or tutorials. -We occasionally post on the [QuantumBlack blog](https://medium.com/quantumblack/) and we curate a [Github repo that lists content created by the Kedro community](https://github.com/quantumblacklabs/kedro-community). +We also curate a [GitHub repo that lists content created by the Kedro community](https://github.com/kedro-org/awesome-kedro). -## Community Q&A +## Contribute to the project -We encourage you to ask and answer technical questions on [GitHub discussions](https://github.com/quantumblacklabs/kedro/discussions). +There are quite a few ways to contribute to the project, find inspiration from the table below. -## Report a bug +|Activity|Description| +|-|-| +|Community Q&A|We encourage you to ask and answer technical questions on [GitHub discussions](https://github.com/kedro-org/kedro/discussions) or [Slack](https://slack.kedro.org/), but the former is often preferable since it will be picked up by search engines.| +|Report bugs and security vulnerabilities |We use [GitHub issues](https://github.com/kedro-org/kedro/issues) to keep track of known bugs and security vulnerabilities. We keep a close eye on them and update them when we have an internal fix in progress. Before you report a new issue, do your best to ensure your problem hasn't already been reported. If it has, just leave a comment on the existing issue, rather than create a new one.
If you have already checked the existing [GitHub issues](https://github.com/kedro-org/kedro/issues) and are still convinced that you have found odd or erroneous behaviour then please file a new one.| +|Propose a new feature|If you have new ideas for Kedro functionality then please open a [GitHub issue](https://github.com/kedro-org/kedro/issues) and describe the feature you would like to see, why you need it, and how it should work.| +|Review pull requests|Check the [Kedro repo to find open pull requests](https://github.com/kedro-org/kedro/pulls) and contribute a review!| +|Contribute a fix or feature|If you're interested in contributing fixes to code or documentation, first read our [guidelines for contributing developers](https://docs.kedro.org/en/stable/contribution/developer_contributor_guidelines.html) for an explanation of how to get set up and the process you'll follow. Once you are ready to contribute, a good place to start is to take a look at the `good first issues` and `help wanted issues` on [GitHub](https://github.com/kedro-org/kedro/issues).| +|Contribute to the documentation|You can help us improve the [Kedro documentation online](https://docs.kedro.org/en/stable/). Send us feedback as a [GitHub issue](https://github.com/kedro-org/kedro/issues) or start a documentation discussion on [GitHub](https://github.com/kedro-org/kedro/discussions). You are also welcome to raise a PR with a bug fix or addition to the documentation. First read the guide [Contribute to the Kedro documentation](https://docs.kedro.org/en/stable/contribution/documentation_contributor_guidelines.html).| -We use [GitHub issues](https://github.com/quantumblacklabs/kedro/issues) to keep track of known bugs. We keep a close eye on them and update them when we have an internal fix in progress. -Before you report a new issue, do your best to ensure your problem hasn't already been reported. If it has, just leave a comment on the existing issue, rather than create a new one. If you have already checked the existing [GitHub issues](https://github.com/quantumblacklabs/kedro/issues) and are still convinced that you have found odd or erroneous behaviour then please file a new one. +## Join our Technical Steering Committee -## Propose a new feature -If you have new ideas for Kedro functionality then please open a [GitHub issue](https://github.com/quantumblacklabs/kedro/issues) and describe the feature you would like to see, why you need it, and how it should work. +Kedro is an incubating project in [LF AI & Data](https://lfaidata.foundation/), a sub-organisation within the Linux +Foundation that focuses on open innovation within the data and AI space. A group of maintainers, known as the Technical Steering Committee (TSC), govern the project. You can read more about the structure of our TSC in our [Technical Charter](./kedro_technical_charter.pdf).
- -Once you are ready to contribute, a good place to start is to take a look at the `good first issues` and `help wanted issues` on [GitHub](https://github.com/quantumblacklabs/kedro/issues). - -## Contribute to the documentation - -You can help us improve the [Kedro documentation online](https://kedro.readthedocs.io/en/stable/). Send us feedback as a [GitHub issue](https://github.com/quantumblacklabs/kedro/issues) or start a documentation discussion on [GitHub](https://github.com/quantumblacklabs/kedro/discussions). - -You are also welcome to make a raise a PR with a bug fix or addition to the documentation. First read the guide [Contribute to the Kedro documentation](https://kedro.readthedocs.io/en/stable/14_contribution/04_documentation_contributor_guidelines.html). +We invite community members to join the TSC and help define the future of the Kedro project. Read the [guidance on becoming a Kedro maintainer](https://docs.kedro.org/en/stable/contribution/technical_steering_committee.html) to understand the process of joining the TSC. diff --git a/LICENSE.md b/LICENSE.md index b5925dc509..261eeb9e9f 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,27 +1,201 @@ -Copyright 2021 QuantumBlack Visual Analytics Limited - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -(either separately or in combination, "QuantumBlack Trademarks") are -trademarks of QuantumBlack. The License does not grant you any right or -license to the QuantumBlack Trademarks. You may not use the QuantumBlack -Trademarks or any confusingly similar mark as a trademark for your product, -or use the QuantumBlack Trademarks in any other manner that might cause -confusion in the marketplace, including but not limited to in advertising, -on websites, or on software. - -See the License for the specific language governing permissions and -limitations under the License. + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in index 2041ea8daf..ad41ac26a3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,6 @@ include README.md include LICENSE.md -include legal_header.txt -include requirements.txt -include test_requirements.txt -include kedro/config/logging.yml +include kedro/framework/project/default_logging.yml +include kedro/ipython/*.png +include kedro/ipython/*.svg recursive-include templates * diff --git a/Makefile b/Makefile index df9f6aaf4c..2e8436390b 100644 --- a/Makefile +++ b/Makefile @@ -1,28 +1,31 @@ -install: build-docs - rm -rf kedro/framework/html - cp -r docs/build/html kedro/framework +install: pip install . clean: - rm -rf build dist docs/build kedro/html pip-wheel-metadata .mypy_cache .pytest_cache features/steps/test_plugin/test_plugin.egg-info + rm -rf build dist docs/build kedro/html pip-wheel-metadata .mypy_cache .pytest_cache features/steps/test_plugin/test_plugin.egg-info kedro/datasets find . -regex ".*/__pycache__" -exec rm -rf {} + find . -regex ".*\.egg-info" -exec rm -rf {} + pre-commit clean || true install-pip-setuptools: - python -m pip install -U "pip>=20.0" "setuptools>=38.0" wheel - -legal: - python tools/license_and_headers.py + python -m pip install -U "pip>=21.2, <23.2" "setuptools>=65.5.1" wheel lint: pre-commit run -a --hook-stage manual $(hook) - test: - pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile + pytest --numprocesses 4 --dist loadfile test-no-spark: - pytest tests --cov-config pyproject_no_spark.toml --ignore tests/extras/datasets/spark --numprocesses 4 --dist loadfile + pytest --no-cov --ignore tests/extras/datasets/spark --numprocesses 4 --dist loadfile + +test-sequential: + pytest tests --cov-config pyproject.toml + +test-no-spark-sequential: + pytest tests --no-cov --ignore tests/extras/datasets/spark + +test-no-datasets: + pytest --no-cov --ignore tests/extras/datasets/ --numprocesses 4 --dist loadfile e2e-tests: behave @@ -36,26 +39,37 @@ secret-scan: SPHINXPROJ = Kedro build-docs: + pip install -e ".[docs]" ./docs/build-docs.sh "docs" +show-docs: + open docs/build/html/index.html + linkcheck: + pip install -e ".[docs]" ./docs/build-docs.sh "linkcheck" -devserver: build-docs - cd docs && npm install && npm start - package: clean install - python setup.py sdist bdist_wheel + python -m pip install build && python -m build install-test-requirements: - pip install -r test_requirements.txt + pip install .[test] install-pre-commit: install-test-requirements pre-commit install --install-hooks uninstall-pre-commit: pre-commit uninstall - pre-commit uninstall --hook-type pre-push print-python-env: @./tools/print_env.sh + +databricks-build: + python -m pip install build && python -m build + python ./tools/databricks_build.py + +sign-off: + echo "git interpret-trailers --if-exists doNothing \c" >> .git/hooks/commit-msg + echo '--trailer "Signed-off-by: $$(git config user.name) <$$(git config user.email)>" \c' >> .git/hooks/commit-msg + echo '--in-place "$$1"' >> .git/hooks/commit-msg + chmod +x .git/hooks/commit-msg diff --git a/README.md b/README.md index d6cdee2809..a909df2535 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,26 @@ -![Kedro Logo Banner](https://raw.githubusercontent.com/quantumblacklabs/kedro/develop/static/img/kedro_banner.png) - -[![Python version](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue.svg)](https://pypi.org/project/kedro/) +![Kedro Logo Banner - Light](.github/demo-dark.png#gh-dark-mode-only) +![Kedro Logo Banner - 
Dark](.github/demo-light.png#gh-light-mode-only) +[![Python version](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue.svg)](https://pypi.org/project/kedro/) [![PyPI version](https://badge.fury.io/py/kedro.svg)](https://pypi.org/project/kedro/) [![Conda version](https://img.shields.io/conda/vn/conda-forge/kedro.svg)](https://anaconda.org/conda-forge/kedro) -[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/quantumblacklabs/kedro/blob/master/LICENSE.md) -[![Discord Server](https://img.shields.io/discord/778216384475693066.svg?color=7289da&label=Kedro%20Discord&logo=discord&style=flat-square)](https://discord.gg/akJDeVaxnB) -![CircleCI - Master Branch](https://img.shields.io/circleci/build/github/quantumblacklabs/kedro/master?label=master) -![Develop Branch Build](https://img.shields.io/circleci/build/github/quantumblacklabs/kedro/develop?label=develop) -[![Documentation](https://readthedocs.org/projects/kedro/badge/?version=stable)](https://kedro.readthedocs.io/) -[![DOI](https://zenodo.org/badge/182067506.svg)](https://zenodo.org/badge/latestdoi/182067506) - +[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/kedro-org/kedro/blob/main/LICENSE.md) +[![Slack Organisation](https://img.shields.io/badge/slack-chat-blueviolet.svg?label=Kedro%20Slack&logo=slack)](https://slack.kedro.org) +![CircleCI - Main Branch](https://img.shields.io/circleci/build/github/kedro-org/kedro/main?label=main) +![Develop Branch Build](https://img.shields.io/circleci/build/github/kedro-org/kedro/develop?label=develop) +[![Documentation](https://readthedocs.org/projects/kedro/badge/?version=stable)](https://docs.kedro.org/) +[![OpenSSF Best Practices](https://bestpractices.coreinfrastructure.org/projects/6711/badge)](https://bestpractices.coreinfrastructure.org/projects/6711) +[![Monthly downloads](https://static.pepy.tech/badge/kedro/month)](https://pepy.tech/project/kedro) +[![Total downloads](https://static.pepy.tech/badge/kedro)](https://pepy.tech/project/kedro) ## What is Kedro? -Kedro is an open-source Python framework for creating reproducible, maintainable and modular data science code. It borrows concepts from software engineering and applies them to machine-learning code; applied concepts include modularity, separation of concerns and versioning. +Kedro is a toolbox for production-ready data science. It uses software engineering best practices to help you create data engineering and data science pipelines that are reproducible, maintainable, and modular. +Kedro is an open-source Python framework hosted by the [LF AI & Data Foundation](https://lfaidata.foundation/). ## How do I install Kedro? -To install Kedro from the Python Package Index (PyPI) simply run: +To install Kedro from the Python Package Index (PyPI) run: ``` pip install kedro @@ -30,70 +32,74 @@ It is also possible to install Kedro using `conda`: conda install -c conda-forge kedro ``` -Our [Get Started guide](https://kedro.readthedocs.io/en/stable/02_get_started/01_prerequisites.html) contains full installation instructions, and includes how to set up Python virtual environments. - +Our [Get Started guide](https://docs.kedro.org/en/stable/get_started/install.html) contains full installation instructions, and includes how to set up Python virtual environments. ## What are the main features of Kedro? 
-![Kedro-Viz Pipeline Visualisation](https://raw.githubusercontent.com/quantumblacklabs/kedro/develop/static/img/pipeline_visualisation.png) -*A pipeline visualisation generated using [Kedro-Viz](https://github.com/quantumblacklabs/kedro-viz)* +![Kedro-Viz Pipeline Visualisation](https://github.com/kedro-org/kedro-viz/blob/main/.github/img/banner.png) +_A pipeline visualisation generated using [Kedro-Viz](https://github.com/kedro-org/kedro-viz)_ +| Feature | What is this? | +| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Project Template | A standard, modifiable and easy-to-use project template based on [Cookiecutter Data Science](https://github.com/drivendata/cookiecutter-data-science/). | +| Data Catalog | A series of lightweight data connectors used to save and load data across many different file formats and file systems, including local and network file systems, cloud object stores, and HDFS. The Data Catalog also includes data and model versioning for file-based systems. | +| Pipeline Abstraction | Automatic resolution of dependencies between pure Python functions and data pipeline visualisation using [Kedro-Viz](https://github.com/kedro-org/kedro-viz). | +| Coding Standards | Test-driven development using [`pytest`](https://github.com/pytest-dev/pytest), produce well-documented code using [Sphinx](http://www.sphinx-doc.org/en/master/), create linted code with support for [`flake8`](https://github.com/PyCQA/flake8), [`isort`](https://github.com/PyCQA/isort) and [`black`](https://github.com/psf/black) and make use of the standard Python logging library. | +| Flexible Deployment | Deployment strategies that include single or distributed-machine deployment as well as additional support for deploying on Argo, Prefect, Kubeflow, AWS Batch and Databricks. | -| Feature | What is this? | -|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Project Template | A standard, modifiable and easy-to-use project template based on [Cookiecutter Data Science](https://github.com/drivendata/cookiecutter-data-science/). | -| Data Catalog | A series of lightweight data connectors used to save and load data across many different file formats and file systems, including local and network file systems, cloud object stores, and HDFS. The Data Catalog also includes data and model versioning for file-based systems. | -| Pipeline Abstraction | Automatic resolution of dependencies between pure Python functions and data pipeline visualisation using [Kedro-Viz](https://github.com/quantumblacklabs/kedro-viz). | -| Coding Standards | Test-driven development using [`pytest`](https://github.com/pytest-dev/pytest), produce well-documented code using [Sphinx](http://www.sphinx-doc.org/en/master/), create linted code with support for [`flake8`](https://github.com/PyCQA/flake8), [`isort`](https://github.com/PyCQA/isort) and [`black`](https://github.com/psf/black) and make use of the standard Python logging library. 
| -| Flexible Deployment | Deployment strategies that include single or distributed-machine deployment as well as additional support for deploying on Argo, Prefect, Kubeflow, AWS Batch and Databricks. | +## How do I use Kedro? +The [Kedro documentation](https://docs.kedro.org/en/stable/) first explains [how to install Kedro](https://docs.kedro.org/en/stable/get_started/install.html) and then introduces [key Kedro concepts](https://docs.kedro.org/en/stable/get_started/kedro_concepts.html). -## How do I use Kedro? +- The first example illustrates the [basics of a Kedro project](https://docs.kedro.org/en/stable/get_started/new_project.html) using the Iris dataset +- You can then review the [spaceflights tutorial](https://docs.kedro.org/en/stable/tutorial/tutorial_template.html) to build a Kedro project for hands-on experience -The [Kedro documentation](https://kedro.readthedocs.io/en/stable/) includes three examples to help get you started: -- A typical "Hello World" example, for an [entry-level description of the main Kedro concepts](https://kedro.readthedocs.io/en/stable/02_get_started/03_hello_kedro.html) -- An [introduction to the project template](https://kedro.readthedocs.io/en/stable/02_get_started/05_example_project.html) using the Iris dataset -- A more detailed [spaceflights tutorial](https://kedro.readthedocs.io/en/stable/03_tutorial/02_tutorial_template.html) to give you hands-on experience +For new and intermediate Kedro users, there's a comprehensive section on [how to visualise Kedro projects using Kedro-Viz](https://docs.kedro.org/en/stable/visualisation/kedro-viz_visualisation.html) and [how to work with Kedro and Jupyter notebooks](https://docs.kedro.org/en/stable/notebooks_and_ipython/kedro_and_notebooks). +Further documentation is available for more advanced Kedro usage and deployment. We also recommend the [glossary](https://docs.kedro.org/en/stable/resources/glossary.html) and the [API reference documentation](/kedro) for additional information. ## Why does Kedro exist? Kedro is built upon our collective best-practice (and mistakes) trying to deliver real-world ML applications that have vast amounts of raw unvetted data. We developed Kedro to achieve the following: - - To address the main shortcomings of Jupyter notebooks, one-off scripts, and glue-code because there is a focus on + +- To address the main shortcomings of Jupyter notebooks, one-off scripts, and glue-code because there is a focus on creating **maintainable data science code** - - To enhance **team collaboration** when different team members have varied exposure to software engineering concepts - - To increase efficiency, because applied concepts like modularity and separation of concerns inspire the creation of +- To enhance **team collaboration** when different team members have varied exposure to software engineering concepts +- To increase efficiency, because applied concepts like modularity and separation of concerns inspire the creation of **reusable analytics code** - ## The humans behind Kedro -Kedro is maintained by a [product team from QuantumBlack](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html) and a number of [contributors from across the world](https://github.com/quantumblacklabs/kedro/releases). - +The [Kedro product team](https://docs.kedro.org/en/stable/contribution/technical_steering_committee.html#kedro-maintainers) and a number of [open source contributors from across the world](https://github.com/kedro-org/kedro/releases) maintain Kedro. ## Can I contribute? -Yes! 
Want to help build Kedro? Check out our [guide to contributing to Kedro](https://github.com/quantumblacklabs/kedro/blob/master/CONTRIBUTING.md). - +Yes! Want to help build Kedro? Check out our [guide to contributing to Kedro](https://github.com/kedro-org/kedro/blob/main/CONTRIBUTING.md). ## Where can I learn more? -There is a growing community around Kedro. Have a look at the [Kedro FAQs](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#how-can-i-find-out-more-about-kedro) to find projects using Kedro and links to articles, podcasts and talks. - +There is a growing community around Kedro. Have a look at the [Kedro FAQs](https://docs.kedro.org/en/stable/faq/faq.html#how-can-i-find-out-more-about-kedro) to find projects using Kedro and links to articles, podcasts and talks. ## Who likes Kedro? There are Kedro users across the world, who work at start-ups, major enterprises and academic institutions like [Absa](https://www.absa.co.za/), [Acensi](https://acensi.eu/page/home), +[Advanced Programming Solutions SL](https://www.linkedin.com/feed/update/urn:li:activity:6863494681372721152/), [AI Singapore](https://makerspace.aisingapore.org/2020/08/leveraging-kedro-in-100e/), +[AMAI GmbH](https://www.am.ai/), +[Augment Partners](https://www.linkedin.com/posts/augment-partners_kedro-cheat-sheet-by-augment-activity-6858927624631283712-Ivqk), [AXA UK](https://www.axa.co.uk/), [Belfius](https://www.linkedin.com/posts/vangansen_mlops-machinelearning-kedro-activity-6772379995953238016-JUmo), +[Beamery](https://medium.com/hacking-talent/production-code-for-data-science-and-our-experience-with-kedro-60bb69934d1f), [Caterpillar](https://www.caterpillar.com/), [CRIM](https://www.crim.ca/en/), [Dendra Systems](https://www.dendra.io/), [Element AI](https://www.elementai.com/), +[GetInData](https://getindata.com/blog/running-machine-learning-pipelines-kedro-kubeflow-airflow), [GMO](https://recruit.gmo.jp/engineer/jisedai/engineer/jisedai/engineer/jisedai/engineer/jisedai/engineer/jisedai/blog/kedro_and_mlflow_tracking/), +[Indicium](https://medium.com/indiciumtech/how-to-build-models-as-products-using-mlops-part-2-machine-learning-pipelines-with-kedro-10337c48de92), [Imperial College London](https://github.com/dssg/barefoot-winnie-public), +[ING](https://www.ing.com), [Jungle Scout](https://junglescouteng.medium.com/jungle-scout-case-study-kedro-airflow-and-mlflow-use-on-production-code-150d7231d42e), [Helvetas](https://www.linkedin.com/posts/lionel-trebuchon_mlflow-kedro-ml-ugcPost-6747074322164154368-umKw), [Leapfrog](https://www.lftechnology.com/blog/ai-pipeline-kedro/), @@ -102,20 +108,25 @@ There are Kedro users across the world, who work at start-ups, major enterprises [Modec](https://www.modec.com/), [Mosaic Data Science](https://www.youtube.com/watch?v=fCWGevB366g), [NaranjaX](https://www.youtube.com/watch?v=_0kMmRfltEQ), +[NASA](https://github.com/nasa/ML-airport-taxi-out), +[NHS AI Lab](https://nhsx.github.io/skunkworks/synthetic-data-pipeline), [Open Data Science LatAm](https://www.odesla.org/), [Prediqt](https://prediqt.co/), [QuantumBlack](https://medium.com/quantumblack/introducing-kedro-the-open-source-library-for-production-ready-machine-learning-code-d1c6d26ce2cf), -[Retrieva](https://tech.retrieva.jp/entry/2020/07/28/181414), [Roche](https://www.roche.com/), +[ReSpo.Vision](https://neptune.ai/customers/respo-vision), +[Retrieva](https://tech.retrieva.jp/entry/2020/07/28/181414), +[Roche](https://www.roche.com/), 
[Sber](https://www.linkedin.com/posts/seleznev-artem_welcome-to-kedros-documentation-kedro-activity-6767523561109385216-woTt), -[Telkomsel](https://www.linkedin.com/feed/update/urn:li:activity:6749338226403766272/updateEntityUrn=urn%3Ali%3Afs_feedUpdate%3A%28V2%2Curn%3Ali%3Aactivity%3A6749338226403766272%29), +[Société Générale](https://www.societegenerale.com/en), +[Telkomsel](https://medium.com/life-at-telkomsel/how-we-build-a-production-grade-data-pipeline-7004e56c8c98), [Universidad Rey Juan Carlos](https://github.com/vchaparro/MasterThesis-wind-power-forecasting/blob/master/thesis.pdf), [UrbanLogiq](https://urbanlogiq.com/), [Wildlife Studios](https://wildlifestudios.com), [WovenLight](https://www.wovenlight.com/) and [XP](https://youtu.be/wgnGOVNkXqU?t=2210). -Kedro has also won [Best Technical Tool or Framework for AI](https://awards.ai/the-awards/previous-awards/the-4th-ai-award-winners/) in the 2019 Awards AI competition and a merit award for the 2020 [UK Technical Communication Awards](https://uktcawards.com/announcing-the-award-winners-for-2020/). It is listed on the 2020 [ThoughtWorks Technology Radar](https://www.thoughtworks.com/radar/languages-and-frameworks/kedro) and the 2020 [Data & AI Landscape](https://mattturck.com/data2020/). +Kedro won [Best Technical Tool or Framework for AI](https://awards.ai/the-awards/previous-awards/the-4th-ai-award-winners/) in the 2019 Awards AI competition and a merit award for the 2020 [UK Technical Communication Awards](https://uktcawards.com/announcing-the-award-winners-for-2020/). It is listed on the 2020 [ThoughtWorks Technology Radar](https://www.thoughtworks.com/radar/languages-and-frameworks/kedro) and the 2020 [Data & AI Landscape](https://mattturck.com/data2020/). Kedro has received an [honorable mention in the User Experience category in Fast Company’s 2022 Innovation by Design Awards](https://www.fastcompany.com/90772252/user-experience-innovation-by-design-2022). ## How can I cite Kedro? -If you're an academic, Kedro can also help you, for example, as a tool to solve the problem of reproducible research. Find our citation reference on [Zenodo](https://zenodo.org/record/4336685). +If you're an academic, Kedro can also help you, for example, as a tool to solve the problem of reproducible research. Use the "Cite this repository" button on [our repository](https://github.com/kedro-org/kedro) to generate a citation from the [CITATION.cff file](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-citation-files). diff --git a/RELEASE.md b/RELEASE.md index 80ac21f6c1..431a313e50 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,98 +1,689 @@ -# Upcoming Release 0.18.0 +# Upcoming Release 0.19.0 ## Major features and improvements -* Added support for Python 3.9, dropped support for Python 3.6. -* Support specifying parameters mapping in `pipeline()` without the `params:` prefix. * `PartitionedDataSet` and `IncrementalDataSet` now both support versioning of the underlying dataset. + +## Bug fixes and other changes + +## Breaking changes to the API + +### DataSets +* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. + +### CLI +* Removed deprecated `kedro docs` command. 
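Editor's note: to make the `APIDataSet` entry under "DataSets" above concrete, here is a minimal sketch of the constructor shape before and after the change. It is illustrative only: the import path differs between releases (`kedro.extras.datasets.api` in 0.18.x, the `kedro-datasets` package afterwards), and the URL, query parameters and headers below are made-up values.

```python
# Illustrative sketch only; the import path and all argument values are assumptions.
from kedro.extras.datasets.api import APIDataSet  # lives in the kedro-datasets package in newer releases

# Before: requests-specific arguments are passed individually to the constructor.
dataset = APIDataSet(
    url="https://example.com/api/items",
    method="GET",
    params={"page": 1},
    headers={"Accept": "application/json"},
)

# After: everything except `url` and `method` is grouped under `load_args`,
# mirroring the keyword arguments accepted by the underlying `requests` call.
dataset = APIDataSet(
    url="https://example.com/api/items",
    method="GET",
    load_args={
        "params": {"page": 1},
        "headers": {"Accept": "application/json"},
    },
)
```

Catalog YAML entries follow the same shape: nest the `requests` options under `load_args` rather than listing them at the top level of the entry.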
+ +### ConfigLoader +* `logging` is removed from `ConfigLoader` in favour of the environment variable `KEDRO_LOGGING_CONFIG`. + +### Other +* Removed deprecated `kedro.extras.ColorHandler`. +* The Kedro IPython extension is no longer available as `%load_ext kedro.extras.extensions.ipython`; use `%load_ext kedro.ipython` instead. +* Anonymous nodes are given default names of the form `([in1;in2;...]) -> [out1;out2;...]`, with the names of inputs and outputs separated by semicolons. + +## Migration guide from Kedro 0.18.* to 0.19.* +### DataSets +* If you use `APIDataSet`, move all `requests` specific arguments (e.g. `params`, `headers`), except for `url` and `method`, to under `load_args`. +### Logging +`logging.yml` is now independent of Kedro's run environment and only used if `KEDRO_LOGGING_CONFIG` is set to point to it. + +# Upcoming Release 0.18.12 + +## Major features and improvements +* Added dataset factories feature which uses pattern matching to reduce the number of catalog entries. +* Activated all built-in resolvers by default for `OmegaConfigLoader` except for `oc.env`. + +## Bug fixes and other changes +* Consolidated dependencies and optional dependencies in `pyproject.toml`. +* Pin `pip<23.2` for CI due to a breaking change. See https://github.com/kedro-org/kedro/pull/2813 + +## Documentation changes +- Recommended `ruff` as the linter and remove mentions of `pylint`, `isort`, `flake8`. + +## Breaking changes to the API + +## Upcoming deprecations for Kedro 0.19.0 + +# Release 0.18.11 + +## Major features and improvements +* Added `databricks-iris` as an official starter. + +## Bug fixes and other changes +* Reworked micropackaging workflow to use standard Python packaging practices. +* Make `kedro micropkg package` accept `--verbose`. +* Compare for protocol and delimiter in `PartitionedDataSet` to be able to pass the protocol to partitions which paths starts with the same characters as the protocol (e.g. `s3://s3-my-bucket`). + +## Documentation changes +* Significant improvements to the documentation that covers working with Databricks and Kedro, including a new page for workspace-only development, and a guide to choosing the best workflow for your use case. +* Updated documentation for deploying with Prefect for version 2.0. +* Added documentation for developing a Kedro project using a Databricks workspace. + +## Breaking changes to the API +* Logging is decoupled from `ConfigLoader`, use `KEDRO_LOGGING_CONFIG` to configure logging. + +## Upcoming deprecations for Kedro 0.19.0 +* Renamed dataset and error classes, in accordance with the [Kedro lexicon](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide#kedro-lexicon). Dataset classes ending with "DataSet" and error classes starting with "DataSet" are deprecated and will be removed in 0.19.0. Note that all of the below classes are also importable from `kedro.io`; only the module where they are defined is listed as the location. 
+ +| Type | Deprecated Alias | Location | +| --------------------------- | --------------------------- | ------------------------------ | +| `CachedDataset` | `CachedDataSet` | `kedro.io.cached_dataset` | +| `LambdaDataset` | `LambdaDataSet` | `kedro.io.lambda_dataset` | +| `IncrementalDataset` | `IncrementalDataSet` | `kedro.io.partitioned_dataset` | +| `MemoryDataset` | `MemoryDataSet` | `kedro.io.memory_dataset` | +| `PartitionedDataset` | `PartitionedDataSet` | `kedro.io.partitioned_dataset` | +| `DatasetError` | `DataSetError` | `kedro.io.core` | +| `DatasetAlreadyExistsError` | `DataSetAlreadyExistsError` | `kedro.io.core` | +| `DatasetNotFoundError` | `DataSetNotFoundError` | `kedro.io.core` | + +## Community contributions +Many thanks to the following Kedroids for contributing PRs to this release: + +* [jmalovera10](https://github.com/jmalovera10) +* [debugger24](https://github.com/debugger24) +* [juliushetzel](https://github.com/juliushetzel) +* [jacobweiss2305](https://github.com/jacobweiss2305) +* [eduardoconto](https://github.com/eduardoconto) + +# Release 0.18.10 + +## Major features and improvements +* Rebrand across all documentation and Kedro assets. +* Added support for variable interpolation in the catalog with the `OmegaConfigLoader`. + +# Release 0.18.9 + +## Major features and improvements +* `kedro run --params` now updates interpolated parameters correctly when using `OmegaConfigLoader`. +* Added `metadata` attribute to `kedro.io` datasets. This is ignored by Kedro, but may be consumed by users or external plugins. +* Added `kedro.logging.RichHandler`. This replaces the default `rich.logging.RichHandler` and is more flexible, user can turn off the `rich` traceback if needed. + +## Bug fixes and other changes +* `OmegaConfigLoader` will return a `dict` instead of `DictConfig`. +* `OmegaConfigLoader` does not show a `MissingConfigError` when the config files exist but are empty. + +## Documentation changes +* Added documentation for collaborative experiment tracking within Kedro-Viz. +* Revised section on deployment to better organise content and reflect how recently docs have been updated. +* Minor improvements to fix typos and revise docs to align with engineering changes. + +## Breaking changes to the API +* `kedro package` does not produce `.egg` files anymore, and now relies exclusively on `.whl` files. + +## Community contributions +Many thanks to the following Kedroids for contributing PRs to this release: + +* [tomasvanpottelbergh](https://github.com/tomasvanpottelbergh) +* [https://github.com/debugger24](https://github.com/debugger24) + +## Upcoming deprecations for Kedro 0.19.0 + +# Release 0.18.8 + +## Major features and improvements +* Added `KEDRO_LOGGING_CONFIG` environment variable, which can be used to configure logging from the beginning of the `kedro` process. +* Removed logs folder from the kedro new project template. File-based logging will remain but just be level INFO and above and go to project root instead. + + +## Bug fixes and other changes +* Improvements to Jupyter E2E tests. +* Added full `kedro run` CLI command to session store to improve run reproducibility using `Kedro-Viz` experiment tracking. + +### Documentation changes +* Improvements to documentation about configuration. +* Improvements to Sphinx toolchain including incrementing to use a newer version. +* Improvements to documentation on visualising Kedro projects on Databricks, and additional documentation about the development workflow for Kedro projects on Databricks. 
+* Updated Technical Steering Committee membership documentation.
+* Revised the documentation section about linting and formatting and extended it to give details of `flake8` configuration.
+* Updated the table of contents for documentation to reduce scrolling.
+* Expanded FAQ documentation.
+* Added a 404 page to documentation.
+* Added deprecation warnings about the removal of `kedro.extras.datasets`.
+
+## Community contributions
+Many thanks to the following Kedroids for contributing PRs to this release:
+
+* [MaximeSteinmetz](https://github.com/MaximeSteinmetz)
+
+
+# Release 0.18.7
+
+## Major features and improvements
+* Added a new Kedro CLI command, `kedro jupyter setup`, to set up a Jupyter kernel for Kedro.
+* `kedro package` now includes the project configuration in a compressed `tar.gz` file.
+* Added functionality to the `OmegaConfigLoader` to load configuration from compressed files of `zip` or `tar` format. This feature requires `fsspec>=2023.1.0`.
+* Significant improvements to the on-boarding documentation that covers setup for new Kedro users. Also some major changes to the spaceflights tutorial to make it faster to work through. We think it's a better read. Tell us if it's not.
+
+## Bug fixes and other changes
+* Added a guide and tooling for developing Kedro for Databricks.
+* Implemented the missing dict-like interface for `_ProjectPipeline`.
+
+
+# Release 0.18.6
+
+## Bug fixes and other changes
+* Fixed a bug that didn't allow reading or writing datasets with `s3a` or `s3n` filepaths.
+* Fixed a bug with overriding nested parameters using the `--params` flag.
+* Fixed a bug that made the session store incompatible with `Kedro-Viz` experiment tracking.
+
+## Migration guide from Kedro 0.18.5 to 0.18.6
+A regression introduced in Kedro version `0.18.5` caused the `Kedro-Viz` console to fail to show experiment tracking correctly. If you experienced this issue, you will need to:
+* upgrade to Kedro version `0.18.6`
+* delete any erroneous session entries created with Kedro 0.18.5 from your session_store.db stored at `/data/session_store.db`.
+
+Thanks to Kedroids tomohiko kato, [tsanikgr](https://github.com/tsanikgr) and [maddataanalyst](https://github.com/maddataanalyst) for very detailed reports about the bug.
+
+
+# Release 0.18.5
+
+> This release introduced a bug that causes a failure in experiment tracking within the `Kedro-Viz` console. We recommend that you use Kedro version `0.18.6` in preference.
+
+## Major features and improvements
+* Added the new `OmegaConfigLoader`, which uses `OmegaConf` for loading and merging configuration.
+* Added the `--conf-source` option to `kedro run`, allowing users to specify a source for project configuration for the run.
+* Added `omegaconf` syntax as an option for `--params`. Keys and values can now be separated by colons or equals signs.
+* Added support for generator functions as nodes, i.e. using `yield` instead of `return` (see the sketch after this list).
+  * Enables chunk-wise processing in nodes with generator functions.
+  * Saves node outputs after every `yield` before proceeding with the next chunk.
+* Fixed incorrect parsing of Azure Data Lake Storage Gen2 URIs used in datasets.
+* Added support for loading credentials from environment variables using `OmegaConfigLoader`.
+* Added a new `--namespace` flag to `kedro run` to enable filtering by node namespace.
+* Added a new argument `node` for all four dataset hooks.
+* Added the `kedro run` flags `--nodes`, `--tags`, and `--load-versions` to replace `--node`, `--tag`, and `--load-version`.
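+
+The generator-node support mentioned above can be sketched as follows. This is a minimal illustration, not code from the release itself: the dataset names are hypothetical, and `chunked_input` is assumed to be loaded as an iterable of `pandas` DataFrames (for example a `pandas.CSVDataSet` with `chunksize` in its `load_args`):
+
+```python
+from kedro.pipeline import node, pipeline
+
+
+def drop_missing_rows(chunks):
+    """Yield cleaned chunks one at a time; Kedro saves the output dataset
+    after every `yield` before the next chunk is processed."""
+    for chunk in chunks:
+        yield chunk.dropna()
+
+
+chunked_pipeline = pipeline(
+    [node(drop_missing_rows, inputs="chunked_input", outputs="clean_chunks")]
+)
+```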
+
+## Bug fixes and other changes
+* Commas surrounded by square brackets (only possible for nodes with default names) no longer split the arguments to `kedro run` options that take a list of nodes as inputs (`--from-nodes` and `--to-nodes`).
+* Fixed a bug where the `micropkg` manifest section in `pyproject.toml` wasn't recognised as allowed configuration.
+* Fixed a bug causing `load_ipython_extension` not to register the `%reload_kedro` line magic when called in a directory that does not contain a Kedro project.
+* Added `anyconfig`'s `ac_context` parameter to `kedro.config.commons` module functions for more flexible `ConfigLoader` customisations.
+* Changed references to the `kedro.pipeline.Pipeline` object throughout the test suite to use the `kedro.modular_pipeline.pipeline` factory.
+* Fixed a bug causing the `after_dataset_saved` hook to be called for only one output dataset when multiple are saved in a single node and async saving is in use.
+* The log level for "Credentials not found in your Kedro project config" was changed from `WARNING` to `DEBUG`.
+* Added safe extraction of tar files in `micropkg pull` to fix a vulnerability caused by [CVE-2007-4559](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm).
+* Documentation improvements:
+  * Fixed a bug in the table font size.
+  * Updated API docs links for datasets.
+  * Improved CLI docs for `kedro run`.
+  * Revised documentation for visualisation to build plots and for experiment tracking.
+  * Added an example for loading external credentials to the Hooks documentation.
+
+## Breaking changes to the API
+
+## Community contributions
+Many thanks to the following Kedroids for contributing PRs to this release:
+
+* [adamfrly](https://github.com/adamfrly)
+* [corymaklin](https://github.com/corymaklin)
+* [Emiliopb](https://github.com/Emiliopb)
+* [grhaonan](https://github.com/grhaonan)
+* [JStumpp](https://github.com/JStumpp)
+* [michalbrys](https://github.com/michalbrys)
+* [sbrugman](https://github.com/sbrugman)
+
+## Upcoming deprecations for Kedro 0.19.0
+* `project_version` will be deprecated in `pyproject.toml`; please use `kedro_init_version` instead.
+* Deprecated `kedro run` flags `--node`, `--tag`, and `--load-version` in favour of `--nodes`, `--tags`, and `--load-versions`.
+
+# Release 0.18.4
+
+## Major features and improvements
+* Made Kedro instantiate datasets from `kedro_datasets` with higher priority than `kedro.extras.datasets`. `kedro_datasets` is the namespace for the new `kedro-datasets` Python package.
+* The config loader objects now implement `UserDict` and the configuration is accessed through `conf_loader['catalog']`.
+* You can configure config file patterns through `settings.py` without creating a custom config loader.
+* Added the following new datasets:
+
+| Type                                 | Description                                                                 | Location                         |
+| ------------------------------------ | --------------------------------------------------------------------------- | -------------------------------- |
+| `svmlight.SVMLightDataSet`           | Work with svmlight/libsvm files using the scikit-learn library              | `kedro.extras.datasets.svmlight` |
+| `video.VideoDataSet`                 | Read and write video files from a filesystem                                | `kedro.extras.datasets.video`    |
+| `video.video_dataset.SequenceVideo`  | Create a video object from an iterable sequence to use with `VideoDataSet`  | `kedro.extras.datasets.video`    |
+| `video.video_dataset.GeneratorVideo` | Create a video object from a generator to use with `VideoDataSet`           | `kedro.extras.datasets.video`    |
+
+* Implemented support for a functional definition of schema in `dask.ParquetDataSet` to work with the `dask.to_parquet` API.
+
+## Bug fixes and other changes
+* Fixed `kedro micropkg pull` for packages on PyPI.
+* Fixed `format` in `save_args` for `SparkHiveDataSet`; previously it didn't allow you to save in delta format.
+* Fixed save errors in `TensorFlowModelDataset` when used without versioning; previously, it wouldn't overwrite an existing model.
+* Added support for `tf.device` in `TensorFlowModelDataset`.
+* Updated the error message for `VersionNotFoundError` to handle insufficient permission issues for cloud storage.
+* Updated Experiment Tracking docs with working examples.
+* Updated `MatplotlibWriter`, `text.TextDataSet`, `plotly.PlotlyDataSet` and `plotly.JSONDataSet` docs with working examples.
+* Modified the implementation of the Kedro IPython extension to use `local_ns` rather than a global variable.
+* Refactored `ShelveStore` into its own module to ensure multiprocessing works with it.
+* `kedro.extras.datasets.pandas.SQLQueryDataSet` now takes the optional argument `execution_options`.
+* Removed the `attrs` upper bound to support newer versions of Airflow.
+* Constrained the `setuptools` dependency to `<=61.5.1`.
+
+## Minor breaking changes to the API
+
+## Upcoming deprecations for Kedro 0.19.0
+* `kedro test` and `kedro lint` will be deprecated.
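+
+As a minimal sketch of the dictionary-style config access introduced in this release (the `conf/` source path and `local` environment below are the project-template defaults, used purely for illustration):
+
+```python
+from kedro.config import ConfigLoader
+
+# Config loaders now behave like a `UserDict`.
+conf_loader = ConfigLoader(conf_source="conf", env="local")
+
+# Configuration is read with dictionary-style access rather than `.get(...)`.
+catalog_conf = conf_loader["catalog"]
+parameters = conf_loader["parameters"]
+```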
+
+## Documentation
+* Revised the Introduction to shorten it
+* Revised the Get Started section to remove unnecessary information and clarify the learning path
+* Updated the spaceflights tutorial to simplify the later stages and clarify what the reader needs to do in each phase
+* Moved some pages that covered advanced materials into more appropriate sections
+* Moved visualisation into its own section
+* Fixed a bug that degraded user experience: the table of contents is now sticky when you navigate between pages
+* Added redirects where needed on ReadTheDocs for legacy links and bookmarks
+
+## Contributions from the Kedroid community
+We are grateful to the following for submitting PRs that contributed to this release: [jstammers](https://github.com/jstammers), [FlorianGD](https://github.com/FlorianGD), [yash6318](https://github.com/yash6318), [carlaprv](https://github.com/carlaprv), [dinotuku](https://github.com/dinotuku), [williamcaicedo](https://github.com/williamcaicedo), [avan-sh](https://github.com/avan-sh), [Kastakin](https://github.com/Kastakin), [amaralbf](https://github.com/amaralbf), [BSGalvan](https://github.com/BSGalvan), [levimjoseph](https://github.com/levimjoseph), [daniel-falk](https://github.com/daniel-falk), [clotildeguinard](https://github.com/clotildeguinard), [avsolatorio](https://github.com/avsolatorio), and [picklejuicedev](https://github.com/picklejuicedev) for comments and input to documentation changes
+
+# Release 0.18.3
+
+## Major features and improvements
+* Implemented autodiscovery of project pipelines. A pipeline created with `kedro pipeline create ` can now be accessed immediately without needing to explicitly register it in `src//pipeline_registry.py`, either individually by name (e.g. `kedro run --pipeline=`) or as part of the combined default pipeline (e.g. `kedro run`). By default, the simplified `register_pipelines()` function in `pipeline_registry.py` looks like:
+
+  ```python
+  def register_pipelines() -> Dict[str, Pipeline]:
+      """Register the project's pipelines.
+
+      Returns:
+          A mapping from pipeline names to ``Pipeline`` objects.
+      """
+      pipelines = find_pipelines()
+      pipelines["__default__"] = sum(pipelines.values())
+      return pipelines
+  ```
+
+* The Kedro IPython extension should now be loaded with `%load_ext kedro.ipython`.
+* The line magic `%reload_kedro` now accepts keyword arguments, e.g. `%reload_kedro --env=prod`.
+* Improved the resume pipeline suggestion for `SequentialRunner`; it now backtracks to the closest persisted inputs to resume from.
+
+## Bug fixes and other changes
+
+* Changed the default value of `show_locals` for rich logging to `False`, to make sure credentials and other sensitive data aren't shown in logs.
+* Rich traceback handling is disabled on Databricks so that exceptions now halt execution as expected. This is a workaround for a [bug in `rich`](https://github.com/Textualize/rich/issues/2455).
+* When using `kedro run -n [some_node]`, if `some_node` is missing a namespace, the resulting error message will suggest the correct node name.
+* Updated documentation for `rich` logging.
+* Updated Prefect deployment documentation to allow for reruns with saved versioned datasets.
+* The Kedro IPython extension now surfaces errors when it cannot load a Kedro project.
+* Relaxed the `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x.
+* Added `gdrive` to the list of cloud protocols, enabling Google Drive paths for datasets.
+* Added an SVG logo resource for the IPython kernel.
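+
+To complement the autodiscovery notes above, here is a minimal, hypothetical sketch of running one of the discovered pipelines programmatically; `data_processing` stands in for whatever name was passed to `kedro pipeline create`, and the snippet assumes it is executed from the project root:
+
+```python
+from pathlib import Path
+
+from kedro.framework.session import KedroSession
+from kedro.framework.startup import bootstrap_project
+
+project_path = Path.cwd()
+bootstrap_project(project_path)  # adds src to path and configures settings
+
+with KedroSession.create(project_path=project_path) as session:
+    # No manual registration is needed: `find_pipelines()` picks the
+    # pipeline up from its module under the project's `pipelines/` package.
+    session.run(pipeline_name="data_processing")
+```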
+ +## Upcoming deprecations for Kedro 0.19.0 +* The Kedro IPython extension will no longer be available as `%load_ext kedro.extras.extensions.ipython`; use `%load_ext kedro.ipython` instead. +* `kedro jupyter convert`, `kedro build-docs`, `kedro build-reqs` and `kedro activate-nbstripout` will be deprecated. + +# Release 0.18.2 + +## Major features and improvements +* Added `abfss` to list of cloud protocols, enabling abfss paths. +* Kedro now uses the [Rich](https://github.com/Textualize/rich) library to format terminal logs and tracebacks. +* The file `conf/base/logging.yml` is now optional. See [our documentation](https://docs.kedro.org/en/0.18.2/logging/logging.html) for details. +* Introduced a `kedro.starters` entry point. This enables plugins to create custom starter aliases used by `kedro starter list` and `kedro new`. +* Reduced the `kedro new` prompts to just one question asking for the project name. + +## Bug fixes and other changes +* Bumped `pyyaml` upper bound to make Kedro compatible with the [pyodide](https://pyodide.org/en/stable/usage/loading-packages.html#micropip) stack. +* Updated project template's Sphinx configuration to use `myst_parser` instead of `recommonmark`. +* Reduced number of log lines by changing the logging level from `INFO` to `DEBUG` for low priority messages. +* Kedro's framework-side logging configuration no longer performs file-based logging. Hence superfluous `info.log`/`errors.log` files are no longer created in your project root, and running Kedro on read-only file systems such as Databricks Repos is now possible. +* The `root` logger is now set to the Python default level of `WARNING` rather than `INFO`. Kedro's logger is still set to emit `INFO` level messages. +* `SequentialRunner` now has consistent execution order across multiple runs with sorted nodes. +* Bumped the upper bound for the Flake8 dependency to <5.0. +* `kedro jupyter notebook/lab` no longer reuses a Jupyter kernel. +* Required `cookiecutter>=2.1.1` to address a [known command injection vulnerability](https://security.snyk.io/vuln/SNYK-PYTHON-COOKIECUTTER-2414281). +* The session store no longer fails if a username cannot be found with `getpass.getuser`. +* Added generic typing for `AbstractDataSet` and `AbstractVersionedDataSet` as well as typing to all datasets. +* Rendered the deployment guide flowchart as a Mermaid diagram, and added Dask. + +## Minor breaking changes to the API +* The module `kedro.config.default_logger` no longer exists; default logging configuration is now set automatically through `kedro.framework.project.LOGGING`. Unless you explicitly import `kedro.config.default_logger` you do not need to make any changes. + +## Upcoming deprecations for Kedro 0.19.0 +* `kedro.extras.ColorHandler` will be removed in 0.19.0. + +# Release 0.18.1 + +## Major features and improvements +* Added a new hook `after_context_created` that passes the `KedroContext` instance as `context`. +* Added a new CLI hook `after_command_run`. +* Added more detail to YAML `ParserError` exception error message. +* Added option to `SparkDataSet` to specify a `schema` load argument that allows for supplying a user-defined schema as opposed to relying on the schema inference of Spark. +* The Kedro package no longer contains a built version of the Kedro documentation significantly reducing the package size. + +## Bug fixes and other changes +* Removed fatal error from being logged when a Kedro session is created in a directory without git. 
+* `KedroContext` is now an `attrs` frozen class and `config_loader` is available as a public attribute.
+* Fixed `CONFIG_LOADER_CLASS` validation so that `TemplatedConfigLoader` can be specified in `settings.py`. Any `CONFIG_LOADER_CLASS` must be a subclass of `AbstractConfigLoader`.
+* Added the runner name to the `run_params` dictionary used in pipeline hooks.
+* Updated [Databricks documentation](https://docs.kedro.org/en/0.18.1/deployment/databricks.html) to include how to get it working with the IPython extension and Kedro-Viz.
+* Updated sections on visualisation, namespacing, and experiment tracking in the spaceflights tutorial to correspond to the complete spaceflights starter.
+* Fixed `Jinja2` syntax loading with `TemplatedConfigLoader` using `globals.yml`.
+* Removed the global `_active_session`, `_activate_session` and `_deactivate_session`. Plugins that need to access objects such as the config loader should now do so through `context` in the new `after_context_created` hook.
+* `config_loader` is available as a public read-only attribute of `KedroContext`.
+* Made the `hook_manager` argument optional for `runner.run`.
+* `kedro docs` now opens an online version of the Kedro documentation instead of a locally built version.
+
+## Upcoming deprecations for Kedro 0.19.0
+* `kedro docs` will be removed in 0.19.0.
+
+
+# Release 0.18.0
+
+## TL;DR ✨
+Kedro 0.18.0 strives to reduce the complexity of the project template and get us closer to a stable release of the framework. We've introduced the full [micro-packaging workflow](https://docs.kedro.org/en/0.18.0/nodes_and_pipelines/micro_packaging.html) 📦, which allows you to import packages, utility functions and existing pipelines into your Kedro project. [Integration with IPython and Jupyter](https://docs.kedro.org/en/0.18.0/tools_integration/ipython.html) has been streamlined in preparation for enhancements to Kedro's interactive workflow. Additionally, the release comes with long-awaited Python 3.9 and 3.10 support 🐍.
+
+## Major features and improvements
+
+### Framework
+* Added `kedro.config.abstract_config.AbstractConfigLoader` as an abstract base class for all `ConfigLoader` implementations. `ConfigLoader` and `TemplatedConfigLoader` now inherit directly from this base class.
+* Streamlined the `ConfigLoader.get` and `TemplatedConfigLoader.get` API and delegated the actual `get` method functional implementation to the `kedro.config.common` module.
+* The `hook_manager` is no longer a global singleton. The `hook_manager` lifecycle is now managed by the `KedroSession`, and a new `hook_manager` will be created every time a `session` is instantiated.
+* Added support for specifying parameters mapping in `pipeline()` without the `params:` prefix.
+* Added the new API `Pipeline.filter()` (previously in `KedroContext._filter_pipeline()`) to filter parts of a pipeline.
+* Added `username` to the Session store for logging during Experiment Tracking.
+* A packaged Kedro project can now be imported and run from another Python project as follows:
+```python
+from my_package.__main__ import main
+
+main(
+    ["--pipeline", "my_pipeline"]
+)  # or just main() if no parameters are needed for the run
+```
+
+### Project template
+* Removed `cli.py` from the Kedro project template. By default, all CLI commands, including `kedro run`, are now defined on the Kedro framework side. You can still define custom CLI commands by creating your own `cli.py`.
+* Removed `hooks.py` from the Kedro project template. Registration hooks have been removed in favour of `settings.py` configuration, but you can still define execution timeline hooks by creating your own `hooks.py`. +* Removed `.ipython` directory from the Kedro project template. The IPython/Jupyter workflow no longer uses IPython profiles; it now uses an IPython extension. +* The default `kedro` run configuration environment names can now be set in `settings.py` using the `CONFIG_LOADER_ARGS` variable. The relevant keyword arguments to supply are `base_env` and `default_run_env`, which are set to `base` and `local` respectively by default. + +### DataSets +* Added the following new datasets: + +| Type | Description | Location | +| ------------------------- | ------------------------------------------------------------- | -------------------------------- | +| `pandas.XMLDataSet` | Read XML into Pandas DataFrame. Write Pandas DataFrame to XML | `kedro.extras.datasets.pandas` | +| `networkx.GraphMLDataSet` | Work with NetworkX using GraphML files | `kedro.extras.datasets.networkx` | +| `networkx.GMLDataSet` | Work with NetworkX using Graph Modelling Language files | `kedro.extras.datasets.networkx` | +| `redis.PickleDataSet` | loads/saves data from/to a Redis database | `kedro.extras.datasets.redis` | + * Added `partitionBy` support and exposed `save_args` for `SparkHiveDataSet`. * Exposed `open_args_save` in `fs_args` for `pandas.ParquetDataSet`. -* Bumped the minimum version of `pandas` to 1.2. Any `storage_options` should continue to be specified under `fs_args` and/or `credentials`. * Refactored the `load` and `save` operations for `pandas` datasets in order to leverage `pandas` own API and delegate `fsspec` operations to them. This reduces the need to have our own `fsspec` wrappers. -* Removed `cli.py` from the Kedro project template. By default, all CLI commands, including `kedro run`, are now defined on the Kedro framework side. These can be overridden in turn by a plugin or a `cli.py` file in your project. A packaged Kedro project will respect the same hierarchy when executed with `python -m my_package`. * Merged `pandas.AppendableExcelDataSet` into `pandas.ExcelDataSet`. * Added `save_args` to `feather.FeatherDataSet`. -* The default `kedro` environment names can now be set in `settings.py` with the help of the `CONFIG_LOADER_ARGS` variable. The relevant keys to be supplied are `base_env` and `default_run_env`. These values are set to `base` and `local` respectively as a default. -* Added `kedro.config.abstract_config.AbstractConfigLoader` as an abstract base class for all `ConfigLoader` implementations. `ConfigLoader` and `TemplatedConfigLoader` now inherit directly from this base class. -* Streamlined the `ConfigLoader.get` and `TemplatedConfigLoader.get` API and delegated the actual `get` method functional implementation to the `kedro.config.common` module. + +### Jupyter and IPython integration +* The [only recommended way to work with Kedro in Jupyter or IPython is now the Kedro IPython extension](https://docs.kedro.org/en/0.18.0/tools_integration/ipython.html). Managed Jupyter instances should load this via `%load_ext kedro.ipython` and use the line magic `%reload_kedro`. +* `kedro ipython` launches an IPython session that preloads the Kedro IPython extension. +* `kedro jupyter notebook/lab` creates a custom Jupyter kernel that preloads the Kedro IPython extension and launches a notebook with that kernel selected. 
There is no longer a need to specify `--all-kernels` to show all available kernels. + +### Dependencies +* Bumped the minimum version of `pandas` to 1.3. Any `storage_options` should continue to be specified under `fs_args` and/or `credentials`. +* Added support for Python 3.9 and 3.10, dropped support for Python 3.6. +* Updated `black` dependency in the project template to a non pre-release version. + +### Other +* Documented distribution of Kedro pipelines with Dask. ## Breaking changes to the API -* Add namespace to parameters in a modular pipeline, which addresses [Issue 399](https://github.com/quantumblacklabs/kedro/issues/399). + +### Framework +* Removed `RegistrationSpecs` and its associated `register_config_loader` and `register_catalog` hook specifications in favour of `CONFIG_LOADER_CLASS`/`CONFIG_LOADER_ARGS` and `DATA_CATALOG_CLASS` in `settings.py`. +* Removed deprecated functions `load_context` and `get_project_context`. +* Removed deprecated `CONF_SOURCE`, `package_name`, `pipeline`, `pipelines`, `config_loader` and `io` attributes from `KedroContext` as well as the deprecated `KedroContext.run` method. +* Added the `PluginManager` `hook_manager` argument to `KedroContext` and the `Runner.run()` method, which will be provided by the `KedroSession`. +* Removed the public method `get_hook_manager()` and replaced its functionality by `_create_hook_manager()`. +* Enforced that only one run can be successfully executed as part of a `KedroSession`. `run_id` has been renamed to `session_id` as a result. + +### Configuration loaders +* The `settings.py` setting `CONF_ROOT` has been renamed to `CONF_SOURCE`. Default value of `conf` remains unchanged. +* `ConfigLoader` and `TemplatedConfigLoader` argument `conf_root` has been renamed to `conf_source`. +* `extra_params` has been renamed to `runtime_params` in `kedro.config.config.ConfigLoader` and `kedro.config.templated_config.TemplatedConfigLoader`. +* The environment defaulting behaviour has been removed from `KedroContext` and is now implemented in a `ConfigLoader` class (or equivalent) with the `base_env` and `default_run_env` attributes. + +### DataSets * `pandas.ExcelDataSet` now uses `openpyxl` engine instead of `xlrd`. * `pandas.ParquetDataSet` now calls `pd.to_parquet()` upon saving. Note that the argument `partition_cols` is not supported. -* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline contains no nodes. The same `ValueError` is raised when there are no matching tags. -* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline name doesn't exist in the pipeline registry. -* Removed deprecated functions `load_context` and `get_project_context`. -* `spark.SparkHiveDataSet` API has been updated to reflect `spark.SparkDataSet`. The `write_mode=insert` option has also been replaced with `write_mode=append` as per Spark styleguide. This change addresses [Issue 725](https://github.com/quantumblacklabs/kedro/issues/725) and [Issue 745](https://github.com/quantumblacklabs/kedro/issues/745). Additionally, `upsert` mode now leverages `checkpoint` functionality and requires a valid `checkpointDir` be set for current `SparkContext`. -* Deprecated and removed `ProjectHooks.register_config_loader` `hook_spec` in favour of loading `CONFIG_LOADER_CLASS` directly from `settings.py`. The default option for `CONFIG_LOADER_CLASS` is now set to `kedro.config.ConfigLoader`. 
-* Added `CONFIG_LOADER_ARGS` to `settings.py` to facilitate the provision of additional keyword arguments to the constructor of the project `config_loader`. The default option for `CONFIG_LOADER_ARGS` is an empty dictionary. +* `spark.SparkHiveDataSet` API has been updated to reflect `spark.SparkDataSet`. The `write_mode=insert` option has also been replaced with `write_mode=append` as per Spark styleguide. This change addresses [Issue 725](https://github.com/kedro-org/kedro/issues/725) and [Issue 745](https://github.com/kedro-org/kedro/issues/745). Additionally, `upsert` mode now leverages `checkpoint` functionality and requires a valid `checkpointDir` be set for current `SparkContext`. * `yaml.YAMLDataSet` can no longer save a `pandas.DataFrame` directly, but it can save a dictionary. Use `pandas.DataFrame.to_dict()` to convert your `pandas.DataFrame` to a dictionary before you attempt to save it to YAML. -* Removed `--version` CLI option for `kedro pipeline package` command. Specific pipeline package version can be added by setting the `__version__` variable in the pipeline package's `__init__.py` file. -* The `kedro package` and `kedro pipeline package` now save `egg` and `whl` files in the `/dist` folder (previously `/src/dist`). -* Removed `kedro pipeline list` and `kedro pipeline describe` commands in favour of `kedro registry list` and `kedro registry describe`. * Removed `open_args_load` and `open_args_save` from the following datasets: - * pandas.CSVDataSet - * pandas.ExcelDataSet - * pandas.FeatherDataSet - * pandas.JSONDataSet - * pandas.ParquetDataSet + * `pandas.CSVDataSet` + * `pandas.ExcelDataSet` + * `pandas.FeatherDataSet` + * `pandas.JSONDataSet` + * `pandas.ParquetDataSet` * `storage_options` are now dropped if they are specified under `load_args` or `save_args` for the following datasets: - * pandas.CSVDataSet - * pandas.ExcelDataSet - * pandas.FeatherDataSet - * pandas.JSONDataSet - * pandas.ParquetDataSet -* The environment defaulting behaviour has been removed from `KedroContext` and is now implemented in a `ConfigLoader` class (or equivalent) with the `base_env` and `default_run_env` attributes. -* `ConfigLoader` and `TemplatedConfigLoader` argument `conf_root` has been renamed to `conf_source` to align the API. -* The `settings.py` setting `CONF_ROOT` has been renamed to `CONF_SOURCE` to align the API. Default value of `conf` remains unchanged. -* Renamed `extra_params` to `runtime_params` in `kedro.config.config.ConfigLoader` and `kedro.config.templated_config.TemplatedConfigLoader`. + * `pandas.CSVDataSet` + * `pandas.ExcelDataSet` + * `pandas.FeatherDataSet` + * `pandas.JSONDataSet` + * `pandas.ParquetDataSet` +* Renamed `lambda_data_set`, `memory_data_set`, and `partitioned_data_set` to `lambda_dataset`, `memory_dataset`, and `partitioned_dataset`, respectively, in `kedro.io`. +* The dataset `networkx.NetworkXDataSet` has been renamed to `networkx.JSONDataSet`. + +### CLI +* Removed `kedro install` in favour of `pip install -r src/requirements.txt` to install project dependencies. +* Removed `--parallel` flag from `kedro run` in favour of `--runner=ParallelRunner`. The `-p` flag is now an alias for `--pipeline`. +* `kedro pipeline package` has been replaced by `kedro micropkg package` and, in addition to the `--alias` flag used to rename the package, now accepts a module name and path to the pipeline or utility module to package, relative to `src//`. 
The `--version` CLI option has been removed in favour of setting a `__version__` variable in the micro-package's `__init__.py` file. +* `kedro pipeline pull` has been replaced by `kedro micropkg pull` and now also supports `--destination` to provide a location for pulling the package. +* Removed `kedro pipeline list` and `kedro pipeline describe` in favour of `kedro registry list` and `kedro registry describe`. +* `kedro package` and `kedro micropkg package` now save `egg` and `whl` or `tar` files in the `/dist` folder (previously `/src/dist`). +* Changed the behaviour of `kedro build-reqs` to compile requirements from `requirements.txt` instead of `requirements.in` and save them to `requirements.lock` instead of `requirements.txt`. +* `kedro jupyter notebook/lab` no longer accept `--all-kernels` or `--idle-timeout` flags. `--all-kernels` is now the default behaviour. +* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline contains no nodes. The same `ValueError` is raised when there are no matching tags. +* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline name doesn't exist in the pipeline registry. + +### Other +* Added namespace to parameters in a modular pipeline, which addresses [Issue 399](https://github.com/kedro-org/kedro/issues/399). +* Switched from packaging pipelines as wheel files to tar archive files compressed with gzip (`.tar.gz`). +* Removed decorator API from `Node` and `Pipeline`, as well as the modules `kedro.extras.decorators` and `kedro.pipeline.decorators`. +* Removed transformer API from `DataCatalog`, as well as the modules `kedro.extras.transformers` and `kedro.io.transformers`. +* Removed the `Journal` and `DataCatalogWithDefault`. +* Removed `%init_kedro` IPython line magic, with its functionality incorporated into `%reload_kedro`. This means that if `%reload_kedro` is called with a filepath, that will be set as default for subsequent calls. ## Migration guide from Kedro 0.17.* to 0.18.* -* Please remove any existing `hook_impl` of the `register_config_loader` method from `ProjectHooks` (or custom alternatives). -* Populate `settings.py` with `CONFIG_LOADER_CLASS` set to your expected config loader class (for example `kedro.config.TemplatedConfigLoader` or custom implementation). If `CONFIG_LOADER_CLASS` value is not set, it will default to `kedro.config.ConfigLoader` at runtime. -* Populate `settings.py` with `CONFIG_LOADER_ARGS` set to a dictionary with expected keyword arguments. If `CONFIG_LOADER_ARGS` is not set, it will default to an empty dictionary. + +### Hooks +* Remove any existing `hook_impl` of the `register_config_loader` and `register_catalog` methods from `ProjectHooks` in `hooks.py` (or custom alternatives). +* If you use `run_id` in the `after_catalog_created` hook, replace it with `save_version` instead. +* If you use `run_id` in any of the `before_node_run`, `after_node_run`, `on_node_error`, `before_pipeline_run`, `after_pipeline_run` or `on_pipeline_error` hooks, replace it with `session_id` instead. + +### `settings.py` file +* If you use a custom config loader class such as `kedro.config.TemplatedConfigLoader`, alter `CONFIG_LOADER_CLASS` to specify the class and `CONFIG_LOADER_ARGS` to specify keyword arguments. If not set, these default to `kedro.config.ConfigLoader` and an empty dictionary respectively. +* If you use a custom data catalog class, alter `DATA_CATALOG_CLASS` to specify the class. If not set, this defaults to `kedro.io.DataCatalog`. 
+* If you have a custom config location (i.e. not `conf`), update `CONF_ROOT` to `CONF_SOURCE` and set it to a string with the expected configuration location. If not set, this defaults to `"conf"`. + +### Modular pipelines +* If you use any modular pipelines with parameters, make sure they are declared with the correct namespace. See example below: + +For a given pipeline: +```python +active_pipeline = pipeline( + pipe=[ + node( + func=some_func, + inputs=["model_input_table", "params:model_options"], + outputs=["**my_output"], + ), + ..., + ], + inputs="model_input_table", + namespace="candidate_modelling_pipeline", +) +``` + +The parameters should look like this: + +```diff +-model_options: +- test_size: 0.2 +- random_state: 8 +- features: +- - engines +- - passenger_capacity +- - crew ++candidate_modelling_pipeline: ++ model_options: ++ test_size: 0.2 ++ random_state: 8 ++ features: ++ - engines ++ - passenger_capacity ++ - crew + +``` * Optional: You can now remove all `params:` prefix when supplying values to `parameters` argument in a `pipeline()` call. -* If you're using `pandas.ExcelDataSet`, make sure you have `openpyxl` installed in your environment. Note that this is automatically pulled if you specify `kedro[pandas.ExcelDataSet]==0.18.0` in your `requirements.in`. You can uninstall `xlrd` if you were only using it for this dataset. -* If you're using `pandas.ParquetDataSet`, please pass pandas saving arguments directly to `save_args` instead of nested in `from_pandas` (e.g. `save_args = {"preserve_index": False}` instead of `save_args = {"from_pandas": {"preserve_index": False}}`). -* If you're using `spark.SparkHiveDataSet` with `write_mode` option set to `insert`, please update this to `append` in line with the Spark styleguide. If you're using `spark.SparkHiveDataSet` with `write_mode` option set to `upsert`, please make sure that your `SparkContext` has a valid `checkpointDir` set either by `SparkContext.setCheckpointDir` method or directly in the `conf` folder. -* Edit any scripts containing `kedro pipeline package --version` to remove the `--version` option. If you wish to set a specific pipeline package version, set the `__version__` variable in the pipeline package's `__init__.py` file. -* If you had any `pandas.AppendableExcelDataSet` entries in your catalog, replace them with `pandas.ExcelDataSet`. -* If you were using `pandas~=1.2.0` and passing `storage_options` through `load_args` or `savs_args`, please specify them under `fs_args` or via `credentials` instead. -* Update the `settings.py` setting `CONF_ROOT` to `CONF_SOURCE`. -* Update the key-word argument `conf_root` to `conf_source` when calling `ConfigLoader` or `TemplatedConfigLoader` directly. -* Rename `extra_params` to `runtime_params` in `kedro.config.config.ConfigLoader` and `kedro.config.templated_config.TemplatedConfigLoader`, or your custom implementation, if it calls to `ConfigLoader` or any of its parent classes. - -# Upcoming Release 0.17.5 +* If you pull modular pipelines with `kedro pipeline pull my_pipeline --alias other_pipeline`, now use `kedro micropkg pull my_pipeline --alias pipelines.other_pipeline` instead. +* If you package modular pipelines with `kedro pipeline package my_pipeline`, now use `kedro micropkg package pipelines.my_pipeline` instead. 
+* Similarly, if you package any modular pipelines using `pyproject.toml`, you should modify the keys to include the full module path, wrapped in double quotes, e.g.:
+
+```diff
+[tool.kedro.micropkg.package]
+-data_engineering = {destination = "path/to/here"}
+-data_science = {alias = "ds", env = "local"}
++"pipelines.data_engineering" = {destination = "path/to/here"}
++"pipelines.data_science" = {alias = "ds", env = "local"}
+
+[tool.kedro.micropkg.pull]
+-"s3://my_bucket/my_pipeline" = {alias = "aliased_pipeline"}
++"s3://my_bucket/my_pipeline" = {alias = "pipelines.aliased_pipeline"}
+```
+
+### DataSets
+* If you use `pandas.ExcelDataSet`, make sure you have `openpyxl` installed in your environment. This is automatically installed if you specify `kedro[pandas.ExcelDataSet]==0.18.0` in your `requirements.txt`. You can uninstall `xlrd` if you were only using it for this dataset.
+* If you use `pandas.ParquetDataSet`, pass pandas saving arguments directly to `save_args` instead of nested in `from_pandas` (e.g. `save_args = {"preserve_index": False}` instead of `save_args = {"from_pandas": {"preserve_index": False}}`).
+* If you use `spark.SparkHiveDataSet` with the `write_mode` option set to `insert`, change this to `append` in line with the Spark styleguide. If you use `spark.SparkHiveDataSet` with the `write_mode` option set to `upsert`, make sure that your `SparkContext` has a valid `checkpointDir` set either by the `SparkContext.setCheckpointDir` method or directly in the `conf` folder.
+* If you use `pandas~=1.2.0` and pass `storage_options` through `load_args` or `save_args`, specify them under `fs_args` or via `credentials` instead.
+* If you import from `kedro.io.lambda_data_set`, `kedro.io.memory_data_set`, or `kedro.io.partitioned_data_set`, change the import to `kedro.io.lambda_dataset`, `kedro.io.memory_dataset`, or `kedro.io.partitioned_dataset`, respectively (or import the dataset directly from `kedro.io`).
+* If you have any `pandas.AppendableExcelDataSet` entries in your catalog, replace them with `pandas.ExcelDataSet`.
+* If you have any `networkx.NetworkXDataSet` entries in your catalog, replace them with `networkx.JSONDataSet`.
+
+### Other
+* Edit any scripts containing `kedro pipeline package --version` to use `kedro micropkg package` instead. If you wish to set a specific pipeline package version, set the `__version__` variable in the pipeline package's `__init__.py` file.
+* To run a pipeline in parallel, use `kedro run --runner=ParallelRunner` rather than `--parallel` or `-p`.
+* If you call `ConfigLoader` or `TemplatedConfigLoader` directly, update the keyword arguments `conf_root` to `conf_source` and `extra_params` to `runtime_params`.
+* If you use `KedroContext` to access `ConfigLoader`, use `settings.CONFIG_LOADER_CLASS` to access the currently used `ConfigLoader` instead.
+* The signature of `KedroContext` has changed and now needs `config_loader` and `hook_manager` as additional arguments of type `ConfigLoader` and `PluginManager` respectively.
+
+# Release 0.17.7
+
+## Major features and improvements
+* `pipeline` now accepts `tags` and a collection of `Node`s and/or `Pipeline`s rather than just a single `Pipeline` object. `pipeline` should be used in preference to `Pipeline` when creating a Kedro pipeline.
+* `pandas.SQLTableDataSet` and `pandas.SQLQueryDataSet` now only open one connection per database, at instantiation time (therefore at catalog creation time), rather than one per load/save operation.
+* Added a new command group, `micropkg`, to replace `kedro pipeline pull` and `kedro pipeline package` with `kedro micropkg pull` and `kedro micropkg package` for Kedro 0.18.0. `kedro micropkg package` saves packages to `project/dist` while `kedro pipeline package` saves packages to `project/src/dist`.
+
+## Bug fixes and other changes
+* Added tutorial documentation for [experiment tracking](https://docs.kedro.org/en/0.17.7/08_logging/02_experiment_tracking.html).
+* Added [Plotly dataset documentation](https://docs.kedro.org/en/0.17.7/03_tutorial/05_visualise_pipeline.html#visualise-plotly-charts-in-kedro-viz).
+* Added the upper limit `pandas<1.4` to maintain compatibility with `xlrd~=1.0`.
+* Bumped the `Pillow` minimum version requirement to 9.0 (Python 3.7+ only) following [CVE-2022-22817](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-22817).
+* Fixed `PickleDataSet` to be copyable and hence work with the parallel runner.
+* Upgraded `pip-tools`, which is used by `kedro build-reqs`, to 6.5 (Python 3.7+ only). This `pip-tools` version is compatible with `pip>=21.2`, including the most recent releases of `pip`. Python 3.6 users should continue to use `pip-tools` 6.4 and `pip<22`.
+* Added `astro-iris` as an alias for `astro-airflow-iris`, so that old tutorials can still be followed.
+* Added details about [Kedro's Technical Steering Committee and governance model](https://docs.kedro.org/en/0.17.7/14_contribution/technical_steering_committee.html).
+
+## Upcoming deprecations for Kedro 0.18.0
+* `kedro pipeline pull` and `kedro pipeline package` will be deprecated. Please use `kedro micropkg` instead.
+
+
+# Release 0.17.6
+
+## Major features and improvements
+* Added the `pipelines` global variable to the IPython extension, allowing you to access the project's pipelines in `kedro ipython` or `kedro jupyter notebook`.
+* Enabled overriding nested parameters with `params` in CLI, i.e. `kedro run --params="model.model_tuning.booster:gbtree"` updates parameters to `{"model": {"model_tuning": {"booster": "gbtree"}}}`.
+* Added an option to `pandas.SQLQueryDataSet` to specify a `filepath` with a SQL query, in addition to the current method of supplying the query itself in the `sql` argument.
+* Extended `ExcelDataSet` to support saving Excel files with multiple sheets.
+* Added the following new datasets:
+
+| Type                      | Description                                                                                                             | Location                       |
+| ------------------------- | ----------------------------------------------------------------------------------------------------------------------- | ------------------------------ |
+| `plotly.JSONDataSet`      | Works with plotly graph object Figures (saves as json file)                                                             | `kedro.extras.datasets.plotly` |
+| `pandas.GenericDataSet`   | Provides a 'best effort' facility to read / write any format provided by the `pandas` library                           | `kedro.extras.datasets.pandas` |
+| `pandas.GBQQueryDataSet`  | Loads data from a Google BigQuery table using a provided SQL query                                                      | `kedro.extras.datasets.pandas` |
+| `spark.DeltaTableDataSet` | Dataset designed to handle Delta Lake Tables and their CRUD-style operations, including `update`, `merge` and `delete`  | `kedro.extras.datasets.spark`  |
+
+## Bug fixes and other changes
+* Fixed an issue where `kedro new --config config.yml` was ignoring the config file when `prompts.yml` didn't exist.
+* Added documentation for `kedro viz --autoreload`.
+* Added support for arbitrary backends (via importable module paths) that satisfy the `pickle` interface to `PickleDataSet`.
+* Added support for `sum` syntax for connecting pipeline objects.
+* Upgraded `pip-tools`, which is used by `kedro build-reqs`, to 6.4. This `pip-tools` version requires `pip>=21.2` while [adding support for `pip>=21.3`](https://github.com/jazzband/pip-tools/pull/1501). To upgrade `pip`, please refer to [their documentation](https://pip.pypa.io/en/stable/installing/#upgrading-pip). +* Relaxed the bounds on the `plotly` requirement for `plotly.PlotlyDataSet` and the `pyarrow` requirement for `pandas.ParquetDataSet`. +* `kedro pipeline package ` now raises an error if the `` argument doesn't look like a valid Python module path (e.g. has `/` instead of `.`). +* Added new `overwrite` argument to `PartitionedDataSet` and `MatplotlibWriter` to enable deletion of existing partitions and plots on dataset `save`. +* `kedro pipeline pull` now works when the project requirements contains entries such as `-r`, `--extra-index-url` and local wheel files ([Issue #913](https://github.com/kedro-org/kedro/issues/913)). +* Fixed slow startup because of catalog processing by reducing the exponential growth of extra processing during `_FrozenDatasets` creations. +* Removed `.coveragerc` from the Kedro project template. `coverage` settings are now given in `pyproject.toml`. +* Fixed a bug where packaging or pulling a modular pipeline with the same name as the project's package name would throw an error (or silently pass without including the pipeline source code in the wheel file). +* Removed unintentional dependency on `git`. +* Fixed an issue where nested pipeline configuration was not included in the packaged pipeline. +* Deprecated the "Thanks for supporting contributions" section of release notes to simplify the contribution process; Kedro 0.17.6 is the last release that includes this. This process has been replaced with the [automatic GitHub feature](https://github.com/kedro-org/kedro/graphs/contributors). +* Fixed a bug where the version on the tracking datasets didn't match the session id and the versions of regular versioned datasets. +* Fixed an issue where datasets in `load_versions` that are not found in the data catalog would silently pass. +* Altered the string representation of nodes so that node inputs/outputs order is preserved rather than being alphabetically sorted. +* Update `APIDataSet` to accept `auth` through `credentials` and allow any iterable for `auth`. + +## Upcoming deprecations for Kedro 0.18.0 +* `kedro.extras.decorators` and `kedro.pipeline.decorators` are being deprecated in favour of Hooks. +* `kedro.extras.transformers` and `kedro.io.transformers` are being deprecated in favour of Hooks. +* The `--parallel` flag on `kedro run` is being removed in favour of `--runner=ParallelRunner`. The `-p` flag will change to be an alias for `--pipeline`. +* `kedro.io.DataCatalogWithDefault` is being deprecated, to be removed entirely in 0.18.0. 
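+
+For the decorator deprecation listed above, the Hook-based replacement can be sketched roughly as follows; the class and logger names are illustrative, and the Hook would be registered through the project's `HOOKS` setting:
+
+```python
+import logging
+import time
+
+from kedro.framework.hooks import hook_impl
+
+
+class NodeTimerHooks:
+    """Times each node run, standing in for the deprecated timing decorators."""
+
+    def __init__(self):
+        self._start_times = {}
+
+    @hook_impl
+    def before_node_run(self, node):
+        self._start_times[node.name] = time.perf_counter()
+
+    @hook_impl
+    def after_node_run(self, node):
+        elapsed = time.perf_counter() - self._start_times.pop(node.name)
+        logging.getLogger(__name__).info("Node %s took %.2fs", node.name, elapsed)
+```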
+ +## Thanks for supporting contributions +[Deepyaman Datta](https://github.com/deepyaman), +[Brites](https://github.com/brites101), +[Manish Swami](https://github.com/ManishS6), +[Avaneesh Yembadi](https://github.com/avan-sh), +[Zain Patel](https://github.com/mzjp2), +[Simon Brugman](https://github.com/sbrugman), +[Kiyo Kunii](https://github.com/921kiyo), +[Benjamin Levy](https://github.com/BenjaminLevyQB), +[Louis de Charsonville](https://github.com/louisdecharson), +[Simon Picard](https://github.com/simonpicard) + +# Release 0.17.5 ## Major features and improvements * Added new CLI group `registry`, with the associated commands `kedro registry list` and `kedro registry describe`, to replace `kedro pipeline list` and `kedro pipeline describe`. -* Added support for dependency management at a modular pipeline level. When a pipeline with `requirements.txt` is packaged, its dependencies are embedded in the modular pipeline wheel file. Upon pulling the pipeline, Kedro will append dependencies to the project's `requirements.in`. More information is available in [our documentation](https://kedro.readthedocs.io/en/stable/06_nodes_and_pipelines/03_modular_pipelines.html#package-a-modular-pipeline). -* Added support for bulk packaging modular pipelines using `kedro pipeline package --all` and `pyproject.toml`. +* Added support for dependency management at a modular pipeline level. When a pipeline with `requirements.txt` is packaged, its dependencies are embedded in the modular pipeline wheel file. Upon pulling the pipeline, Kedro will append dependencies to the project's `requirements.in`. More information is available in [our documentation](https://docs.kedro.org/en/0.17.5/06_nodes_and_pipelines/03_modular_pipelines.html). +* Added support for bulk packaging/pulling modular pipelines using `kedro pipeline package/pull --all` and `pyproject.toml`. * Removed `cli.py` from the Kedro project template. By default all CLI commands, including `kedro run`, are now defined on the Kedro framework side. These can be overridden in turn by a plugin or a `cli.py` file in your project. A packaged Kedro project will respect the same hierarchy when executed with `python -m my_package`. * Removed `.ipython/profile_default/startup/` from the Kedro project template in favour of `.ipython/profile_default/ipython_config.py` and the `kedro.extras.extensions.ipython`. +* Added support for `dill` backend to `PickleDataSet`. +* Imports are now refactored at `kedro pipeline package` and `kedro pipeline pull` time, so that _aliasing_ a modular pipeline doesn't break it. +* Added the following new datasets to support basic Experiment Tracking: + +| Type | Description | Location | +| ------------------------- | -------------------------------------------------------- | -------------------------------- | +| `tracking.MetricsDataSet` | Dataset to track numeric metrics for experiment tracking | `kedro.extras.datasets.tracking` | +| `tracking.JSONDataSet` | Dataset to track data for experiment tracking | `kedro.extras.datasets.tracking` | ## Bug fixes and other changes * Bumped minimum required `fsspec` version to 2021.04. +* Fixed the `kedro install` and `kedro build-reqs` flows when uninstalled dependencies are present in a project's `settings.py`, `context.py` or `hooks.py` ([Issue #829](https://github.com/kedro-org/kedro/issues/829)). +* Imports are now refactored at `kedro pipeline package` and `kedro pipeline pull` time, so that _aliasing_ a modular pipeline doesn't break it. 
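+
+A minimal sketch of the `dill` backend for `PickleDataSet` added in this release; it assumes `dill` is installed, and the filepath is purely illustrative:
+
+```python
+from kedro.extras.datasets.pickle import PickleDataSet
+
+# Objects that the standard `pickle` module cannot serialise (e.g. lambdas)
+# can be handled by switching the backend to `dill`.
+model_dataset = PickleDataSet(filepath="data/06_models/model.pkl", backend="dill")
+
+model_dataset.save(lambda x: x + 1)
+reloaded = model_dataset.load()
+```
+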
## Minor breaking changes to the API +* Pinned `dynaconf` to `<3.1.6` because the method signature for `_validate_items` changed which is used in Kedro. ## Upcoming deprecations for Kedro 0.18.0 -* `kedro pipeline list` and `kedro pipeline describe` are being deprecated in favour of new commands `kedro registry list ` and `kedro registry describe` +* `kedro pipeline list` and `kedro pipeline describe` are being deprecated in favour of new commands `kedro registry list ` and `kedro registry describe`. +* `kedro install` is being deprecated in favour of using `pip install -r src/requirements.txt` to install project dependencies. ## Thanks for supporting contributions -[Moussa Taifi](https://github.com/moutai/) +[Moussa Taifi](https://github.com/moutai), +[Deepyaman Datta](https://github.com/deepyaman) # Release 0.17.4 ## Major features and improvements * Added the following new datasets: -| Type | Description | Location | -| --------------------------- | ---------------------------------------------------- | --------------------------------- | +| Type | Description | Location | +| ---------------------- | ----------------------------------------------------------- | ------------------------------ | | `plotly.PlotlyDataSet` | Works with plotly graph object Figures (saves as json file) | `kedro.extras.datasets.plotly` | ## Bug fixes and other changes -* Defined our set of Kedro Principles! Have a read through [our docs](https://kedro.readthedocs.io/en/0.17.4/12_faq/03_kedro_principles.html). +* Defined our set of Kedro Principles! Have a read through [our docs](https://docs.kedro.org/en/0.17.4/12_faq/03_kedro_principles.html). * `ConfigLoader.get()` now raises a `BadConfigException`, with a more helpful error message, if a configuration file cannot be loaded (for instance due to wrong syntax or poor formatting). * `run_id` now defaults to `save_version` when `after_catalog_created` is called, similarly to what happens during a `kedro run`. * Fixed a bug where `kedro ipython` and `kedro jupyter notebook` didn't work if the `PYTHONPATH` was already set. @@ -102,13 +693,13 @@ * `kedro pipeline describe` now defaults to the `__default__` pipeline when no pipeline name is provided and also shows the namespace the nodes belong to. * Fixed an issue where spark.SparkDataSet with enabled versioning would throw a VersionNotFoundError when using databricks-connect from a remote machine and saving to dbfs filesystem. * `EmailMessageDataSet` added to doctree. -* When node inputs do not pass validation, the error message is now shown as the most recent exception in the traceback ([Issue #761](https://github.com/quantumblacklabs/kedro/issues/761)). +* When node inputs do not pass validation, the error message is now shown as the most recent exception in the traceback ([Issue #761](https://github.com/kedro-org/kedro/issues/761)). * `kedro pipeline package` now only packages the parameter file that exactly matches the pipeline name specified and the parameter files in a directory with the pipeline name. -* Extended support to newer versions of third-party dependencies ([Issue #735](https://github.com/quantumblacklabs/kedro/issues/735)). +* Extended support to newer versions of third-party dependencies ([Issue #735](https://github.com/kedro-org/kedro/issues/735)). * Ensured consistent references to `model input` tables in accordance with our Data Engineering convention. * Changed behaviour where `kedro pipeline package` takes the pipeline package version, rather than the kedro package version. 
If the pipeline package version is not present, then the package version is used. -* Launched [GitHub Discussions](https://github.com/quantumblacklabs/kedro/discussions/) and [Kedro Discord Server](https://discord.gg/akJDeVaxnB) -* Improved error message when versioning is enabled for a dataset previously saved as non-versioned ([Issue #625](https://github.com/quantumblacklabs/kedro/issues/625)). +* Launched [GitHub Discussions](https://github.com/kedro-org/kedro/discussions/) and [Kedro Discord Server](https://discord.gg/akJDeVaxnB) +* Improved error message when versioning is enabled for a dataset previously saved as non-versioned ([Issue #625](https://github.com/kedro-org/kedro/issues/625)). ## Minor breaking changes to the API @@ -116,7 +707,7 @@ ## Thanks for supporting contributions [Lou Kratz](https://github.com/lou-k), -[Lucas Jamar](https://github.com/lucasjamar/) +[Lucas Jamar](https://github.com/lucasjamar) # Release 0.17.3 @@ -125,11 +716,11 @@ * Added a `before_command_run` hook for plugins to add extra behaviour before Kedro CLI commands run. * `pipelines` from `pipeline_registry.py` and `register_pipeline` hooks are now loaded lazily when they are first accessed, not on startup: -```python -from kedro.framework.project import pipelines + ```python + from kedro.framework.project import pipelines -print(pipelines["__default__"]) # pipeline loading is only triggered here -``` + print(pipelines["__default__"]) # pipeline loading is only triggered here + ``` ## Bug fixes and other changes * `TemplatedConfigLoader` now correctly inserts default values when no globals are supplied. @@ -139,11 +730,11 @@ print(pipelines["__default__"]) # pipeline loading is only triggered here * CLI commands from sources with the same name will show under one list in the help screen. * The setup of a Kedro project, including adding src to path and configuring settings, is now handled via the `bootstrap_project` method. * `configure_project` is invoked if a `package_name` is supplied to `KedroSession.create`. This is added for backward-compatibility purpose to support a workflow that creates `Session` manually. It will be removed in `0.18.0`. -* Stopped swallowing up all `ModuleNotFoundError` if `register_pipelines` not found, so that a more helpful error message will appear when a dependency is missing, e.g. [Issue #722](https://github.com/quantumblacklabs/kedro/issues/722). +* Stopped swallowing up all `ModuleNotFoundError` if `register_pipelines` not found, so that a more helpful error message will appear when a dependency is missing, e.g. [Issue #722](https://github.com/kedro-org/kedro/issues/722). * When `kedro new` is invoked using a configuration yaml file, `output_dir` is no longer a required key; by default the current working directory will be used. * When `kedro new` is invoked using a configuration yaml file, the appropriate `prompts.yml` file is now used for validating the provided configuration. Previously, validation was always performed against the kedro project template `prompts.yml` file. * When a relative path to a starter template is provided, `kedro new` now generates user prompts to obtain configuration rather than supplying empty configuration. -* Fixed error when using starters on Windows with Python 3.7 (Issue [#722](https://github.com/quantumblacklabs/kedro/issues/722)). +* Fixed error when using starters on Windows with Python 3.7 (Issue [#722](https://github.com/kedro-org/kedro/issues/722)). 
* Fixed decoding error of config files that contain accented characters by opening them for reading in UTF-8. * Fixed an issue where `after_dataset_loaded` run would finish before a dataset is actually loaded when using `--async` flag. @@ -161,11 +752,11 @@ print(pipelines["__default__"]) # pipeline loading is only triggered here * Added support for `compress_pickle` backend to `PickleDataSet`. * Enabled loading pipelines without creating a `KedroContext` instance: -```python -from kedro.framework.project import pipelines + ```python + from kedro.framework.project import pipelines -print(pipelines) -``` + print(pipelines) + ``` * Projects generated with kedro>=0.17.2: - should define pipelines in `pipeline_registry.py` rather than `hooks.py`. @@ -183,7 +774,7 @@ print(pipelines) * `kedro.framework.context.KedroContext.run` will be removed in release 0.18.0. ## Thanks for supporting contributions -[Sasaki Takeru](https://github.com/takeru/) +[Sasaki Takeru](https://github.com/takeru) # Release 0.17.1 @@ -194,11 +785,12 @@ print(pipelines) * Added the `env` and `extra_params` arguments to `register_config_loader` hook. * Refactored the way `settings` are loaded. You will now be able to run: -```python -from kedro.framework.project import settings + ```python + from kedro.framework.project import settings + + print(settings.CONF_ROOT) + ``` -print(settings.CONF_ROOT) -``` * Added a check on `kedro.runner.parallel_runner.ParallelRunner` which checks datasets for the `_SINGLE_PROCESS` attribute in the `_validate_catalog` method. If this attribute is set to `True` in an instance of a dataset (e.g. `SparkDataSet`), the `ParallelRunner` will raise an `AttributeError`. * Any user-defined dataset that should not be used with `ParallelRunner` may now have the `_SINGLE_PROCESS` attribute set to `True`. @@ -220,7 +812,7 @@ print(settings.CONF_ROOT) * This release has broken the `kedro ipython` and `kedro jupyter` workflows. To fix this, follow the instructions in the migration guide below. * You will also need to upgrade `kedro-viz` to 3.10.1 if you use the `%run_viz` line magic in Jupyter Notebook. -> *Note:* If you're using the `ipython` [extension](https://kedro.readthedocs.io/en/stable/11_tools_integration/02_ipython.html#ipython-extension) instead, you will not encounter this problem. +> *Note:* If you're using the `ipython` [extension](https://docs.kedro.org/en/0.17.1/11_tools_integration/02_ipython.html#ipython-extension) instead, you will not encounter this problem. ## Migration guide You will have to update the file `/.ipython/profile_default/startup/00-kedro-init.py` in order to make `kedro ipython` and/or `kedro jupyter` work. Add the following line before the `KedroSession` is created: @@ -252,15 +844,15 @@ from kedro.framework.session import KedroSession ## Major features and improvements -* In a significant change, [we have introduced `KedroSession`](https://kedro.readthedocs.io/en/stable/04_kedro_project_setup/03_session.html) which is responsible for managing the lifecycle of a Kedro run. -* Created a new Kedro Starter: `kedro new --starter=mini-kedro`. It is possible to [use the DataCatalog as a standalone component](https://github.com/quantumblacklabs/kedro-starters/tree/master/mini-kedro) in a Jupyter notebook and transition into the rest of the Kedro framework. +* In a significant change, [we have introduced `KedroSession`](https://docs.kedro.org/en/0.17.0/04_kedro_project_setup/03_session.html) which is responsible for managing the lifecycle of a Kedro run. 
+* Created a new Kedro Starter: `kedro new --starter=mini-kedro`. It is possible to [use the DataCatalog as a standalone component](https://github.com/kedro-org/kedro-starters/tree/master/mini-kedro) in a Jupyter notebook and transition into the rest of the Kedro framework. * Added `DatasetSpecs` with Hooks to run before and after datasets are loaded from/saved to the catalog. * Added a command: `kedro catalog create`. For a registered pipeline, it creates a `//catalog/.yml` configuration file with `MemoryDataSet` datasets for each dataset that is missing from `DataCatalog`. * Added `settings.py` and `pyproject.toml` (to replace `.kedro.yml`) for project configuration, in line with Python best practice. * `ProjectContext` is no longer needed, unless for very complex customisations. `KedroContext`, `ProjectHooks` and `settings.py` together implement sensible default behaviour. As a result `context_path` is also now an _optional_ key in `pyproject.toml`. * Removed `ProjectContext` from `src//run.py`. * `TemplatedConfigLoader` now supports [Jinja2 template syntax](https://jinja.palletsprojects.com/en/2.11.x/templates/) alongside its original syntax. -* Made [registration Hooks](https://kedro.readthedocs.io/en/stable/07_extend_kedro/04_hooks.html#registration-hooks) mandatory, as the only way to customise the `ConfigLoader` or the `DataCatalog` used in a project. If no such Hook is provided in `src//hooks.py`, a `KedroContextError` is raised. There are sensible defaults defined in any project generated with Kedro >= 0.16.5. +* Made [registration Hooks](https://docs.kedro.org/en/0.17.0/07_extend_kedro/02_hooks.html#registration-hooks) mandatory, as the only way to customise the `ConfigLoader` or the `DataCatalog` used in a project. If no such Hook is provided in `src//hooks.py`, a `KedroContextError` is raised. There are sensible defaults defined in any project generated with Kedro >= 0.16.5. ## Bug fixes and other changes @@ -313,14 +905,14 @@ from kedro.framework.session import KedroSession ## Migration guide from Kedro 0.16.* to 0.17.* -**Reminder:** Our documentation on [how to upgrade Kedro](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#how-do-i-upgrade-kedro) covers a few key things to remember when updating any Kedro version. +**Reminder:** Our documentation on [how to upgrade Kedro](https://docs.kedro.org/en/0.17.0/12_faq/01_faq.html#how-do-i-upgrade-kedro) covers a few key things to remember when updating any Kedro version. The Kedro 0.17.0 release contains some breaking changes. If you update Kedro to 0.17.0 and then try to work with projects created against earlier versions of Kedro, you may encounter some issues when trying to run `kedro` commands in the terminal for that project. Here's a short guide to getting your projects running against the new version of Kedro. >*Note*: As always, if you hit any problems, please check out our documentation: ->* [How can I find out more about Kedro?](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#how-can-i-find-out-more-about-kedro) ->* [How can I get my questions answered?](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#how-can-i-get-my-question-answered). +>* [How can I find out more about Kedro?](https://docs.kedro.org/en/0.17.0/12_faq/01_faq.html#how-can-i-find-out-more-about-kedro) +>* [How can I get my questions answered?](https://docs.kedro.org/en/0.17.0/12_faq/01_faq.html#how-can-i-get-my-question-answered). 
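Because registration Hooks are now the only supported way to customise the `ConfigLoader` or the `DataCatalog` (see the feature list above), it may help to keep a minimal `hooks.py` in mind while working through the steps below. The sketch below is illustrative only and assumes the default hook signatures from the 0.17.0 project template; check your generated project for the exact arguments:

```python
from kedro.config import ConfigLoader
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog


class ProjectHooks:
    @hook_impl
    def register_config_loader(self, conf_paths):
        # Return the ConfigLoader (or a TemplatedConfigLoader) for the project.
        return ConfigLoader(conf_paths)

    @hook_impl
    def register_catalog(self, catalog, credentials, load_versions, save_version, journal):
        # Build the DataCatalog from the configuration passed in by the framework.
        return DataCatalog.from_config(
            catalog, credentials, load_versions, save_version, journal
        )
```

The migration steps below that mention `register_config_loader` and `register_catalog` refer to these two hook implementations.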
To get an existing Kedro project to work after you upgrade to Kedro 0.17.0, we recommend that you create a new project against Kedro 0.17.0 and move the code from your existing project into it. Let's go through the changes, but first, note that if you create a new Kedro project with Kedro 0.17.0 you will not be asked whether you want to include the boilerplate code for the Iris dataset example. We've removed this option (you should now use a Kedro starter if you want to create a project that is pre-populated with code). @@ -329,12 +921,12 @@ To create a new, blank Kedro 0.17.0 project to drop your existing code into, you * **Update `pyproject.toml`**: Copy the following three keys from the `.kedro.yml` of your existing Kedro project into the `pyproject.toml` file of your new Kedro 0.17.0 project: -```toml -[tools.kedro] -package_name = "" -project_name = "" -project_version = "0.17.0" -``` + ```toml + [tools.kedro] + package_name = "" + project_name = "" + project_version = "0.17.0" + ``` Check your source directory. If you defined a different source directory (`source_dir`), make sure you also move that to `pyproject.toml`. @@ -352,36 +944,35 @@ Check your source directory. If you defined a different source directory (`sourc * **Update `settings.py`**: For example, if you specified additional Hook implementations in `hooks`, or listed plugins under `disable_hooks_by_plugin` in your `.kedro.yml`, you will need to move them to `settings.py` accordingly: -```python -from .hooks import MyCustomHooks, ProjectHooks - + ```python + from .hooks import MyCustomHooks, ProjectHooks -HOOKS = (ProjectHooks(), MyCustomHooks()) + HOOKS = (ProjectHooks(), MyCustomHooks()) -DISABLE_HOOKS_FOR_PLUGINS = ("my_plugin1",) -``` + DISABLE_HOOKS_FOR_PLUGINS = ("my_plugin1",) + ``` * **Migration for `node` names**. From 0.17.0 the only allowed characters for node names are letters, digits, hyphens, underscores and/or fullstops. If you have previously defined node names that have special characters, spaces or other characters that are no longer permitted, you will need to rename those nodes. * **Copy changes to `kedro_cli.py`**. If you previously customised the `kedro run` command or added more CLI commands to your `kedro_cli.py`, you should move them into `/src//cli.py`. Note, however, that the new way to run a Kedro pipeline is via a `KedroSession`, rather than using the `KedroContext`: -```python -with KedroSession.create(package_name=...) as session: - session.run() -``` + ```python + with KedroSession.create(package_name=...) as session: + session.run() + ``` * **Copy changes made to `ConfigLoader`**. If you have defined a custom class, such as `TemplatedConfigLoader`, by overriding `ProjectContext._create_config_loader`, you should move the contents of the function in `src//hooks.py`, under `register_config_loader`. * **Copy changes made to `DataCatalog`**. Likewise, if you have `DataCatalog` defined with `ProjectContext._create_catalog`, you should copy-paste the contents into `register_catalog`. -* **Optional**: If you have plugins such as [Kedro-Viz](https://github.com/quantumblacklabs/kedro-viz) installed, it's likely that Kedro 0.17.0 won't work with their older versions, so please either upgrade to the plugin's newest version or follow their migration guides. 
+* **Optional**: If you have plugins such as [Kedro-Viz](https://github.com/kedro-org/kedro-viz) installed, it's likely that Kedro 0.17.0 won't work with their older versions, so please either upgrade to the plugin's newest version or follow their migration guides. # Release 0.16.6 ## Major features and improvements -* Added documentation with a focus on single machine and distributed environment deployment; the series includes Docker, Argo, Prefect, Kubeflow, AWS Batch, AWS Sagemaker and extends our section on Databricks -* Added [kedro-starter-spaceflights](https://github.com/quantumblacklabs/kedro-starter-spaceflights/) alias for generating a project: `kedro new --starter spaceflights`. +* Added documentation with a focus on single machine and distributed environment deployment; the series includes Docker, Argo, Prefect, Kubeflow, AWS Batch, AWS Sagemaker and extends our section on Databricks. +* Added [kedro-starter-spaceflights](https://github.com/kedro-org/kedro-starter-spaceflights/) alias for generating a project: `kedro new --starter spaceflights`. ## Bug fixes and other changes * Fixed `TypeError` when converting dict inputs to a node made from a wrapped `partial` function. @@ -396,7 +987,7 @@ with KedroSession.create(package_name=...) as session: * Improved error messages for incorrect parameters passed into a node. * Fixed issue with saving a `TensorFlowModelDataset` in the HDF5 format with versioning enabled. * Added missing `run_result` argument in `after_pipeline_run` Hooks spec. -* Fixed a bug in IPython script that was causing context hooks to be registered twice. To apply this fix to a project generated with an older Kedro version, apply the same changes made in [this PR](https://github.com/quantumblacklabs/kedro-starter-pandas-iris/pull/16) to your `00-kedro-init.py` file. +* Fixed a bug in IPython script that was causing context hooks to be registered twice. To apply this fix to a project generated with an older Kedro version, apply the same changes made in [this PR](https://github.com/kedro-org/kedro-starter-pandas-iris/pull/16) to your `00-kedro-init.py` file. * Improved documentation. ## Breaking changes to the API @@ -419,7 +1010,7 @@ with KedroSession.create(package_name=...) as session: * `register_pipelines()`, to replace `_get_pipelines()` * `register_config_loader()`, to replace `_create_config_loader()` * `register_catalog()`, to replace `_create_catalog()` -These can be defined in `src//hooks.py` and added to `.kedro.yml` (or `pyproject.toml`). The order of execution is: plugin hooks, `.kedro.yml` hooks, hooks in `ProjectContext.hooks`. +These can be defined in `src//hooks.py` and added to `.kedro.yml` (or `pyproject.toml`). The order of execution is: plugin hooks, `.kedro.yml` hooks, hooks in `ProjectContext.hooks`. * Added ability to disable auto-registered Hooks using `.kedro.yml` (or `pyproject.toml`) configuration file. ## Bug fixes and other changes @@ -521,7 +1112,7 @@ package_name: "" * `kedro jupyter` CLI command improvements: - Improved error message when running `kedro jupyter notebook`, `kedro jupyter lab` or `kedro ipython` with Jupyter/IPython dependencies not being installed. - Fixed `%run_viz` line magic for showing kedro viz inside a Jupyter notebook. For the fix to be applied on existing Kedro project, please see the migration guide. - - Fixed the bug in IPython startup script ([issue 298](https://github.com/quantumblacklabs/kedro/issues/298)). 
+ - Fixed the bug in IPython startup script ([issue 298](https://github.com/kedro-org/kedro/issues/298)). * Documentation improvements: - Updated community-generated content in FAQ. - Added [find-kedro](https://github.com/WaylonWalker/find-kedro) and [kedro-static-viz](https://github.com/WaylonWalker/kedro-static-viz) to the list of community plugins. @@ -533,7 +1124,7 @@ package_name: "" #### Guide to apply the fix for `%run_viz` line magic in existing project -Even though this release ships a fix for project generated with `kedro==0.16.2`, after upgrading, you will still need to make a change in your existing project if it was generated with `kedro>=0.16.0,<=0.16.1` for the fix to take effect. Specifically, please change the content of your project's IPython init script located at `.ipython/profile_default/startup/00-kedro-init.py` with the content of [this file](https://github.com/quantumblacklabs/kedro/blob/0.16.2/kedro/templates/project/%7B%7B%20cookiecutter.repo_name%20%7D%7D/.ipython/profile_default/startup/00-kedro-init.py). You will also need `kedro-viz>=3.3.1`. +Even though this release ships a fix for project generated with `kedro==0.16.2`, after upgrading, you will still need to make a change in your existing project if it was generated with `kedro>=0.16.0,<=0.16.1` for the fix to take effect. Specifically, please change the content of your project's IPython init script located at `.ipython/profile_default/startup/00-kedro-init.py` with the content of [this file](https://github.com/kedro-org/kedro/blob/0.16.2/kedro/templates/project/%7B%7B%20cookiecutter.repo_name%20%7D%7D/.ipython/profile_default/startup/00-kedro-init.py). You will also need `kedro-viz>=3.3.1`. ## Thanks for supporting contributions [Miguel Rodriguez Gutierrez](https://github.com/MigQ2), [Joel Schwarzmann](https://github.com/datajoely), [w0rdsm1th](https://github.com/w0rdsm1th), [Deepyaman Datta](https://github.com/deepyaman), [Tam-Sanh Nguyen](https://github.com/tamsanh), [Marcus Gawronsky](https://github.com/marcusinthesky) @@ -581,7 +1172,7 @@ Even though this release ships a fix for project generated with `kedro==0.16.2`, * Added `joblib` backend support to `pickle.PickleDataSet`. * Added versioning support to `MatplotlibWriter` dataset. * Added the ability to install dependencies for a given dataset with more granularity, e.g. `pip install "kedro[pandas.ParquetDataSet]"`. -* Added the ability to specify extra arguments, e.g. `encoding` or `compression`, for `fsspec.spec.AbstractFileSystem.open()` calls when loading/saving a dataset. See Example 3 under [docs](https://kedro.readthedocs.io/en/stable/04_user_guide/04_data_catalog.html#using-the-data-catalog-with-the-yaml-api). +* Added the ability to specify extra arguments, e.g. `encoding` or `compression`, for `fsspec.spec.AbstractFileSystem.open()` calls when loading/saving a dataset. See Example 3 under [docs](https://docs.kedro.org/en/0.16.0/04_user_guide/04_data_catalog.html#use-the-data-catalog-with-the-yaml-api). ### Other * Added `namespace` property on ``Node``, related to the modular pipeline where the node belongs. @@ -590,14 +1181,14 @@ Even though this release ships a fix for project generated with `kedro==0.16.2`, * Removed the requirement to have all dependencies for a dataset module to use only a subset of the datasets within. * Added support for `pandas>=1.0`. * Enabled Python 3.8 compatibility. 
_Please note that a Spark workflow may be unreliable for this Python version as `pyspark` is not fully-compatible with 3.8 yet._
-* Renamed "features" layer to "feature" layer to be consistent with (most) other layers and the [relevant FAQ](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention).
+* Renamed "features" layer to "feature" layer to be consistent with (most) other layers and the [relevant FAQ](https://docs.kedro.org/en/0.16.0/06_resources/01_faq.html#what-is-data-engineering-convention).
## Bug fixes and other changes
* Fixed a bug where a new version created mid-run by an external system caused inconsistencies in the load versions used in the current run.
* Documentation improvements
  * Added instructions in the documentation on how to create a custom runner.
  * Updated contribution process in `CONTRIBUTING.md` - added Developer Workflow.
- * Documented installation of development version of Kedro in the [FAQ section](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#how-can-i-use-a-development-version-of-kedro).
+ * Documented installation of development version of Kedro in the [FAQ section](https://docs.kedro.org/en/0.16.0/06_resources/01_faq.html#how-can-i-use-development-version-of-kedro).
  * Added missing `_exists` method to `MyOwnDataSet` example in 04_user_guide/08_advanced_io.
* Fixed a bug where `PartitionedDataSet` and `IncrementalDataSet` were not working with `s3a` or `s3n` protocol.
* Added ability to read partitioned parquet file from a directory in `pandas.ParquetDataSet`.
@@ -632,7 +1223,7 @@ Even though this release ships a fix for project generated with `kedro==0.16.2`,
#### General Migration
-**Reminder:** [How do I upgrade Kedro](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#how-do-i-upgrade-kedro) covers a few key things to remember when updating any Kedro version.
+**Reminder:** [How do I upgrade Kedro](https://docs.kedro.org/en/0.16.0/06_resources/01_faq.html#how-do-i-upgrade-kedro) covers a few key things to remember when updating any Kedro version.
#### Migration for datasets
@@ -676,7 +1267,7 @@ result = pipeline(
#### Migration for decorators, color logger, transformers etc.
Since some modules were moved to other locations you need to update import paths appropriately.
-You can find the list of moved files in the [`0.15.6` release notes](https://github.com/quantumblacklabs/kedro/releases/tag/0.15.6) under the section titled `Files with a new location`.
+You can find the list of moved files in the [`0.15.6` release notes](https://github.com/kedro-org/kedro/releases/tag/0.15.6) under the section titled `Files with a new location`.
#### Migration for CLI and KEDRO_ENV environment variable
> Note: If you haven't made significant changes to your `kedro_cli.py`, it may be easier to simply copy the updated `kedro_cli.py` and `.ipython/profile_default/startup/00-kedro-init.py` from GitHub or a newly generated project into your old project.
@@ -737,7 +1328,7 @@ You can find the list of moved files in the [`0.15.6` release notes](https://git
# 0.15.6
## Major features and improvements
-> _TL;DR_ We're launching [`kedro.extras`](https://github.com/quantumblacklabs/kedro/tree/master/extras), the new home for our revamped series of datasets, decorators and dataset transformers.
The datasets in [`kedro.extras.datasets`](https://github.com/quantumblacklabs/kedro/tree/master/extras/datasets) use [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to access a variety of data stores including local file systems, network file systems, cloud object stores (including S3 and GCP), and Hadoop, read more about this [**here**](https://kedro.readthedocs.io/en/latest/04_user_guide/04_data_catalog.html#specifying-the-location-of-the-dataset). The change will allow [#178](https://github.com/quantumblacklabs/kedro/issues/178) to happen in the next major release of Kedro. +> _TL;DR_ We're launching [`kedro.extras`](https://github.com/kedro-org/kedro/tree/master/extras), the new home for our revamped series of datasets, decorators and dataset transformers. The datasets in [`kedro.extras.datasets`](https://github.com/kedro-org/kedro/tree/master/extras/datasets) use [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to access a variety of data stores including local file systems, network file systems, cloud object stores (including S3 and GCP), and Hadoop, read more about this [**here**](https://docs.kedro.org/en/0.15.6/04_user_guide/04_data_catalog.html#specifying-the-location-of-the-dataset). The change will allow [#178](https://github.com/kedro-org/kedro/issues/178) to happen in the next major release of Kedro. An example of this new system can be seen below, loading the CSV `SparkDataSet` from S3: @@ -749,13 +1340,13 @@ weather: file_format: csv ``` -You can also load data incrementally whenever it is dumped into a directory with the extension to [`PartionedDataSet`](https://kedro.readthedocs.io/en/latest/04_user_guide/08_advanced_io.html#partitioned-dataset), a feature that allows you to load a directory of files. The [`IncrementalDataSet`](https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#incremental-loads-with-incrementaldataset) stores the information about the last processed partition in a `checkpoint`, read more about this feature [**here**](https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#incremental-loads-with-incrementaldataset). +You can also load data incrementally whenever it is dumped into a directory with the extension to [`PartionedDataSet`](https://docs.kedro.org/en/0.15.6/04_user_guide/08_advanced_io.html#partitioned-dataset), a feature that allows you to load a directory of files. The [`IncrementalDataSet`](https://docs.kedro.org/en/0.15.6/04_user_guide/08_advanced_io.html#incremental-loads-with-incrementaldataset) stores the information about the last processed partition in a `checkpoint`, read more about this feature [**here**](https://docs.kedro.org/en/0.15.6/04_user_guide/08_advanced_io.html#incremental-loads-with-incrementaldataset). ### New features -* Added `layer` attribute for datasets in `kedro.extras.datasets` to specify the name of a layer according to [data engineering convention](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention), this feature will be passed to [`kedro-viz`](https://github.com/quantumblacklabs/kedro-viz) in future releases. +* Added `layer` attribute for datasets in `kedro.extras.datasets` to specify the name of a layer according to [data engineering convention](https://docs.kedro.org/en/0.15.6/06_resources/01_faq.html#what-is-data-engineering-convention), this feature will be passed to [`kedro-viz`](https://github.com/kedro-org/kedro-viz) in future releases. 
* Enabled loading a particular version of a dataset in Jupyter Notebooks and iPython, using `catalog.load("dataset_name", version="<2019-12-13T15.08.09.255Z>")`. -* Added property `run_id` on `ProjectContext`, used for versioning using the [`Journal`](https://kedro.readthedocs.io/en/stable/04_user_guide/13_journal.html). To customise your journal `run_id` you can override the private method `_get_run_id()`. +* Added property `run_id` on `ProjectContext`, used for versioning using the [`Journal`](https://docs.kedro.org/en/0.15.6/04_user_guide/13_journal.html). To customise your journal `run_id` you can override the private method `_get_run_id()`. * Added the ability to install all optional kedro dependencies via `pip install "kedro[all]"`. * Modified the `DataCatalog`'s load order for datasets, loading order is the following: - `kedro.io` @@ -909,7 +1500,7 @@ You can also load data incrementally whenever it is dumped into a directory with * `kedro jupyter` now gives the default kernel a sensible name. * `Pipeline.name` has been deprecated in favour of `Pipeline.tags`. * Reuse pipelines within a Kedro project using `Pipeline.transform`, it simplifies dataset and node renaming. -* Added Jupyter Notebook line magic (`%run_viz`) to run `kedro viz` in a Notebook cell (requires [`kedro-viz`](https://github.com/quantumblacklabs/kedro-viz) version 3.0.0 or later). +* Added Jupyter Notebook line magic (`%run_viz`) to run `kedro viz` in a Notebook cell (requires [`kedro-viz`](https://github.com/kedro-org/kedro-viz) version 3.0.0 or later). * Added the following datasets: - `NetworkXLocalDataSet` in `kedro.contrib.io.networkx` to load and save local graphs (JSON format) via NetworkX. (by [@josephhaaga](https://github.com/josephhaaga)) - `SparkHiveDataSet` in `kedro.contrib.io.pyspark.SparkHiveDataSet` allowing usage of Spark and insert/upsert on non-transactional Hive tables. @@ -1028,7 +1619,7 @@ The breaking changes were introduced in the following project template files: - `/.ipython/profile_default/startup/00-kedro-init.py` - `/kedro_cli.py` - `/src/tests/test_run.py` -- `/src//run.py` +- `/src//run.py` - `/.kedro.yml` (new file) The easiest way to migrate your project from Kedro 0.14.* to Kedro 0.15.0 is to create a new project (by using `kedro new`) and move code and files bit by bit as suggested in the detailed guide below: @@ -1071,7 +1662,7 @@ If you defined any custom dataset classes which support versioning in your proje 5. Ensure you convert the output of `_get_load_path` and `_get_save_path` appropriately, as these now return [`PurePath`s](https://docs.python.org/3/library/pathlib.html#pure-paths) instead of strings. 6. Make sure `_check_paths_consistency` is called with [`PurePath`s](https://docs.python.org/3/library/pathlib.html#pure-paths) as input arguments, instead of strings. -These steps should have brought your project to Kedro 0.15.0. There might be some more minor tweaks needed as every project is unique, but now you have a pretty solid base to work with. If you run into any problems, please consult the [Kedro documentation](https://kedro.readthedocs.io). +These steps should have brought your project to Kedro 0.15.0. There might be some more minor tweaks needed as every project is unique, but now you have a pretty solid base to work with. If you run into any problems, please consult the [Kedro documentation](https://docs.kedro.org). 
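As an illustration of step 5 above, a custom versioned dataset that previously worked with string paths might now look like the sketch below. This is only a rough sketch: the import locations, the base-class constructor and the `_filepath`/`_version` attributes are recalled from the 0.15.x API and should be treated as assumptions, and `MyOwnDataSet` is purely illustrative.

```python
from pathlib import Path, PurePosixPath

from kedro.io import AbstractVersionedDataSet, Version


class MyOwnDataSet(AbstractVersionedDataSet):
    """Toy text dataset, used only to show the string-to-PurePath change."""

    def __init__(self, filepath: str, version: Version = None):
        # The versioning machinery now works with PurePath objects rather than strings.
        super().__init__(PurePosixPath(filepath), version)

    def _load(self) -> str:
        # _get_load_path() now returns a PurePath, so convert it explicitly
        # before handing it to IO that expects a concrete path.
        load_path = Path(self._get_load_path())
        return load_path.read_text()

    def _save(self, data: str) -> None:
        save_path = Path(self._get_save_path())
        save_path.parent.mkdir(parents=True, exist_ok=True)
        save_path.write_text(data)

    def _describe(self):
        return dict(filepath=self._filepath, version=self._version)
```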
## Thanks for supporting contributions [Dmitry Vukolov](https://github.com/dvukolov), [Jo Stichbury](https://github.com/stichbury), [Angus Williams](https://github.com/awqb), [Deepyaman Datta](https://github.com/deepyaman), [Mayur Chougule](https://github.com/mmchougule), [Marat Kopytjuk](https://github.com/kopytjuk), [Evan Miller](https://github.com/evanmiller29), [Yusuke Minami](https://github.com/Minyus) diff --git a/docs/build-docs.sh b/docs/build-docs.sh index 8efe355a7d..d55076e118 100755 --- a/docs/build-docs.sh +++ b/docs/build-docs.sh @@ -1,33 +1,5 @@ #!/usr/bin/env bash -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - set -e # Exit script if you try to use an uninitialized variable. @@ -35,25 +7,8 @@ set -o nounset action=$1 -pip install -e ".[docs]" -pip install -r test_requirements.txt -python -m ipykernel install --user --name=kedro --display-name=Kedro - -# Move some files around. We need a separate build directory, which would -# have all the files, build scripts would shuffle the files, -# we don't want that happening on the actual code locally. -# When running on ReadTheDocs, sphinx-build would run directly on the original files, -# but we don't care about the code state there. 
-rm -rf docs/build -mkdir docs/build/ -cp -r docs/_templates docs/conf.py docs/*.svg docs/*.json docs/build/ - if [ "$action" == "linkcheck" ]; then - sphinx-build -c docs/ -WETan -j auto -D language=en -b linkcheck docs/build/ docs/build/html + sphinx-build -WETan -j auto -D language=en -b linkcheck -d docs/build/doctrees docs/source docs/build/linkcheck elif [ "$action" == "docs" ]; then - sphinx-build -c docs/ -WETa -j auto -D language=en docs/build/ docs/build/html + sphinx-build -WETa -j auto -D language=en -b html -d docs/build/doctrees docs/source docs/build/html fi - -# Clean up build artefacts -rm -rf docs/build/html/_sources -rm -rf docs/build/[0-9][0-9]_* diff --git a/docs/draft/installed-kedro-project.puml b/docs/draft/installed-kedro-project.puml deleted file mode 100644 index d2b768f36c..0000000000 --- a/docs/draft/installed-kedro-project.puml +++ /dev/null @@ -1,13 +0,0 @@ -@startuml -title "Installed Kedro project" - -participant "third-party Python script" as script -participant "Directory with Kedro conf/ in it" as curr_dir -participant "KedroSession" as session - -script->script: run third-party script -script->curr_dir: get path to the project config -script->session: create a session with Kedro project name and project config dir -session->run: run a pipeline and/or nodes - -@enduml diff --git a/docs/draft/kedro-ipython.puml b/docs/draft/kedro-ipython.puml deleted file mode 100644 index 2b51539a4b..0000000000 --- a/docs/draft/kedro-ipython.puml +++ /dev/null @@ -1,29 +0,0 @@ -@startuml -title "$ kedro ipython" - -participant "$ kedro ipython" as cli -participant "Environment variables" as env -participant "IPython" as ipython -participant "00-kedro-init.py\nreload_kedro" as entrypoint -participant "Hook manager" as hook_manager -participant "Kedro project directory" as project -participant "KedroSession" as session -participant "KedroContext" as context - -cli->cli: Check if IPython is importable -cli->env: Set IPYTHONDIR to metadata.project_path / ".ipython" -cli->env: Set KEDRO_ENV to the chosen Kedro environment -cli->cli: Print an info message -cli->ipython: Start ipython -ipython->entrypoint: load startup script -entrypoint->entrypoint: import Kedro -entrypoint->hook_manager: clear the hook manager -entrypoint->project: bootstrap the project -entrypoint->entrypoint: remove imported project package modules -entrypoint->session: create a KedroSession -entrypoint->session: activate the session -entrypoint->session: load KedroContext -entrypoint->context: get the data catalog -entrypoint->entrypoint: expose session, context and catalog variables -entrypoint->entrypoint: register reload_kedro line magic -@enduml diff --git a/docs/draft/kedro-no-project.puml b/docs/draft/kedro-no-project.puml deleted file mode 100644 index 2903487209..0000000000 --- a/docs/draft/kedro-no-project.puml +++ /dev/null @@ -1,21 +0,0 @@ -@startuml -title "$ kedro\ndirectory without Kedro project" - -participant "$ kedro" as kedro -participant "setup.py\nkedro = kedro.framework.cli:main" as entrypoint -participant "Kedro Plugins\nentry_point = kedro.init" as init_plugins -participant "Kedro CLI\nglobal commands\ninfo, new, docs, starter" as kedro_cli -participant "Kedro Plugins\nentry_point = kedro.global_commands" as global_plugins -participant "Current directory\npyproject.toml" as pyproject.toml -participant "Click" as click - - -kedro->entrypoint: Python calls this - -entrypoint->init_plugins: load and run all installed plugins -entrypoint->kedro_cli: collect built-in commands 
-entrypoint->global_plugins: load and collect global plugin commands -entrypoint->pyproject.toml: check current dir for a Kedro project -pyproject.toml-->>entrypoint: not found or missing [tool.kedro] -entrypoint->click: combine all command collections and run click -@enduml diff --git a/docs/draft/kedro-plugin.puml b/docs/draft/kedro-plugin.puml deleted file mode 100644 index cd4ca0a8d4..0000000000 --- a/docs/draft/kedro-plugin.puml +++ /dev/null @@ -1,15 +0,0 @@ -@startuml -title "$ kedro plugin" - -participant "$ kedro plugin" as cli -participant "See kedro-with-project.puml for details" as prelude -participant "Kedro Plugin\nentry_point = kedro.project_commands" as project_plugin -participant "Click context" as click -participant "KedroSession" as session - -cli->prelude: prepare click commands as prelude to this -prelude->project_plugin: execute plugin click command -project_plugin->click: get ProjectMetadata from the click context -project_plugin->project_plugin: plugin code -project_plugin->session: need to create KedroSession for all runtime config and info -@enduml diff --git a/docs/draft/kedro-run.puml b/docs/draft/kedro-run.puml deleted file mode 100644 index 5b70d14cf0..0000000000 --- a/docs/draft/kedro-run.puml +++ /dev/null @@ -1,26 +0,0 @@ -@startuml -title "$ kedro run" - -participant "$ kedro run" as cli -participant "See kedro-with-project.puml for details" as prelude -participant "Project directory\ncli.py" as project_cli -participant "KedroSession" as session -participant "KedroContext" as context -participant "Runner" as runner -participant "Hook manager" as hooks - -cli->prelude: prepare click commands as prelude to this -prelude->project_cli: run -project_cli->session: create KedroSession -session->session: run -session->session: load KedroContext -session->context: get the selected pipeline -context->context: filter the pipeline based on command line arguments -session->context: get catalog with load version / save version -session->runner: create runner -session->hooks: get hook manager -hooks->hooks: before_pipeline_run -runner->runner: run the filtered pipeline with the catalog -hooks->hooks: on_pipeline_error (if runner fails) -hooks->hooks: after_pipeline_run -@enduml diff --git a/docs/draft/kedro-with-project.puml b/docs/draft/kedro-with-project.puml deleted file mode 100644 index 6f0c21f3e9..0000000000 --- a/docs/draft/kedro-with-project.puml +++ /dev/null @@ -1,25 +0,0 @@ -@startuml -title "$ kedro\ndirectory without Kedro project" - -participant "$ kedro" as kedro -participant "setup.py\nkedro = kedro.framework.cli:main" as entrypoint -participant "Kedro Plugins\nentry_point = kedro.init" as init_plugins -participant "Kedro CLI\nglobal commands\ninfo, new, docs, starter" as kedro_cli -participant "Kedro Plugins\nentry_point = kedro.global_commands" as global_plugins -participant "Current directory\npyproject.toml" as pyproject.toml -participant "Kedro Plugins\nentry_point = kedro.project_commands" as project_plugins -participant "Current directory\nKedro Project: cli.py" as kedro_project -participant "Click" as click - - -kedro->entrypoint: Python calls this -entrypoint->init_plugins: load and run all installed -entrypoint->kedro_cli: collect built-in commands -entrypoint->global_plugins: load and collect global plugin commands -entrypoint->pyproject.toml: check current dir for a Kedro project -entrypoint->pyproject.toml: bootstrap the project -entrypoint->entrypoint: add project metadata to the click cli context -entrypoint->project_plugins: load and 
collect project plugin commands -entrypoint->kedro_project: load and collect project cli commands -entrypoint->click: combine all command collections and run click -@enduml diff --git a/docs/draft/python-m-project.puml b/docs/draft/python-m-project.puml deleted file mode 100644 index 3e8d042b6d..0000000000 --- a/docs/draft/python-m-project.puml +++ /dev/null @@ -1,12 +0,0 @@ -@startuml -title "$ \n$ python -m .run" - -participant "$ \n$ python -m " as cli -participant "setup.py\n = .~__main__" as entrypoint -participant "KedroSession" as session - -cli->entrypoint: Python calls the entrypoint -entrypoint->session: create session -session->session: run - -@enduml diff --git a/docs/kedro_logo.svg b/docs/kedro_logo.svg deleted file mode 100644 index b1c608a708..0000000000 --- a/docs/kedro_logo.svg +++ /dev/null @@ -1 +0,0 @@ - diff --git a/docs/package-lock.json b/docs/package-lock.json deleted file mode 100644 index 4470931115..0000000000 --- a/docs/package-lock.json +++ /dev/null @@ -1,4147 +0,0 @@ -{ - "name": "kedro-docs", - "version": "1.0.0", - "lockfileVersion": 1, - "requires": true, - "dependencies": { - "accepts": { - "version": "1.3.7", - "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz", - "integrity": "sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==", - "dev": true, - "requires": { - "mime-types": "~2.1.24", - "negotiator": "0.6.2" - } - }, - "after": { - "version": "0.8.2", - "resolved": "https://registry.npmjs.org/after/-/after-0.8.2.tgz", - "integrity": "sha1-/ts5T58OAqqXaOcCvaI7UF+ufh8=", - "dev": true - }, - "ansi-regex": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-2.1.1.tgz", - "integrity": "sha1-w7M6te42DYbg5ijwRorn7yfWVN8=", - "dev": true - }, - "ansi-styles": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", - "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", - "dev": true - }, - "anymatch": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", - "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", - "dev": true, - "requires": { - "micromatch": "^3.1.4", - "normalize-path": "^2.1.1" - }, - "dependencies": { - "normalize-path": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-2.1.1.tgz", - "integrity": "sha1-GrKLVW4Zg2Oowab35vogE3/mrtk=", - "dev": true, - "requires": { - "remove-trailing-separator": "^1.0.1" - } - } - } - }, - "arr-diff": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", - "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", - "dev": true - }, - "arr-flatten": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/arr-flatten/-/arr-flatten-1.1.0.tgz", - "integrity": "sha512-L3hKV5R/p5o81R7O02IGnwpDmkp6E982XhtbuwSe3O4qOtMMMtodicASA1Cny2U+aCXcNpml+m4dPsvsJ3jatg==", - "dev": true - }, - "arr-union": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", - "integrity": "sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ=", - "dev": true - }, - "array-filter": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/array-filter/-/array-filter-0.0.1.tgz", - "integrity": "sha1-fajPLiZijtcygDWB/SH2fKzS7uw=", - "dev": true - }, - "array-map": { - "version": "0.0.0", - "resolved": "https://registry.npmjs.org/array-map/-/array-map-0.0.0.tgz", - "integrity": 
"sha1-iKK6tz0c97zVwbEYoAP2b2ZfpmI=", - "dev": true - }, - "array-reduce": { - "version": "0.0.0", - "resolved": "https://registry.npmjs.org/array-reduce/-/array-reduce-0.0.0.tgz", - "integrity": "sha1-FziZ0//Rx9k4PkR5Ul2+J4yrXys=", - "dev": true - }, - "array-unique": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", - "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", - "dev": true - }, - "arraybuffer.slice": { - "version": "0.0.7", - "resolved": "https://registry.npmjs.org/arraybuffer.slice/-/arraybuffer.slice-0.0.7.tgz", - "integrity": "sha512-wGUIVQXuehL5TCqQun8OW81jGzAWycqzFF8lFp+GOM5BXLYj3bKNsYC4daB7n6XjCqxQA/qgTJ+8ANR3acjrog==", - "dev": true - }, - "assign-symbols": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/assign-symbols/-/assign-symbols-1.0.0.tgz", - "integrity": "sha1-WWZ/QfrdTyDMvCu5a41Pf3jsA2c=", - "dev": true - }, - "async": { - "version": "1.5.2", - "resolved": "https://registry.npmjs.org/async/-/async-1.5.2.tgz", - "integrity": "sha1-7GphrlZIDAw8skHJVhjiCJL5Zyo=", - "dev": true - }, - "async-each": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/async-each/-/async-each-1.0.3.tgz", - "integrity": "sha512-z/WhQ5FPySLdvREByI2vZiTWwCnF0moMJ1hK9YQwDTHKh6I7/uSckMetoRGb5UBZPC1z0jlw+n/XCgjeH7y1AQ==", - "dev": true - }, - "async-each-series": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/async-each-series/-/async-each-series-0.1.1.tgz", - "integrity": "sha1-dhfBkXQB/Yykooqtzj266Yr+tDI=", - "dev": true - }, - "async-limiter": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.0.tgz", - "integrity": "sha512-jp/uFnooOiO+L211eZOoSyzpOITMXx1rBITauYykG3BRYPu8h0UcxsPNB04RR5vo4Tyz3+ay17tR6JVf9qzYWg==", - "dev": true - }, - "atob": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/atob/-/atob-2.1.2.tgz", - "integrity": "sha512-Wm6ukoaOGJi/73p/cl2GvLjTI5JM1k/O14isD73YML8StrH/7/lRFgmg8nICZgD3bZZvjwCGxtMOD3wWNAu8cg==", - "dev": true - }, - "axios": { - "version": "0.19.0", - "resolved": "https://registry.npmjs.org/axios/-/axios-0.19.0.tgz", - "integrity": "sha512-1uvKqKQta3KBxIz14F2v06AEHZ/dIoeKfbTRkK1E5oqjDnuEerLmYTgJB5AiQZHJcljpg1TuRzdjDR06qNk0DQ==", - "dev": true, - "requires": { - "follow-redirects": "1.5.10", - "is-buffer": "^2.0.2" - }, - "dependencies": { - "is-buffer": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz", - "integrity": "sha512-U15Q7MXTuZlrbymiz95PJpZxu8IlipAp4dtS3wOdgPXx3mqBnslrWU14kxfHB+Py/+2PVKSr37dMAgM2A4uArw==", - "dev": true - } - } - }, - "backo2": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/backo2/-/backo2-1.0.2.tgz", - "integrity": "sha1-MasayLEpNjRj41s+u2n038+6eUc=", - "dev": true - }, - "balanced-match": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", - "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=", - "dev": true - }, - "base": { - "version": "0.11.2", - "resolved": "https://registry.npmjs.org/base/-/base-0.11.2.tgz", - "integrity": "sha512-5T6P4xPgpp0YDFvSWwEZ4NoE3aM4QBQXDzmVbraCkFj8zHM+mba8SyqB5DbZWyR7mYHo6Y7BdQo3MoA4m0TeQg==", - "dev": true, - "requires": { - "cache-base": "^1.0.1", - "class-utils": "^0.3.5", - "component-emitter": "^1.2.1", - "define-property": "^1.0.0", - "isobject": "^3.0.1", - "mixin-deep": "^1.2.0", - "pascalcase": "^0.1.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": 
"https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - } - } - }, - "base64-arraybuffer": { - "version": "0.1.5", - "resolved": "https://registry.npmjs.org/base64-arraybuffer/-/base64-arraybuffer-0.1.5.tgz", - "integrity": "sha1-c5JncZI7Whl0etZmqlzUv5xunOg=", - "dev": true - }, - "base64id": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/base64id/-/base64id-1.0.0.tgz", - "integrity": "sha1-R2iMuZu2gE8OBtPnY7HDLlfY5rY=", - "dev": true - }, - "batch": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/batch/-/batch-0.6.1.tgz", - "integrity": "sha1-3DQxT05nkxgJP8dgJyUl+UvyXBY=", - "dev": true - }, - "better-assert": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/better-assert/-/better-assert-1.0.2.tgz", - "integrity": "sha1-QIZrnhueC1W0gYlDEeaPr/rrxSI=", - "dev": true, - "requires": { - "callsite": "1.0.0" - } - }, - "binary-extensions": { - "version": "1.13.1", - "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-1.13.1.tgz", - "integrity": "sha512-Un7MIEDdUC5gNpcGDV97op1Ywk748MpHcFTHoYs6qnj1Z3j7I53VG3nwZhKzoBZmbdRNnb6WRdFlwl7tSDuZGw==", - "dev": true - }, - "blob": { - "version": "0.0.5", - "resolved": "https://registry.npmjs.org/blob/-/blob-0.0.5.tgz", - "integrity": "sha512-gaqbzQPqOoamawKg0LGVd7SzLgXS+JH61oWprSLH+P+abTczqJbhTR8CmJ2u9/bUYNmHTGJx/UEmn6doAvvuig==", - "dev": true - }, - "brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, - "requires": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "braces": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", - "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", - "dev": true, - "requires": { - "arr-flatten": "^1.1.0", - "array-unique": "^0.3.2", - "extend-shallow": "^2.0.1", - "fill-range": "^4.0.0", - "isobject": "^3.0.1", - "repeat-element": "^1.1.2", - "snapdragon": "^0.8.1", - "snapdragon-node": "^2.0.1", - "split-string": "^3.0.2", - "to-regex": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": 
"https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "browser-sync": { - "version": "2.26.7", - "resolved": "https://registry.npmjs.org/browser-sync/-/browser-sync-2.26.7.tgz", - "integrity": "sha512-lY3emme0OyvA2ujEMpRmyRy9LY6gHLuTr2/ABxhIm3lADOiRXzP4dgekvnDrQqZ/Ec2Fz19lEjm6kglSG5766w==", - "dev": true, - "requires": { - "browser-sync-client": "^2.26.6", - "browser-sync-ui": "^2.26.4", - "bs-recipes": "1.3.4", - "bs-snippet-injector": "^2.0.1", - "chokidar": "^2.0.4", - "connect": "3.6.6", - "connect-history-api-fallback": "^1", - "dev-ip": "^1.0.1", - "easy-extender": "^2.3.4", - "eazy-logger": "^3", - "etag": "^1.8.1", - "fresh": "^0.5.2", - "fs-extra": "3.0.1", - "http-proxy": "1.15.2", - "immutable": "^3", - "localtunnel": "1.9.2", - "micromatch": "^3.1.10", - "opn": "5.3.0", - "portscanner": "2.1.1", - "qs": "6.2.3", - "raw-body": "^2.3.2", - "resp-modifier": "6.0.2", - "rx": "4.1.0", - "send": "0.16.2", - "serve-index": "1.9.1", - "serve-static": "1.13.2", - "server-destroy": "1.0.1", - "socket.io": "2.1.1", - "ua-parser-js": "0.7.17", - "yargs": "6.4.0" - } - }, - "browser-sync-client": { - "version": "2.26.6", - "resolved": "https://registry.npmjs.org/browser-sync-client/-/browser-sync-client-2.26.6.tgz", - "integrity": "sha512-mGrkZdNzttKdf/16I+y+2dTQxoMCIpKbVIMJ/uP8ZpnKu9f9qa/2CYVtLtbjZG8nsM14EwiCrjuFTGBEnT3Gjw==", - "dev": true, - "requires": { - "etag": "1.8.1", - "fresh": "0.5.2", - "mitt": "^1.1.3", - "rxjs": "^5.5.6" - } - }, - "browser-sync-ui": { - "version": "2.26.4", - "resolved": "https://registry.npmjs.org/browser-sync-ui/-/browser-sync-ui-2.26.4.tgz", - "integrity": "sha512-u20P3EsZoM8Pt+puoi3BU3KlbQAH1lAcV+/O4saF26qokrBqIDotmGonfWwoRbUmdxZkM9MBmA0K39ZTG1h4sA==", - "dev": true, - "requires": { - "async-each-series": "0.1.1", - "connect-history-api-fallback": "^1", - "immutable": "^3", - "server-destroy": "1.0.1", - "socket.io-client": "^2.0.4", - "stream-throttle": "^0.1.3" - } - }, - "bs-recipes": { - "version": "1.3.4", - "resolved": "https://registry.npmjs.org/bs-recipes/-/bs-recipes-1.3.4.tgz", - "integrity": "sha1-DS1NSKcYyMBEdp/cT4lZLci2lYU=", - "dev": true - }, - "bs-snippet-injector": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/bs-snippet-injector/-/bs-snippet-injector-2.0.1.tgz", - "integrity": "sha1-YbU5PxH1JVntEgaTEANDtu2wTdU=", - "dev": true - }, - "bytes": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz", - "integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg==", - "dev": true - }, - "cache-base": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/cache-base/-/cache-base-1.0.1.tgz", - "integrity": "sha512-AKcdTnFSWATd5/GCPRxr2ChwIJ85CeyrEyjRHlKxQ56d4XJMGym0uAiKn0xbLOGOl3+yRpOTi484dVCEc5AUzQ==", - "dev": true, - "requires": { - "collection-visit": "^1.0.0", - "component-emitter": "^1.2.1", - "get-value": "^2.0.6", - "has-value": "^1.0.0", - "isobject": "^3.0.1", - "set-value": "^2.0.0", - "to-object-path": "^0.3.0", - "union-value": "^1.0.0", - "unset-value": "^1.0.0" - } - }, - "callsite": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/callsite/-/callsite-1.0.0.tgz", - "integrity": "sha1-KAOY5dZkvXQDi28JBRU+borxvCA=", - "dev": true - }, - "camelcase": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-3.0.0.tgz", - 
"integrity": "sha1-MvxLn82vhF/N9+c7uXysImHwqwo=", - "dev": true - }, - "chalk": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", - "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", - "dev": true, - "requires": { - "ansi-styles": "^2.2.1", - "escape-string-regexp": "^1.0.2", - "has-ansi": "^2.0.0", - "strip-ansi": "^3.0.0", - "supports-color": "^2.0.0" - } - }, - "chokidar": { - "version": "2.1.6", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-2.1.6.tgz", - "integrity": "sha512-V2jUo67OKkc6ySiRpJrjlpJKl9kDuG+Xb8VgsGzb+aEouhgS1D0weyPU4lEzdAcsCAvrih2J2BqyXqHWvVLw5g==", - "dev": true, - "requires": { - "anymatch": "^2.0.0", - "async-each": "^1.0.1", - "braces": "^2.3.2", - "fsevents": "^1.2.7", - "glob-parent": "^3.1.0", - "inherits": "^2.0.3", - "is-binary-path": "^1.0.0", - "is-glob": "^4.0.0", - "normalize-path": "^3.0.0", - "path-is-absolute": "^1.0.0", - "readdirp": "^2.2.1", - "upath": "^1.1.1" - } - }, - "class-utils": { - "version": "0.3.6", - "resolved": "https://registry.npmjs.org/class-utils/-/class-utils-0.3.6.tgz", - "integrity": "sha512-qOhPa/Fj7s6TY8H8esGu5QNpMMQxz79h+urzrNYN6mn+9BnxlDGf5QZ+XeCDsxSjPqsSR56XOZOJmpeurnLMeg==", - "dev": true, - "requires": { - "arr-union": "^3.1.0", - "define-property": "^0.2.5", - "isobject": "^3.0.0", - "static-extend": "^0.1.1" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - } - } - }, - "cliui": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-3.2.0.tgz", - "integrity": "sha1-EgYBU3qRbSmUD5NNo7SNWFo5IT0=", - "dev": true, - "requires": { - "string-width": "^1.0.1", - "strip-ansi": "^3.0.1", - "wrap-ansi": "^2.0.0" - } - }, - "code-point-at": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/code-point-at/-/code-point-at-1.1.0.tgz", - "integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=", - "dev": true - }, - "collection-visit": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/collection-visit/-/collection-visit-1.0.0.tgz", - "integrity": "sha1-S8A3PBZLwykbTTaMgpzxqApZ3KA=", - "dev": true, - "requires": { - "map-visit": "^1.0.0", - "object-visit": "^1.0.0" - } - }, - "color-convert": { - "version": "1.9.3", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", - "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==", - "dev": true, - "requires": { - "color-name": "1.1.3" - } - }, - "color-name": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", - "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=", - "dev": true - }, - "colors": { - "version": "1.3.3", - "resolved": "https://registry.npmjs.org/colors/-/colors-1.3.3.tgz", - "integrity": "sha512-mmGt/1pZqYRjMxB1axhTo16/snVZ5krrKkcmMeVKxzECMMXoCgnvTPp10QgHfcbQZw8Dq2jMNG6je4JlWU0gWg==", - "dev": true - }, - "commander": { - "version": "2.20.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.0.tgz", - "integrity": "sha512-7j2y+40w61zy6YC2iRNpUe/NwhNyoXrYpHMrSunaMG64nRnaf96zO/KMQR4OyN/UnE5KLyEBnKHd4aG3rskjpQ==", - "dev": true - }, - "component-bind": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/component-bind/-/component-bind-1.0.0.tgz", - "integrity": "sha1-AMYIq33Nk4l8AAllGx06jh5zu9E=", 
- "dev": true - }, - "component-emitter": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/component-emitter/-/component-emitter-1.2.1.tgz", - "integrity": "sha1-E3kY1teCg/ffemt8WmPhQOaUJeY=", - "dev": true - }, - "component-inherit": { - "version": "0.0.3", - "resolved": "https://registry.npmjs.org/component-inherit/-/component-inherit-0.0.3.tgz", - "integrity": "sha1-ZF/ErfWLcrZJ1crmUTVhnbJv8UM=", - "dev": true - }, - "concat-map": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=", - "dev": true - }, - "connect": { - "version": "3.6.6", - "resolved": "https://registry.npmjs.org/connect/-/connect-3.6.6.tgz", - "integrity": "sha1-Ce/2xVr3I24TcTWnJXSFi2eG9SQ=", - "dev": true, - "requires": { - "debug": "2.6.9", - "finalhandler": "1.1.0", - "parseurl": "~1.3.2", - "utils-merge": "1.0.1" - }, - "dependencies": { - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - } - } - }, - "connect-history-api-fallback": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/connect-history-api-fallback/-/connect-history-api-fallback-1.6.0.tgz", - "integrity": "sha512-e54B99q/OUoH64zYYRf3HBP5z24G38h5D3qXu23JGRoigpX5Ss4r9ZnDk3g0Z8uQC2x2lPaJ+UlWBc1ZWBWdLg==", - "dev": true - }, - "cookie": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.3.1.tgz", - "integrity": "sha1-5+Ch+e9DtMi6klxcWpboBtFoc7s=", - "dev": true - }, - "copy-and-watch": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/copy-and-watch/-/copy-and-watch-0.1.2.tgz", - "integrity": "sha512-On7+g3EXT3mqCKuvkq1zqtmI92Q+/W1K3XXZEXCnIFlQXhiyEMvsi+cjUI5vdKu8kOrc/KQ9sB5t5lIVYwikKg==", - "dev": true, - "requires": { - "chokidar": "^1.6.1", - "colors": "^1.1.2", - "glob": "^7.1.1", - "glob-parent": "^3.1.0" - }, - "dependencies": { - "anymatch": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-1.3.2.tgz", - "integrity": "sha512-0XNayC8lTHQ2OI8aljNCN3sSx6hsr/1+rlcDAotXJR7C1oZZHCNsfpbKwMjRA3Uqb5tF1Rae2oloTr4xpq+WjA==", - "dev": true, - "requires": { - "micromatch": "^2.1.5", - "normalize-path": "^2.0.0" - } - }, - "arr-diff": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz", - "integrity": "sha1-jzuCf5Vai9ZpaX5KQlasPOrjVs8=", - "dev": true, - "requires": { - "arr-flatten": "^1.0.1" - } - }, - "array-unique": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.2.1.tgz", - "integrity": "sha1-odl8yvy8JiXMcPrc6zalDFiwGlM=", - "dev": true - }, - "braces": { - "version": "1.8.5", - "resolved": "https://registry.npmjs.org/braces/-/braces-1.8.5.tgz", - "integrity": "sha1-uneWLhLf+WnWt2cR6RS3N4V79qc=", - "dev": true, - "requires": { - "expand-range": "^1.8.1", - "preserve": "^0.2.0", - "repeat-element": "^1.1.2" - } - }, - "chokidar": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz", - "integrity": "sha1-eY5ol3gVHIB2tLNg5e3SjNortGg=", - "dev": true, - "requires": { - "anymatch": "^1.3.0", - "async-each": "^1.0.0", - "fsevents": "^1.0.0", - "glob-parent": "^2.0.0", - "inherits": "^2.0.1", - "is-binary-path": "^1.0.0", - "is-glob": "^2.0.0", - "path-is-absolute": "^1.0.0", - "readdirp": "^2.0.0" - }, 
- "dependencies": { - "glob-parent": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-2.0.0.tgz", - "integrity": "sha1-gTg9ctsFT8zPUzbaqQLxgvbtuyg=", - "dev": true, - "requires": { - "is-glob": "^2.0.0" - } - } - } - }, - "expand-brackets": { - "version": "0.1.5", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-0.1.5.tgz", - "integrity": "sha1-3wcoTjQqgHzXM6xa9yQR5YHRF3s=", - "dev": true, - "requires": { - "is-posix-bracket": "^0.1.0" - } - }, - "extglob": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-0.3.2.tgz", - "integrity": "sha1-Lhj/PS9JqydlzskCPwEdqo2DSaE=", - "dev": true, - "requires": { - "is-extglob": "^1.0.0" - } - }, - "is-extglob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", - "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", - "dev": true - }, - "is-glob": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", - "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", - "dev": true, - "requires": { - "is-extglob": "^1.0.0" - } - }, - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - }, - "micromatch": { - "version": "2.3.11", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-2.3.11.tgz", - "integrity": "sha1-hmd8l9FyCzY0MdBNDRUpO9OMFWU=", - "dev": true, - "requires": { - "arr-diff": "^2.0.0", - "array-unique": "^0.2.1", - "braces": "^1.8.2", - "expand-brackets": "^0.1.4", - "extglob": "^0.3.1", - "filename-regex": "^2.0.0", - "is-extglob": "^1.0.0", - "is-glob": "^2.0.1", - "kind-of": "^3.0.2", - "normalize-path": "^2.0.1", - "object.omit": "^2.0.0", - "parse-glob": "^3.0.4", - "regex-cache": "^0.4.2" - } - }, - "normalize-path": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-2.1.1.tgz", - "integrity": "sha1-GrKLVW4Zg2Oowab35vogE3/mrtk=", - "dev": true, - "requires": { - "remove-trailing-separator": "^1.0.1" - } - } - } - }, - "copy-descriptor": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/copy-descriptor/-/copy-descriptor-0.1.1.tgz", - "integrity": "sha1-Z29us8OZl8LuGsOpJP1hJHSPV40=", - "dev": true - }, - "core-util-is": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", - "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=", - "dev": true - }, - "cross-spawn": { - "version": "6.0.5", - "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.5.tgz", - "integrity": "sha512-eTVLrBSt7fjbDygz805pMnstIs2VTBNkRm0qxZd+M7A5XDdxVRWO5MxGBXZhjY4cqLYLdtrGqRf8mBPmzwSpWQ==", - "dev": true, - "requires": { - "nice-try": "^1.0.4", - "path-key": "^2.0.1", - "semver": "^5.5.0", - "shebang-command": "^1.2.0", - "which": "^1.2.9" - } - }, - "debug": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", - "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - }, - "decamelize": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", - "integrity": "sha1-9lNNFRSCabIDUue+4m9QH5oZEpA=", - "dev": true - }, - "decode-uri-component": { - "version": "0.2.0", - "resolved": 
"https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.0.tgz", - "integrity": "sha1-6zkTMzRYd1y4TNGh+uBiEGu4dUU=", - "dev": true - }, - "define-properties": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.1.3.tgz", - "integrity": "sha512-3MqfYKj2lLzdMSf8ZIZE/V+Zuy+BgD6f164e8K2w7dgnpKArBDerGYpM46IYYcjnkdPNMjPk9A6VFB8+3SKlXQ==", - "dev": true, - "requires": { - "object-keys": "^1.0.12" - } - }, - "define-property": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-2.0.2.tgz", - "integrity": "sha512-jwK2UV4cnPpbcG7+VRARKTZPUWowwXA8bzH5NP6ud0oeAxyYPuGZUAC7hMugpCdz4BeSZl2Dl9k66CHJ/46ZYQ==", - "dev": true, - "requires": { - "is-descriptor": "^1.0.2", - "isobject": "^3.0.1" - }, - "dependencies": { - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - } - } - }, - "depd": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", - "integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak=", - "dev": true - }, - "destroy": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz", - "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA=", - "dev": true - }, - "dev-ip": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/dev-ip/-/dev-ip-1.0.1.tgz", - "integrity": "sha1-p2o+0YVb56ASu4rBbLgPPADcKPA=", - "dev": true - }, - "easy-extender": { - "version": "2.3.4", - "resolved": "https://registry.npmjs.org/easy-extender/-/easy-extender-2.3.4.tgz", - "integrity": "sha512-8cAwm6md1YTiPpOvDULYJL4ZS6WfM5/cTeVVh4JsvyYZAoqlRVUpHL9Gr5Fy7HA6xcSZicUia3DeAgO3Us8E+Q==", - "dev": true, - "requires": { - "lodash": "^4.17.10" - } - }, - "eazy-logger": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/eazy-logger/-/eazy-logger-3.0.2.tgz", - "integrity": "sha1-oyWqXlPROiIliJsqxBE7K5Y29Pw=", - "dev": true, - "requires": { - "tfunk": "^3.0.1" - } - }, - "ee-first": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", - "integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0=", - "dev": true - }, - "encodeurl": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", - "integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k=", - "dev": true - }, - "engine.io": { - "version": "3.2.1", - "resolved": "https://registry.npmjs.org/engine.io/-/engine.io-3.2.1.tgz", - "integrity": "sha512-+VlKzHzMhaU+GsCIg4AoXF1UdDFjHHwMmMKqMJNDNLlUlejz58FCy4LBqB2YVJskHGYl06BatYWKP2TVdVXE5w==", - "dev": true, - "requires": { 
- "accepts": "~1.3.4", - "base64id": "1.0.0", - "cookie": "0.3.1", - "debug": "~3.1.0", - "engine.io-parser": "~2.1.0", - "ws": "~3.3.1" - }, - "dependencies": { - "ws": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz", - "integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==", - "dev": true, - "requires": { - "async-limiter": "~1.0.0", - "safe-buffer": "~5.1.0", - "ultron": "~1.1.0" - } - } - } - }, - "engine.io-client": { - "version": "3.3.2", - "resolved": "https://registry.npmjs.org/engine.io-client/-/engine.io-client-3.3.2.tgz", - "integrity": "sha512-y0CPINnhMvPuwtqXfsGuWE8BB66+B6wTtCofQDRecMQPYX3MYUZXFNKDhdrSe3EVjgOu4V3rxdeqN/Tr91IgbQ==", - "dev": true, - "requires": { - "component-emitter": "1.2.1", - "component-inherit": "0.0.3", - "debug": "~3.1.0", - "engine.io-parser": "~2.1.1", - "has-cors": "1.1.0", - "indexof": "0.0.1", - "parseqs": "0.0.5", - "parseuri": "0.0.5", - "ws": "~6.1.0", - "xmlhttprequest-ssl": "~1.5.4", - "yeast": "0.1.2" - } - }, - "engine.io-parser": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/engine.io-parser/-/engine.io-parser-2.1.3.tgz", - "integrity": "sha512-6HXPre2O4Houl7c4g7Ic/XzPnHBvaEmN90vtRO9uLmwtRqQmTOw0QMevL1TOfL2Cpu1VzsaTmMotQgMdkzGkVA==", - "dev": true, - "requires": { - "after": "0.8.2", - "arraybuffer.slice": "~0.0.7", - "base64-arraybuffer": "0.1.5", - "blob": "0.0.5", - "has-binary2": "~1.0.2" - } - }, - "error-ex": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", - "integrity": "sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==", - "dev": true, - "requires": { - "is-arrayish": "^0.2.1" - } - }, - "es-abstract": { - "version": "1.13.0", - "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.13.0.tgz", - "integrity": "sha512-vDZfg/ykNxQVwup/8E1BZhVzFfBxs9NqMzGcvIJrqg5k2/5Za2bWo40dK2J1pgLngZ7c+Shh8lwYtLGyrwPutg==", - "dev": true, - "requires": { - "es-to-primitive": "^1.2.0", - "function-bind": "^1.1.1", - "has": "^1.0.3", - "is-callable": "^1.1.4", - "is-regex": "^1.0.4", - "object-keys": "^1.0.12" - } - }, - "es-to-primitive": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.2.0.tgz", - "integrity": "sha512-qZryBOJjV//LaxLTV6UC//WewneB3LcXOL9NP++ozKVXsIIIpm/2c13UDiD9Jp2eThsecw9m3jPqDwTyobcdbg==", - "dev": true, - "requires": { - "is-callable": "^1.1.4", - "is-date-object": "^1.0.1", - "is-symbol": "^1.0.2" - } - }, - "escape-html": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", - "integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg=", - "dev": true - }, - "escape-string-regexp": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", - "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=", - "dev": true - }, - "etag": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", - "integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc=", - "dev": true - }, - "eventemitter3": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-1.2.0.tgz", - "integrity": "sha1-HIaZHYFq0eUEdQ5zh0Ik7PO+xQg=", - "dev": true - }, - "expand-brackets": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", - "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", - "dev": 
true, - "requires": { - "debug": "^2.3.3", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "posix-character-classes": "^0.1.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - }, - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "expand-range": { - "version": "1.8.2", - "resolved": "https://registry.npmjs.org/expand-range/-/expand-range-1.8.2.tgz", - "integrity": "sha1-opnv/TNf4nIeuujiV+x5ZE/IUzc=", - "dev": true, - "requires": { - "fill-range": "^2.1.0" - }, - "dependencies": { - "fill-range": { - "version": "2.2.4", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-2.2.4.tgz", - "integrity": "sha512-cnrcCbj01+j2gTG921VZPnHbjmdAf8oQV/iGeV2kZxGSyfYjjTyY79ErsK1WJWMpw6DaApEX72binqJE+/d+5Q==", - "dev": true, - "requires": { - "is-number": "^2.1.0", - "isobject": "^2.0.0", - "randomatic": "^3.0.0", - "repeat-element": "^1.1.2", - "repeat-string": "^1.5.2" - } - }, - "is-number": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-2.1.0.tgz", - "integrity": "sha1-Afy7s5NGOlSPL0ZszhbezknbkI8=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - } - }, - "isarray": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", - "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=", - "dev": true - }, - "isobject": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", - "integrity": "sha1-8GVWEJaj8dou9GJy+BXIQNh+DIk=", - "dev": true, - "requires": { - "isarray": "1.0.0" - } - }, - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "extend-shallow": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-3.0.2.tgz", - "integrity": "sha1-Jqcarwc7OfshJxcnRhMcJwQCjbg=", - "dev": true, - "requires": { - "assign-symbols": "^1.0.0", - "is-extendable": "^1.0.1" - }, - "dependencies": { - "is-extendable": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz", - "integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==", - "dev": true, - "requires": { - "is-plain-object": "^2.0.4" - } - } - } - }, - "extglob": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", - "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", - "dev": true, - "requires": { - "array-unique": "^0.3.2", - "define-property": "^1.0.0", - "expand-brackets": "^2.1.4", - "extend-shallow": "^2.0.1", - 
"fragment-cache": "^0.2.1", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - } - } - }, - "filename-regex": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/filename-regex/-/filename-regex-2.0.1.tgz", - "integrity": "sha1-wcS5vuPglyXdsQa3XB4wH+LxiyY=", - "dev": true - }, - "fill-range": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", - "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-number": "^3.0.0", - "repeat-string": "^1.6.1", - "to-regex-range": "^2.1.0" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "finalhandler": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.0.tgz", - "integrity": "sha1-zgtoVbRYU+eRsvzGgARtiCU91/U=", - "dev": true, - "requires": { - "debug": "2.6.9", - "encodeurl": "~1.0.1", - "escape-html": "~1.0.3", - "on-finished": "~2.3.0", - "parseurl": "~1.3.2", - "statuses": "~1.3.1", - "unpipe": "~1.0.0" - }, - "dependencies": { - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - } - } - }, - "find-up": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-1.1.2.tgz", - "integrity": "sha1-ay6YIrGizgpgq2TWEOzK1TyyTQ8=", - "dev": true, - "requires": { - "path-exists": "^2.0.0", - "pinkie-promise": "^2.0.0" - } - }, - "follow-redirects": { - "version": "1.5.10", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.5.10.tgz", - "integrity": 
"sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==", - "dev": true, - "requires": { - "debug": "=3.1.0" - } - }, - "for-in": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz", - "integrity": "sha1-gQaNKVqBQuwKxybG4iAMMPttXoA=", - "dev": true - }, - "for-own": { - "version": "0.1.5", - "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", - "integrity": "sha1-UmXGgaTylNq78XyVCbZ2OqhFEM4=", - "dev": true, - "requires": { - "for-in": "^1.0.1" - } - }, - "fragment-cache": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/fragment-cache/-/fragment-cache-0.2.1.tgz", - "integrity": "sha1-QpD60n8T6Jvn8zeZxrxaCr//DRk=", - "dev": true, - "requires": { - "map-cache": "^0.2.2" - } - }, - "fresh": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", - "integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac=", - "dev": true - }, - "fs-extra": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-3.0.1.tgz", - "integrity": "sha1-N5TzeMWLNC6n27sjCVEJxLO2IpE=", - "dev": true, - "requires": { - "graceful-fs": "^4.1.2", - "jsonfile": "^3.0.0", - "universalify": "^0.1.0" - } - }, - "fs.realpath": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", - "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", - "dev": true - }, - "fsevents": { - "version": "1.2.9", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-1.2.9.tgz", - "integrity": "sha512-oeyj2H3EjjonWcFjD5NvZNE9Rqe4UW+nQBU2HNeKw0koVLEFIhtyETyAakeAM3de7Z/SW5kcA+fZUait9EApnw==", - "dev": true, - "optional": true, - "requires": { - "nan": "^2.12.1", - "node-pre-gyp": "^0.12.0" - }, - "dependencies": { - "abbrev": { - "version": "1.1.1", - "bundled": true, - "dev": true, - "optional": true - }, - "ansi-regex": { - "version": "2.1.1", - "bundled": true, - "dev": true, - "optional": true - }, - "aproba": { - "version": "1.2.0", - "bundled": true, - "dev": true, - "optional": true - }, - "are-we-there-yet": { - "version": "1.1.5", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "delegates": "^1.0.0", - "readable-stream": "^2.0.6" - } - }, - "balanced-match": { - "version": "1.0.0", - "bundled": true, - "dev": true, - "optional": true - }, - "brace-expansion": { - "version": "1.1.11", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "chownr": { - "version": "1.1.1", - "bundled": true, - "dev": true, - "optional": true - }, - "code-point-at": { - "version": "1.1.0", - "bundled": true, - "dev": true, - "optional": true - }, - "concat-map": { - "version": "0.0.1", - "bundled": true, - "dev": true, - "optional": true - }, - "console-control-strings": { - "version": "1.1.0", - "bundled": true, - "dev": true, - "optional": true - }, - "core-util-is": { - "version": "1.0.2", - "bundled": true, - "dev": true, - "optional": true - }, - "debug": { - "version": "4.1.1", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "ms": "^2.1.1" - } - }, - "deep-extend": { - "version": "0.6.0", - "bundled": true, - "dev": true, - "optional": true - }, - "delegates": { - "version": "1.0.0", - "bundled": true, - "dev": true, - "optional": true - }, - "detect-libc": { - "version": "1.0.3", - "bundled": true, - "dev": true, - "optional": true - }, - "fs-minipass": { - "version": "1.2.5", - "bundled": true, - "dev": true, 
- "optional": true, - "requires": { - "minipass": "^2.2.1" - } - }, - "fs.realpath": { - "version": "1.0.0", - "bundled": true, - "dev": true, - "optional": true - }, - "gauge": { - "version": "2.7.4", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "aproba": "^1.0.3", - "console-control-strings": "^1.0.0", - "has-unicode": "^2.0.0", - "object-assign": "^4.1.0", - "signal-exit": "^3.0.0", - "string-width": "^1.0.1", - "strip-ansi": "^3.0.1", - "wide-align": "^1.1.0" - } - }, - "glob": { - "version": "7.1.3", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "fs.realpath": "^1.0.0", - "inflight": "^1.0.4", - "inherits": "2", - "minimatch": "^3.0.4", - "once": "^1.3.0", - "path-is-absolute": "^1.0.0" - } - }, - "has-unicode": { - "version": "2.0.1", - "bundled": true, - "dev": true, - "optional": true - }, - "iconv-lite": { - "version": "0.4.24", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "safer-buffer": ">= 2.1.2 < 3" - } - }, - "ignore-walk": { - "version": "3.0.1", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "minimatch": "^3.0.4" - } - }, - "inflight": { - "version": "1.0.6", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "once": "^1.3.0", - "wrappy": "1" - } - }, - "inherits": { - "version": "2.0.3", - "bundled": true, - "dev": true, - "optional": true - }, - "ini": { - "version": "1.3.5", - "bundled": true, - "dev": true, - "optional": true - }, - "is-fullwidth-code-point": { - "version": "1.0.0", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "number-is-nan": "^1.0.0" - } - }, - "isarray": { - "version": "1.0.0", - "bundled": true, - "dev": true, - "optional": true - }, - "minimatch": { - "version": "3.0.4", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "brace-expansion": "^1.1.7" - } - }, - "minimist": { - "version": "0.0.8", - "bundled": true, - "dev": true, - "optional": true - }, - "minipass": { - "version": "2.3.5", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "safe-buffer": "^5.1.2", - "yallist": "^3.0.0" - } - }, - "minizlib": { - "version": "1.2.1", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "minipass": "^2.2.1" - } - }, - "mkdirp": { - "version": "0.5.1", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "minimist": "0.0.8" - } - }, - "ms": { - "version": "2.1.1", - "bundled": true, - "dev": true, - "optional": true - }, - "needle": { - "version": "2.3.0", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "debug": "^4.1.0", - "iconv-lite": "^0.4.4", - "sax": "^1.2.4" - } - }, - "node-pre-gyp": { - "version": "0.12.0", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "detect-libc": "^1.0.2", - "mkdirp": "^0.5.1", - "needle": "^2.2.1", - "nopt": "^4.0.1", - "npm-packlist": "^1.1.6", - "npmlog": "^4.0.2", - "rc": "^1.2.7", - "rimraf": "^2.6.1", - "semver": "^5.3.0", - "tar": "^4" - } - }, - "nopt": { - "version": "4.0.1", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "abbrev": "1", - "osenv": "^0.1.4" - } - }, - "npm-bundled": { - "version": "1.0.6", - "bundled": true, - "dev": true, - "optional": true - }, - "npm-packlist": { - "version": "1.4.1", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "ignore-walk": "^3.0.1", - "npm-bundled": "^1.0.1" - } - }, - "npmlog": { - "version": "4.1.2", - "bundled": true, - "dev": true, - 
"optional": true, - "requires": { - "are-we-there-yet": "~1.1.2", - "console-control-strings": "~1.1.0", - "gauge": "~2.7.3", - "set-blocking": "~2.0.0" - } - }, - "number-is-nan": { - "version": "1.0.1", - "bundled": true, - "dev": true, - "optional": true - }, - "object-assign": { - "version": "4.1.1", - "bundled": true, - "dev": true, - "optional": true - }, - "once": { - "version": "1.4.0", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "wrappy": "1" - } - }, - "os-homedir": { - "version": "1.0.2", - "bundled": true, - "dev": true, - "optional": true - }, - "os-tmpdir": { - "version": "1.0.2", - "bundled": true, - "dev": true, - "optional": true - }, - "osenv": { - "version": "0.1.5", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "os-homedir": "^1.0.0", - "os-tmpdir": "^1.0.0" - } - }, - "path-is-absolute": { - "version": "1.0.1", - "bundled": true, - "dev": true, - "optional": true - }, - "process-nextick-args": { - "version": "2.0.0", - "bundled": true, - "dev": true, - "optional": true - }, - "rc": { - "version": "1.2.8", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "deep-extend": "^0.6.0", - "ini": "~1.3.0", - "minimist": "^1.2.0", - "strip-json-comments": "~2.0.1" - }, - "dependencies": { - "minimist": { - "version": "1.2.0", - "bundled": true, - "dev": true, - "optional": true - } - } - }, - "readable-stream": { - "version": "2.3.6", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.3", - "isarray": "~1.0.0", - "process-nextick-args": "~2.0.0", - "safe-buffer": "~5.1.1", - "string_decoder": "~1.1.1", - "util-deprecate": "~1.0.1" - } - }, - "rimraf": { - "version": "2.6.3", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "glob": "^7.1.3" - } - }, - "safe-buffer": { - "version": "5.1.2", - "bundled": true, - "dev": true, - "optional": true - }, - "safer-buffer": { - "version": "2.1.2", - "bundled": true, - "dev": true, - "optional": true - }, - "sax": { - "version": "1.2.4", - "bundled": true, - "dev": true, - "optional": true - }, - "semver": { - "version": "5.7.0", - "bundled": true, - "dev": true, - "optional": true - }, - "set-blocking": { - "version": "2.0.0", - "bundled": true, - "dev": true, - "optional": true - }, - "signal-exit": { - "version": "3.0.2", - "bundled": true, - "dev": true, - "optional": true - }, - "string-width": { - "version": "1.0.2", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "code-point-at": "^1.0.0", - "is-fullwidth-code-point": "^1.0.0", - "strip-ansi": "^3.0.0" - } - }, - "string_decoder": { - "version": "1.1.1", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "safe-buffer": "~5.1.0" - } - }, - "strip-ansi": { - "version": "3.0.1", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "ansi-regex": "^2.0.0" - } - }, - "strip-json-comments": { - "version": "2.0.1", - "bundled": true, - "dev": true, - "optional": true - }, - "tar": { - "version": "4.4.8", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - "chownr": "^1.1.1", - "fs-minipass": "^1.2.5", - "minipass": "^2.3.4", - "minizlib": "^1.1.1", - "mkdirp": "^0.5.0", - "safe-buffer": "^5.1.2", - "yallist": "^3.0.2" - } - }, - "util-deprecate": { - "version": "1.0.2", - "bundled": true, - "dev": true, - "optional": true - }, - "wide-align": { - "version": "1.1.3", - "bundled": true, - "dev": true, - "optional": true, - "requires": { - 
"string-width": "^1.0.2 || 2" - } - }, - "wrappy": { - "version": "1.0.2", - "bundled": true, - "dev": true, - "optional": true - }, - "yallist": { - "version": "3.0.3", - "bundled": true, - "dev": true, - "optional": true - } - } - }, - "function-bind": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", - "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==", - "dev": true - }, - "get-caller-file": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.3.tgz", - "integrity": "sha512-3t6rVToeoZfYSGd8YoLFR2DJkiQrIiUrGcjvFX2mDw3bn6k2OtwHN0TNCLbBO+w8qTvimhDkv+LSscbJY1vE6w==", - "dev": true - }, - "get-value": { - "version": "2.0.6", - "resolved": "https://registry.npmjs.org/get-value/-/get-value-2.0.6.tgz", - "integrity": "sha1-3BXKHGcjh8p2vTesCjlbogQqLCg=", - "dev": true - }, - "glob": { - "version": "7.1.4", - "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.4.tgz", - "integrity": "sha512-hkLPepehmnKk41pUGm3sYxoFs/umurYfYJCerbXEyFIWcAzvpipAgVkBqqT9RBKMGjnq6kMuyYwha6csxbiM1A==", - "dev": true, - "requires": { - "fs.realpath": "^1.0.0", - "inflight": "^1.0.4", - "inherits": "2", - "minimatch": "^3.0.4", - "once": "^1.3.0", - "path-is-absolute": "^1.0.0" - } - }, - "glob-base": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/glob-base/-/glob-base-0.3.0.tgz", - "integrity": "sha1-27Fk9iIbHAscz4Kuoyi0l98Oo8Q=", - "dev": true, - "requires": { - "glob-parent": "^2.0.0", - "is-glob": "^2.0.0" - }, - "dependencies": { - "glob-parent": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-2.0.0.tgz", - "integrity": "sha1-gTg9ctsFT8zPUzbaqQLxgvbtuyg=", - "dev": true, - "requires": { - "is-glob": "^2.0.0" - } - }, - "is-extglob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", - "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", - "dev": true - }, - "is-glob": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", - "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", - "dev": true, - "requires": { - "is-extglob": "^1.0.0" - } - } - } - }, - "glob-parent": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", - "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", - "dev": true, - "requires": { - "is-glob": "^3.1.0", - "path-dirname": "^1.0.0" - }, - "dependencies": { - "is-glob": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", - "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", - "dev": true, - "requires": { - "is-extglob": "^2.1.0" - } - } - } - }, - "graceful-fs": { - "version": "4.1.15", - "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.15.tgz", - "integrity": "sha512-6uHUhOPEBgQ24HM+r6b/QwWfZq+yiFcipKFrOFiBEnWdy5sdzYoi+pJeQaPI5qOLRFqWmAXUPQNsielzdLoecA==", - "dev": true - }, - "has": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz", - "integrity": "sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw==", - "dev": true, - "requires": { - "function-bind": "^1.1.1" - } - }, - "has-ansi": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/has-ansi/-/has-ansi-2.0.0.tgz", - "integrity": "sha1-NPUEnOHs3ysGSa8+8k5F7TVBbZE=", - "dev": true, - "requires": { - "ansi-regex": "^2.0.0" - } - }, 
- "has-binary2": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has-binary2/-/has-binary2-1.0.3.tgz", - "integrity": "sha512-G1LWKhDSvhGeAQ8mPVQlqNcOB2sJdwATtZKl2pDKKHfpf/rYj24lkinxf69blJbnsvtqqNU+L3SL50vzZhXOnw==", - "dev": true, - "requires": { - "isarray": "2.0.1" - } - }, - "has-cors": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/has-cors/-/has-cors-1.1.0.tgz", - "integrity": "sha1-XkdHk/fqmEPRu5nCPu9J/xJv/zk=", - "dev": true - }, - "has-flag": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", - "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", - "dev": true - }, - "has-symbols": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.0.tgz", - "integrity": "sha1-uhqPGvKg/DllD1yFA2dwQSIGO0Q=", - "dev": true - }, - "has-value": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/has-value/-/has-value-1.0.0.tgz", - "integrity": "sha1-GLKB2lhbHFxR3vJMkw7SmgvmsXc=", - "dev": true, - "requires": { - "get-value": "^2.0.6", - "has-values": "^1.0.0", - "isobject": "^3.0.0" - } - }, - "has-values": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/has-values/-/has-values-1.0.0.tgz", - "integrity": "sha1-lbC2P+whRmGab+V/51Yo1aOe/k8=", - "dev": true, - "requires": { - "is-number": "^3.0.0", - "kind-of": "^4.0.0" - }, - "dependencies": { - "kind-of": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-4.0.0.tgz", - "integrity": "sha1-IIE989cSkosgc3hpGkUGb65y3Vc=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "hosted-git-info": { - "version": "2.8.9", - "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-2.8.9.tgz", - "integrity": "sha512-mxIDAb9Lsm6DoOJ7xH+5+X4y1LU/4Hi50L9C5sIswK3JzULS4bwk1FvjdBgvYR4bzT4tuUQiC15FE2f5HbLvYw==", - "dev": true - }, - "http-errors": { - "version": "1.7.2", - "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz", - "integrity": "sha512-uUQBt3H/cSIVfch6i1EuPNy/YsRSOUBXTVfZ+yR7Zjez3qjBz6i9+i4zjNaoqcoFVI4lQJ5plg63TvGfRSDCRg==", - "dev": true, - "requires": { - "depd": "~1.1.2", - "inherits": "2.0.3", - "setprototypeof": "1.1.1", - "statuses": ">= 1.5.0 < 2", - "toidentifier": "1.0.0" - }, - "dependencies": { - "statuses": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", - "integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=", - "dev": true - } - } - }, - "http-proxy": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/http-proxy/-/http-proxy-1.15.2.tgz", - "integrity": "sha1-ZC/cr/5S00SNK9o7AHnpQJBk2jE=", - "dev": true, - "requires": { - "eventemitter3": "1.x.x", - "requires-port": "1.x.x" - } - }, - "iconv-lite": { - "version": "0.4.24", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", - "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", - "dev": true, - "requires": { - "safer-buffer": ">= 2.1.2 < 3" - } - }, - "immutable": { - "version": "3.8.2", - "resolved": "https://registry.npmjs.org/immutable/-/immutable-3.8.2.tgz", - "integrity": "sha1-wkOZUUVbs5kT2vKBN28VMOEErfM=", - "dev": true - }, - "indexof": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/indexof/-/indexof-0.0.1.tgz", - "integrity": "sha1-gtwzbSMrkGIXnQWrMpOmYFn9Q10=", - "dev": true - }, - "inflight": { - "version": "1.0.6", - "resolved": 
"https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", - "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", - "dev": true, - "requires": { - "once": "^1.3.0", - "wrappy": "1" - } - }, - "inherits": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", - "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=", - "dev": true - }, - "invert-kv": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/invert-kv/-/invert-kv-1.0.0.tgz", - "integrity": "sha1-EEqOSqym09jNFXqO+L+rLXo//bY=", - "dev": true - }, - "is-accessor-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", - "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-arrayish": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", - "integrity": "sha1-d8mYQFJ6qOyxqLppe4BkWnqSap0=", - "dev": true - }, - "is-binary-path": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-1.0.1.tgz", - "integrity": "sha1-dfFmQrSA8YenEcgUFh/TpKdlWJg=", - "dev": true, - "requires": { - "binary-extensions": "^1.0.0" - } - }, - "is-buffer": { - "version": "1.1.6", - "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", - "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==", - "dev": true - }, - "is-callable": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.1.4.tgz", - "integrity": "sha512-r5p9sxJjYnArLjObpjA4xu5EKI3CuKHkJXMhT7kwbpUyIFD1n5PMAsoPvWnvtZiNz7LjkYDRZhd7FlI0eMijEA==", - "dev": true - }, - "is-data-descriptor": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", - "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-date-object": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.1.tgz", - "integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY=", - "dev": true - }, - "is-descriptor": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", - "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^0.1.6", - "is-data-descriptor": "^0.1.4", - "kind-of": "^5.0.0" - }, - "dependencies": { - "kind-of": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", - "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", - "dev": true - } - } - }, - "is-dotfile": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/is-dotfile/-/is-dotfile-1.0.3.tgz", - "integrity": "sha1-pqLzL/0t+wT1yiXs0Pa4PPeYoeE=", - "dev": true - 
}, - "is-equal-shallow": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/is-equal-shallow/-/is-equal-shallow-0.1.3.tgz", - "integrity": "sha1-IjgJj8Ih3gvPpdnqxMRdY4qhxTQ=", - "dev": true, - "requires": { - "is-primitive": "^2.0.0" - } - }, - "is-extendable": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", - "integrity": "sha1-YrEQ4omkcUGOPsNqYX1HLjAd/Ik=", - "dev": true - }, - "is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", - "dev": true - }, - "is-fullwidth-code-point": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-1.0.0.tgz", - "integrity": "sha1-754xOG8DGn8NZDr4L95QxFfvAMs=", - "dev": true, - "requires": { - "number-is-nan": "^1.0.0" - } - }, - "is-glob": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.1.tgz", - "integrity": "sha512-5G0tKtBTFImOqDnLB2hG6Bp2qcKEFduo4tZu9MT/H6NQv/ghhy30o55ufafxJ/LdH79LLs2Kfrn85TLKyA7BUg==", - "dev": true, - "requires": { - "is-extglob": "^2.1.1" - } - }, - "is-number": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", - "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "is-number-like": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/is-number-like/-/is-number-like-1.0.8.tgz", - "integrity": "sha512-6rZi3ezCyFcn5L71ywzz2bS5b2Igl1En3eTlZlvKjpz1n3IZLAYMbKYAIQgFmEu0GENg92ziU/faEOA/aixjbA==", - "dev": true, - "requires": { - "lodash.isfinite": "^3.3.2" - } - }, - "is-plain-object": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", - "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", - "dev": true, - "requires": { - "isobject": "^3.0.1" - } - }, - "is-posix-bracket": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/is-posix-bracket/-/is-posix-bracket-0.1.1.tgz", - "integrity": "sha1-MzTceXdDaOkvAW5vvAqI9c1ua8Q=", - "dev": true - }, - "is-primitive": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/is-primitive/-/is-primitive-2.0.0.tgz", - "integrity": "sha1-IHurkWOEmcB7Kt8kCkGochADRXU=", - "dev": true - }, - "is-regex": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz", - "integrity": "sha1-VRdIm1RwkbCTDglWVM7SXul+lJE=", - "dev": true, - "requires": { - "has": "^1.0.1" - } - }, - "is-symbol": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.0.2.tgz", - "integrity": "sha512-HS8bZ9ox60yCJLH9snBpIwv9pYUAkcuLhSA1oero1UB5y9aiQpRA8y2ex945AOtCZL1lJDeIk3G5LthswI46Lw==", - "dev": true, - "requires": { - "has-symbols": "^1.0.0" - } - }, - "is-utf8": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/is-utf8/-/is-utf8-0.2.1.tgz", - "integrity": "sha1-Sw2hRCEE0bM2NA6AeX6GXPOffXI=", - "dev": true - }, - "is-windows": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-windows/-/is-windows-1.0.2.tgz", - "integrity": 
"sha512-eXK1UInq2bPmjyX6e3VHIzMLobc4J94i4AWn+Hpq3OU5KkrRC96OAcR3PRJ/pGu6m8TRnBHP9dkXQVsT/COVIA==", - "dev": true - }, - "is-wsl": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-1.1.0.tgz", - "integrity": "sha1-HxbkqiKwTRM2tmGIpmrzxgDDpm0=", - "dev": true - }, - "isarray": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-2.0.1.tgz", - "integrity": "sha1-o32U7ZzaLVmGXJ92/llu4fM4dB4=", - "dev": true - }, - "isexe": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", - "integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=", - "dev": true - }, - "isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", - "dev": true - }, - "json-parse-better-errors": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/json-parse-better-errors/-/json-parse-better-errors-1.0.2.tgz", - "integrity": "sha512-mrqyZKfX5EhL7hvqcV6WG1yYjnjeuYDzDhhcAAUrq8Po85NBQBJP+ZDUT75qZQ98IkUoBqdkExkukOU7Ts2wrw==", - "dev": true - }, - "jsonfile": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-3.0.1.tgz", - "integrity": "sha1-pezG9l9T9mLEQVx2daAzHQmS7GY=", - "dev": true, - "requires": { - "graceful-fs": "^4.1.6" - } - }, - "jsonify": { - "version": "0.0.0", - "resolved": "https://registry.npmjs.org/jsonify/-/jsonify-0.0.0.tgz", - "integrity": "sha1-LHS27kHZPKUbe1qu6PUDYx0lKnM=", - "dev": true - }, - "kind-of": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", - "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", - "dev": true - }, - "lcid": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/lcid/-/lcid-1.0.0.tgz", - "integrity": "sha1-MIrMr6C8SDo4Z7S28rlQYlHRuDU=", - "dev": true, - "requires": { - "invert-kv": "^1.0.0" - } - }, - "limiter": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/limiter/-/limiter-1.1.4.tgz", - "integrity": "sha512-XCpr5bElgDI65vVgstP8TWjv6/QKWm9GU5UG0Pr5sLQ3QLo8NVKsioe+Jed5/3vFOe3IQuqE7DKwTvKQkjTHvg==", - "dev": true - }, - "load-json-file": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-1.1.0.tgz", - "integrity": "sha1-lWkFcI1YtLq0wiYbBPWfMcmTdMA=", - "dev": true, - "requires": { - "graceful-fs": "^4.1.2", - "parse-json": "^2.2.0", - "pify": "^2.0.0", - "pinkie-promise": "^2.0.0", - "strip-bom": "^2.0.0" - } - }, - "localtunnel": { - "version": "1.9.2", - "resolved": "https://registry.npmjs.org/localtunnel/-/localtunnel-1.9.2.tgz", - "integrity": "sha512-NEKF7bDJE9U3xzJu3kbayF0WTvng6Pww7tzqNb/XtEARYwqw7CKEX7BvOMg98FtE9es2CRizl61gkV3hS8dqYg==", - "dev": true, - "requires": { - "axios": "0.19.0", - "debug": "4.1.1", - "openurl": "1.1.1", - "yargs": "6.6.0" - }, - "dependencies": { - "debug": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.1.1.tgz", - "integrity": "sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw==", - "dev": true, - "requires": { - "ms": "^2.1.1" - } - }, - "ms": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", - "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", - "dev": true - }, - "yargs": { - "version": "6.6.0", - "resolved": 
"https://registry.npmjs.org/yargs/-/yargs-6.6.0.tgz", - "integrity": "sha1-eC7CHvQDNF+DCoCMo9UTr1YGUgg=", - "dev": true, - "requires": { - "camelcase": "^3.0.0", - "cliui": "^3.2.0", - "decamelize": "^1.1.1", - "get-caller-file": "^1.0.1", - "os-locale": "^1.4.0", - "read-pkg-up": "^1.0.1", - "require-directory": "^2.1.1", - "require-main-filename": "^1.0.1", - "set-blocking": "^2.0.0", - "string-width": "^1.0.2", - "which-module": "^1.0.0", - "y18n": "^3.2.1", - "yargs-parser": "^4.2.0" - } - } - } - }, - "lodash": { - "version": "4.17.21", - "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", - "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", - "dev": true - }, - "lodash.isfinite": { - "version": "3.3.2", - "resolved": "https://registry.npmjs.org/lodash.isfinite/-/lodash.isfinite-3.3.2.tgz", - "integrity": "sha1-+4m2WpqAKBgz8LdHizpRBPiY67M=", - "dev": true - }, - "map-cache": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/map-cache/-/map-cache-0.2.2.tgz", - "integrity": "sha1-wyq9C9ZSXZsFFkW7TyasXcmKDb8=", - "dev": true - }, - "map-visit": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/map-visit/-/map-visit-1.0.0.tgz", - "integrity": "sha1-7Nyo8TFE5mDxtb1B8S80edmN+48=", - "dev": true, - "requires": { - "object-visit": "^1.0.0" - } - }, - "math-random": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/math-random/-/math-random-1.0.4.tgz", - "integrity": "sha512-rUxjysqif/BZQH2yhd5Aaq7vXMSx9NdEsQcyA07uEzIvxgI7zIr33gGsh+RU0/XjmQpCW7RsVof1vlkvQVCK5A==", - "dev": true - }, - "memorystream": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/memorystream/-/memorystream-0.3.1.tgz", - "integrity": "sha1-htcJCzDORV1j+64S3aUaR93K+bI=", - "dev": true - }, - "micromatch": { - "version": "3.1.10", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", - "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "braces": "^2.3.1", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "extglob": "^2.0.4", - "fragment-cache": "^0.2.1", - "kind-of": "^6.0.2", - "nanomatch": "^1.2.9", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.2" - } - }, - "mime": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/mime/-/mime-1.4.1.tgz", - "integrity": "sha512-KI1+qOZu5DcW6wayYHSzR/tXKCDC5Om4s1z2QJjDULzLcmf3DvzS7oluY4HCTrc+9FiKmWUgeNLg7W3uIQvxtQ==", - "dev": true - }, - "mime-db": { - "version": "1.40.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.40.0.tgz", - "integrity": "sha512-jYdeOMPy9vnxEqFRRo6ZvTZ8d9oPb+k18PKoYNYUe2stVEBPPwsln/qWzdbmaIvnhZ9v2P+CuecK+fpUfsV2mA==", - "dev": true - }, - "mime-types": { - "version": "2.1.24", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.24.tgz", - "integrity": "sha512-WaFHS3MCl5fapm3oLxU4eYDw77IQM2ACcxQ9RIxfaC3ooc6PFuBMGZZsYpvoXS5D5QTWPieo1jjLdAm3TBP3cQ==", - "dev": true, - "requires": { - "mime-db": "1.40.0" - } - }, - "minimatch": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", - "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", - "dev": true, - "requires": { - "brace-expansion": "^1.1.7" - } - }, - "mitt": { - "version": "1.1.3", - 
"resolved": "https://registry.npmjs.org/mitt/-/mitt-1.1.3.tgz", - "integrity": "sha512-mUDCnVNsAi+eD6qA0HkRkwYczbLHJ49z17BGe2PYRhZL4wpZUFZGJHU7/5tmvohoma+Hdn0Vh/oJTiPEmgSruA==", - "dev": true - }, - "mixin-deep": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/mixin-deep/-/mixin-deep-1.3.2.tgz", - "integrity": "sha512-WRoDn//mXBiJ1H40rqa3vH0toePwSsGb45iInWlTySa+Uu4k3tYUSxa2v1KqAiLtvlrSzaExqS1gtk96A9zvEA==", - "dev": true, - "requires": { - "for-in": "^1.0.2", - "is-extendable": "^1.0.1" - }, - "dependencies": { - "is-extendable": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz", - "integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==", - "dev": true, - "requires": { - "is-plain-object": "^2.0.4" - } - } - } - }, - "ms": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", - "dev": true - }, - "nan": { - "version": "2.14.0", - "resolved": "https://registry.npmjs.org/nan/-/nan-2.14.0.tgz", - "integrity": "sha512-INOFj37C7k3AfaNTtX8RhsTw7qRy7eLET14cROi9+5HAVbbHuIWUHEauBv5qT4Av2tWasiTY1Jw6puUNqRJXQg==", - "dev": true, - "optional": true - }, - "nanomatch": { - "version": "1.2.13", - "resolved": "https://registry.npmjs.org/nanomatch/-/nanomatch-1.2.13.tgz", - "integrity": "sha512-fpoe2T0RbHwBTBUOftAfBPaDEi06ufaUai0mE6Yn1kacc3SnTErfb/h+X94VXzI64rKFHYImXSvdwGGCmwOqCA==", - "dev": true, - "requires": { - "arr-diff": "^4.0.0", - "array-unique": "^0.3.2", - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "fragment-cache": "^0.2.1", - "is-windows": "^1.0.2", - "kind-of": "^6.0.2", - "object.pick": "^1.3.0", - "regex-not": "^1.0.0", - "snapdragon": "^0.8.1", - "to-regex": "^3.0.1" - } - }, - "negotiator": { - "version": "0.6.2", - "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz", - "integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw==", - "dev": true - }, - "nice-try": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/nice-try/-/nice-try-1.0.5.tgz", - "integrity": "sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ==", - "dev": true - }, - "normalize-package-data": { - "version": "2.5.0", - "resolved": "https://registry.npmjs.org/normalize-package-data/-/normalize-package-data-2.5.0.tgz", - "integrity": "sha512-/5CMN3T0R4XTj4DcGaexo+roZSdSFW/0AOOTROrjxzCG1wrWXEsGbRKevjlIL+ZDE4sZlJr5ED4YW0yqmkK+eA==", - "dev": true, - "requires": { - "hosted-git-info": "^2.1.4", - "resolve": "^1.10.0", - "semver": "2 || 3 || 4 || 5", - "validate-npm-package-license": "^3.0.1" - } - }, - "normalize-path": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", - "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", - "dev": true - }, - "npm-run-all": { - "version": "4.1.5", - "resolved": "https://registry.npmjs.org/npm-run-all/-/npm-run-all-4.1.5.tgz", - "integrity": "sha512-Oo82gJDAVcaMdi3nuoKFavkIHBRVqQ1qvMb+9LHk/cF4P6B2m8aP04hGf7oL6wZ9BuGwX1onlLhpuoofSyoQDQ==", - "dev": true, - "requires": { - "ansi-styles": "^3.2.1", - "chalk": "^2.4.1", - "cross-spawn": "^6.0.5", - "memorystream": "^0.3.1", - "minimatch": "^3.0.4", - "pidtree": "^0.3.0", - "read-pkg": "^3.0.0", - "shell-quote": "^1.6.1", - "string.prototype.padend": "^3.0.0" - }, - 
"dependencies": { - "ansi-styles": { - "version": "3.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", - "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", - "dev": true, - "requires": { - "color-convert": "^1.9.0" - } - }, - "chalk": { - "version": "2.4.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz", - "integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==", - "dev": true, - "requires": { - "ansi-styles": "^3.2.1", - "escape-string-regexp": "^1.0.5", - "supports-color": "^5.3.0" - } - }, - "load-json-file": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", - "integrity": "sha1-L19Fq5HjMhYjT9U62rZo607AmTs=", - "dev": true, - "requires": { - "graceful-fs": "^4.1.2", - "parse-json": "^4.0.0", - "pify": "^3.0.0", - "strip-bom": "^3.0.0" - } - }, - "parse-json": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-4.0.0.tgz", - "integrity": "sha1-vjX1Qlvh9/bHRxhPmKeIy5lHfuA=", - "dev": true, - "requires": { - "error-ex": "^1.3.1", - "json-parse-better-errors": "^1.0.1" - } - }, - "path-type": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/path-type/-/path-type-3.0.0.tgz", - "integrity": "sha512-T2ZUsdZFHgA3u4e5PfPbjd7HDDpxPnQb5jN0SrDsjNSuVXHJqtwTnWqG0B1jZrgmJ/7lj1EmVIByWt1gxGkWvg==", - "dev": true, - "requires": { - "pify": "^3.0.0" - } - }, - "pify": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-3.0.0.tgz", - "integrity": "sha1-5aSs0sEB/fPZpNB/DbxNtJ3SgXY=", - "dev": true - }, - "read-pkg": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-3.0.0.tgz", - "integrity": "sha1-nLxoaXj+5l0WwA4rGcI3/Pbjg4k=", - "dev": true, - "requires": { - "load-json-file": "^4.0.0", - "normalize-package-data": "^2.3.2", - "path-type": "^3.0.0" - } - }, - "strip-bom": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", - "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", - "dev": true - }, - "supports-color": { - "version": "5.5.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", - "integrity": "sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==", - "dev": true, - "requires": { - "has-flag": "^3.0.0" - } - } - } - }, - "number-is-nan": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz", - "integrity": "sha1-CXtgK1NCKlIsGvuHkDGDNpQaAR0=", - "dev": true - }, - "object-component": { - "version": "0.0.3", - "resolved": "https://registry.npmjs.org/object-component/-/object-component-0.0.3.tgz", - "integrity": "sha1-8MaapQ78lbhmwYb0AKM3acsvEpE=", - "dev": true - }, - "object-copy": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/object-copy/-/object-copy-0.1.0.tgz", - "integrity": "sha1-fn2Fi3gb18mRpBupde04EnVOmYw=", - "dev": true, - "requires": { - "copy-descriptor": "^0.1.0", - "define-property": "^0.2.5", - "kind-of": "^3.0.3" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "kind-of": { - "version": "3.2.2", - "resolved": 
"https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "object-keys": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", - "integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==", - "dev": true - }, - "object-path": { - "version": "0.9.2", - "resolved": "https://registry.npmjs.org/object-path/-/object-path-0.9.2.tgz", - "integrity": "sha1-D9mnT8X60a45aLWGvaXGMr1sBaU=", - "dev": true - }, - "object-visit": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/object-visit/-/object-visit-1.0.1.tgz", - "integrity": "sha1-95xEk68MU3e1n+OdOV5BBC3QRbs=", - "dev": true, - "requires": { - "isobject": "^3.0.0" - } - }, - "object.omit": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/object.omit/-/object.omit-2.0.1.tgz", - "integrity": "sha1-Gpx0SCnznbuFjHbKNXmuKlTr0fo=", - "dev": true, - "requires": { - "for-own": "^0.1.4", - "is-extendable": "^0.1.1" - } - }, - "object.pick": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/object.pick/-/object.pick-1.3.0.tgz", - "integrity": "sha1-h6EKxMFpS9Lhy/U1kaZhQftd10c=", - "dev": true, - "requires": { - "isobject": "^3.0.1" - } - }, - "on-finished": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz", - "integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=", - "dev": true, - "requires": { - "ee-first": "1.1.1" - } - }, - "once": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", - "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", - "dev": true, - "requires": { - "wrappy": "1" - } - }, - "openurl": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/openurl/-/openurl-1.1.1.tgz", - "integrity": "sha1-OHW0sO96UsFW8NtB1GCduw+Us4c=", - "dev": true - }, - "opn": { - "version": "5.3.0", - "resolved": "https://registry.npmjs.org/opn/-/opn-5.3.0.tgz", - "integrity": "sha512-bYJHo/LOmoTd+pfiYhfZDnf9zekVJrY+cnS2a5F2x+w5ppvTqObojTP7WiFG+kVZs9Inw+qQ/lw7TroWwhdd2g==", - "dev": true, - "requires": { - "is-wsl": "^1.1.0" - } - }, - "os-locale": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-1.4.0.tgz", - "integrity": "sha1-IPnxeuKe00XoveWDsT0gCYA8FNk=", - "dev": true, - "requires": { - "lcid": "^1.0.0" - } - }, - "parse-glob": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/parse-glob/-/parse-glob-3.0.4.tgz", - "integrity": "sha1-ssN2z7EfNVE7rdFz7wu246OIORw=", - "dev": true, - "requires": { - "glob-base": "^0.3.0", - "is-dotfile": "^1.0.0", - "is-extglob": "^1.0.0", - "is-glob": "^2.0.0" - }, - "dependencies": { - "is-extglob": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-1.0.0.tgz", - "integrity": "sha1-rEaBd8SUNAWgkvyPKXYMb/xiBsA=", - "dev": true - }, - "is-glob": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-2.0.1.tgz", - "integrity": "sha1-0Jb5JqPe1WAPP9/ZEZjLCIjC2GM=", - "dev": true, - "requires": { - "is-extglob": "^1.0.0" - } - } - } - }, - "parse-json": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-2.2.0.tgz", - "integrity": "sha1-9ID0BDTvgHQfhGkJn43qGPVaTck=", - "dev": true, - "requires": { - "error-ex": "^1.2.0" - } - }, - "parseqs": { - "version": "0.0.5", - "resolved": 
"https://registry.npmjs.org/parseqs/-/parseqs-0.0.5.tgz", - "integrity": "sha1-1SCKNzjkZ2bikbouoXNoSSGouJ0=", - "dev": true, - "requires": { - "better-assert": "~1.0.0" - } - }, - "parseuri": { - "version": "0.0.5", - "resolved": "https://registry.npmjs.org/parseuri/-/parseuri-0.0.5.tgz", - "integrity": "sha1-gCBKUNTbt3m/3G6+J3jZDkvOMgo=", - "dev": true, - "requires": { - "better-assert": "~1.0.0" - } - }, - "parseurl": { - "version": "1.3.3", - "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", - "integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==", - "dev": true - }, - "pascalcase": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/pascalcase/-/pascalcase-0.1.1.tgz", - "integrity": "sha1-s2PlXoAGym/iF4TS2yK9FdeRfxQ=", - "dev": true - }, - "path-dirname": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/path-dirname/-/path-dirname-1.0.2.tgz", - "integrity": "sha1-zDPSTVJeCZpTiMAzbG4yuRYGCeA=", - "dev": true - }, - "path-exists": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-2.1.0.tgz", - "integrity": "sha1-D+tsZPD8UY2adU3V77YscCJ2H0s=", - "dev": true, - "requires": { - "pinkie-promise": "^2.0.0" - } - }, - "path-is-absolute": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", - "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=", - "dev": true - }, - "path-key": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/path-key/-/path-key-2.0.1.tgz", - "integrity": "sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A=", - "dev": true - }, - "path-parse": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.6.tgz", - "integrity": "sha512-GSmOT2EbHrINBf9SR7CDELwlJ8AENk3Qn7OikK4nFYAu3Ote2+JYNVvkpAEQm3/TLNEJFD/xZJjzyxg3KBWOzw==", - "dev": true - }, - "path-type": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/path-type/-/path-type-1.1.0.tgz", - "integrity": "sha1-WcRPfuSR2nBNpBXaWkBwuk+P5EE=", - "dev": true, - "requires": { - "graceful-fs": "^4.1.2", - "pify": "^2.0.0", - "pinkie-promise": "^2.0.0" - } - }, - "pidtree": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/pidtree/-/pidtree-0.3.0.tgz", - "integrity": "sha512-9CT4NFlDcosssyg8KVFltgokyKZIFjoBxw8CTGy+5F38Y1eQWrt8tRayiUOXE+zVKQnYu5BR8JjCtvK3BcnBhg==", - "dev": true - }, - "pify": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", - "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", - "dev": true - }, - "pinkie": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/pinkie/-/pinkie-2.0.4.tgz", - "integrity": "sha1-clVrgM+g1IqXToDnckjoDtT3+HA=", - "dev": true - }, - "pinkie-promise": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/pinkie-promise/-/pinkie-promise-2.0.1.tgz", - "integrity": "sha1-ITXW36ejWMBprJsXh3YogihFD/o=", - "dev": true, - "requires": { - "pinkie": "^2.0.0" - } - }, - "portscanner": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/portscanner/-/portscanner-2.1.1.tgz", - "integrity": "sha1-6rtAnk3iSVD1oqUW01rnaTQ/u5Y=", - "dev": true, - "requires": { - "async": "1.5.2", - "is-number-like": "^1.0.3" - } - }, - "posix-character-classes": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/posix-character-classes/-/posix-character-classes-0.1.1.tgz", - "integrity": "sha1-AerA/jta9xoqbAL+q7jB/vfgDqs=", - "dev": true - }, - "preserve": { - 
"version": "0.2.0", - "resolved": "https://registry.npmjs.org/preserve/-/preserve-0.2.0.tgz", - "integrity": "sha1-gV7R9uvGWSb4ZbMQwHE7yzMVzks=", - "dev": true - }, - "process-nextick-args": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", - "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", - "dev": true - }, - "qs": { - "version": "6.2.3", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.2.3.tgz", - "integrity": "sha1-HPyyXBCpsrSDBT/zn138kjOQjP4=", - "dev": true - }, - "randomatic": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/randomatic/-/randomatic-3.1.1.tgz", - "integrity": "sha512-TuDE5KxZ0J461RVjrJZCJc+J+zCkTb1MbH9AQUq68sMhOMcy9jLcb3BrZKgp9q9Ncltdg4QVqWrH02W2EFFVYw==", - "dev": true, - "requires": { - "is-number": "^4.0.0", - "kind-of": "^6.0.0", - "math-random": "^1.0.1" - }, - "dependencies": { - "is-number": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-4.0.0.tgz", - "integrity": "sha512-rSklcAIlf1OmFdyAqbnWTLVelsQ58uvZ66S/ZyawjWqIviTWCjg2PzVGw8WUA+nNuPTqb4wgA+NszrJ+08LlgQ==", - "dev": true - } - } - }, - "range-parser": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", - "integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==", - "dev": true - }, - "raw-body": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.4.0.tgz", - "integrity": "sha512-4Oz8DUIwdvoa5qMJelxipzi/iJIi40O5cGV1wNYp5hvZP8ZN0T+jiNkL0QepXs+EsQ9XJ8ipEDoiH70ySUJP3Q==", - "dev": true, - "requires": { - "bytes": "3.1.0", - "http-errors": "1.7.2", - "iconv-lite": "0.4.24", - "unpipe": "1.0.0" - } - }, - "read-pkg": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-1.1.0.tgz", - "integrity": "sha1-9f+qXs0pyzHAR0vKfXVra7KePyg=", - "dev": true, - "requires": { - "load-json-file": "^1.0.0", - "normalize-package-data": "^2.3.2", - "path-type": "^1.0.0" - } - }, - "read-pkg-up": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-1.0.1.tgz", - "integrity": "sha1-nWPBMnbAZZGNV/ACpX9AobZD+wI=", - "dev": true, - "requires": { - "find-up": "^1.0.0", - "read-pkg": "^1.0.0" - } - }, - "readable-stream": { - "version": "2.3.6", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", - "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", - "dev": true, - "requires": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.3", - "isarray": "~1.0.0", - "process-nextick-args": "~2.0.0", - "safe-buffer": "~5.1.1", - "string_decoder": "~1.1.1", - "util-deprecate": "~1.0.1" - }, - "dependencies": { - "isarray": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", - "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=", - "dev": true - } - } - }, - "readdirp": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-2.2.1.tgz", - "integrity": "sha512-1JU/8q+VgFZyxwrJ+SVIOsh+KywWGpds3NTqikiKpDMZWScmAYyKIgqkO+ARvNWJfXeXR1zxz7aHF4u4CyH6vQ==", - "dev": true, - "requires": { - "graceful-fs": "^4.1.11", - "micromatch": "^3.1.10", - "readable-stream": "^2.0.2" - } - }, - "regex-cache": { - "version": "0.4.4", - "resolved": 
"https://registry.npmjs.org/regex-cache/-/regex-cache-0.4.4.tgz", - "integrity": "sha512-nVIZwtCjkC9YgvWkpM55B5rBhBYRZhAaJbgcFYXXsHnbZ9UZI9nnVWYZpBlCqv9ho2eZryPnWrZGsOdPwVWXWQ==", - "dev": true, - "requires": { - "is-equal-shallow": "^0.1.3" - } - }, - "regex-not": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz", - "integrity": "sha512-J6SDjUgDxQj5NusnOtdFxDwN/+HWykR8GELwctJ7mdqhcyy1xEc4SRFHUXvxTp661YaVKAjfRLZ9cCqS6tn32A==", - "dev": true, - "requires": { - "extend-shallow": "^3.0.2", - "safe-regex": "^1.1.0" - } - }, - "remove-trailing-separator": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/remove-trailing-separator/-/remove-trailing-separator-1.1.0.tgz", - "integrity": "sha1-wkvOKig62tW8P1jg1IJJuSN52O8=", - "dev": true - }, - "repeat-element": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/repeat-element/-/repeat-element-1.1.3.tgz", - "integrity": "sha512-ahGq0ZnV5m5XtZLMb+vP76kcAM5nkLqk0lpqAuojSKGgQtn4eRi4ZZGm2olo2zKFH+sMsWaqOCW1dqAnOru72g==", - "dev": true - }, - "repeat-string": { - "version": "1.6.1", - "resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz", - "integrity": "sha1-jcrkcOHIirwtYA//Sndihtp15jc=", - "dev": true - }, - "require-directory": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", - "integrity": "sha1-jGStX9MNqxyXbiNE/+f3kqam30I=", - "dev": true - }, - "require-main-filename": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/require-main-filename/-/require-main-filename-1.0.1.tgz", - "integrity": "sha1-l/cXtp1IeE9fUmpsWqj/3aBVpNE=", - "dev": true - }, - "requires-port": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz", - "integrity": "sha1-kl0mAdOaxIXgkc8NpcbmlNw9yv8=", - "dev": true - }, - "resolve": { - "version": "1.11.0", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.11.0.tgz", - "integrity": "sha512-WL2pBDjqT6pGUNSUzMw00o4T7If+z4H2x3Gz893WoUQ5KW8Vr9txp00ykiP16VBaZF5+j/OcXJHZ9+PCvdiDKw==", - "dev": true, - "requires": { - "path-parse": "^1.0.6" - } - }, - "resolve-url": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/resolve-url/-/resolve-url-0.2.1.tgz", - "integrity": "sha1-LGN/53yJOv0qZj/iGqkIAGjiBSo=", - "dev": true - }, - "resp-modifier": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/resp-modifier/-/resp-modifier-6.0.2.tgz", - "integrity": "sha1-sSTeXE+6/LpUH0j/pzlw9KpFa08=", - "dev": true, - "requires": { - "debug": "^2.2.0", - "minimatch": "^3.0.2" - }, - "dependencies": { - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - } - } - }, - "ret": { - "version": "0.1.15", - "resolved": "https://registry.npmjs.org/ret/-/ret-0.1.15.tgz", - "integrity": "sha512-TTlYpa+OL+vMMNG24xSlQGEJ3B/RzEfUlLct7b5G/ytav+wPrplCpVMFuwzXbkecJrb6IYo1iFb0S9v37754mg==", - "dev": true - }, - "rx": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/rx/-/rx-4.1.0.tgz", - "integrity": "sha1-pfE/957zt0D+MKqAP7CfmIBdR4I=", - "dev": true - }, - "rxjs": { - "version": "5.5.12", - "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-5.5.12.tgz", - "integrity": 
"sha512-xx2itnL5sBbqeeiVgNPVuQQ1nC8Jp2WfNJhXWHmElW9YmrpS9UVnNzhP3EH3HFqexO5Tlp8GhYY+WEcqcVMvGw==", - "dev": true, - "requires": { - "symbol-observable": "1.0.1" - } - }, - "safe-buffer": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", - "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", - "dev": true - }, - "safe-regex": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz", - "integrity": "sha1-QKNmnzsHfR6UPURinhV91IAjvy4=", - "dev": true, - "requires": { - "ret": "~0.1.10" - } - }, - "safer-buffer": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", - "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", - "dev": true - }, - "semver": { - "version": "5.7.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.0.tgz", - "integrity": "sha512-Ya52jSX2u7QKghxeoFGpLwCtGlt7j0oY9DYb5apt9nPlJ42ID+ulTXESnt/qAQcoSERyZ5sl3LDIOw0nAn/5DA==", - "dev": true - }, - "send": { - "version": "0.16.2", - "resolved": "https://registry.npmjs.org/send/-/send-0.16.2.tgz", - "integrity": "sha512-E64YFPUssFHEFBvpbbjr44NCLtI1AohxQ8ZSiJjQLskAdKuriYEP6VyGEsRDH8ScozGpkaX1BGvhanqCwkcEZw==", - "dev": true, - "requires": { - "debug": "2.6.9", - "depd": "~1.1.2", - "destroy": "~1.0.4", - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "fresh": "0.5.2", - "http-errors": "~1.6.2", - "mime": "1.4.1", - "ms": "2.0.0", - "on-finished": "~2.3.0", - "range-parser": "~1.2.0", - "statuses": "~1.4.0" - }, - "dependencies": { - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - }, - "http-errors": { - "version": "1.6.3", - "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.6.3.tgz", - "integrity": "sha1-i1VoC7S+KDoLW/TqLjhYC+HZMg0=", - "dev": true, - "requires": { - "depd": "~1.1.2", - "inherits": "2.0.3", - "setprototypeof": "1.1.0", - "statuses": ">= 1.4.0 < 2" - } - }, - "setprototypeof": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.0.tgz", - "integrity": "sha512-BvE/TwpZX4FXExxOxZyRGQQv651MSwmWKZGqvmPcRIjDqWub67kTKuIMx43cZZrS/cBBzwBcNDWoFxt2XEFIpQ==", - "dev": true - }, - "statuses": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.4.0.tgz", - "integrity": "sha512-zhSCtt8v2NDrRlPQpCNtw/heZLtfUDqxBM1udqikb/Hbk52LK4nQSwr10u77iopCW5LsyHpuXS0GnEc48mLeew==", - "dev": true - } - } - }, - "serve-index": { - "version": "1.9.1", - "resolved": "https://registry.npmjs.org/serve-index/-/serve-index-1.9.1.tgz", - "integrity": "sha1-03aNabHn2C5c4FD/9bRTvqEqkjk=", - "dev": true, - "requires": { - "accepts": "~1.3.4", - "batch": "0.6.1", - "debug": "2.6.9", - "escape-html": "~1.0.3", - "http-errors": "~1.6.2", - "mime-types": "~2.1.17", - "parseurl": "~1.3.2" - }, - "dependencies": { - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - }, - "http-errors": { - "version": "1.6.3", - "resolved": 
"https://registry.npmjs.org/http-errors/-/http-errors-1.6.3.tgz", - "integrity": "sha1-i1VoC7S+KDoLW/TqLjhYC+HZMg0=", - "dev": true, - "requires": { - "depd": "~1.1.2", - "inherits": "2.0.3", - "setprototypeof": "1.1.0", - "statuses": ">= 1.4.0 < 2" - } - }, - "setprototypeof": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.0.tgz", - "integrity": "sha512-BvE/TwpZX4FXExxOxZyRGQQv651MSwmWKZGqvmPcRIjDqWub67kTKuIMx43cZZrS/cBBzwBcNDWoFxt2XEFIpQ==", - "dev": true - }, - "statuses": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", - "integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=", - "dev": true - } - } - }, - "serve-static": { - "version": "1.13.2", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.13.2.tgz", - "integrity": "sha512-p/tdJrO4U387R9oMjb1oj7qSMaMfmOyd4j9hOFoxZe2baQszgHcSWjuya/CiT5kgZZKRudHNOA0pYXOl8rQ5nw==", - "dev": true, - "requires": { - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "parseurl": "~1.3.2", - "send": "0.16.2" - } - }, - "server-destroy": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/server-destroy/-/server-destroy-1.0.1.tgz", - "integrity": "sha1-8Tv5KOQrnD55OD5hzDmYtdFObN0=", - "dev": true - }, - "set-blocking": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", - "integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=", - "dev": true - }, - "set-value": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/set-value/-/set-value-2.0.0.tgz", - "integrity": "sha512-hw0yxk9GT/Hr5yJEYnHNKYXkIA8mVJgd9ditYZCe16ZczcaELYYcfvaXesNACk2O8O0nTiPQcQhGUQj8JLzeeg==", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-extendable": "^0.1.1", - "is-plain-object": "^2.0.3", - "split-string": "^3.0.1" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "setprototypeof": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.1.tgz", - "integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw==", - "dev": true - }, - "shebang-command": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz", - "integrity": "sha1-RKrGW2lbAzmJaMOfNj/uXer98eo=", - "dev": true, - "requires": { - "shebang-regex": "^1.0.0" - } - }, - "shebang-regex": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-1.0.0.tgz", - "integrity": "sha1-2kL0l0DAtC2yypcoVxyxkMmO/qM=", - "dev": true - }, - "shell-quote": { - "version": "1.6.1", - "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.6.1.tgz", - "integrity": "sha1-9HgZSczkAmlxJ0MOo7PFR29IF2c=", - "dev": true, - "requires": { - "array-filter": "~0.0.0", - "array-map": "~0.0.0", - "array-reduce": "~0.0.0", - "jsonify": "~0.0.0" - } - }, - "snapdragon": { - "version": "0.8.2", - "resolved": "https://registry.npmjs.org/snapdragon/-/snapdragon-0.8.2.tgz", - "integrity": "sha512-FtyOnWN/wCHTVXOMwvSv26d+ko5vWlIDD6zoUJ7LW8vh+ZBC8QdljveRP+crNrtBwioEUWy/4dMtbBjA4ioNlg==", - "dev": true, - "requires": { - "base": "^0.11.1", - "debug": "^2.2.0", - "define-property": "^0.2.5", - "extend-shallow": "^2.0.1", - "map-cache": 
"^0.2.2", - "source-map": "^0.5.6", - "source-map-resolve": "^0.5.0", - "use": "^3.1.0" - }, - "dependencies": { - "debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", - "dev": true, - "requires": { - "ms": "2.0.0" - } - }, - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - }, - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - } - } - }, - "snapdragon-node": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/snapdragon-node/-/snapdragon-node-2.1.1.tgz", - "integrity": "sha512-O27l4xaMYt/RSQ5TR3vpWCAB5Kb/czIcqUFOM/C4fYcLnbZUc1PkjTAMjof2pBWaSTwOUd6qUHcFGVGj7aIwnw==", - "dev": true, - "requires": { - "define-property": "^1.0.0", - "isobject": "^3.0.0", - "snapdragon-util": "^3.0.1" - }, - "dependencies": { - "define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", - "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", - "dev": true, - "requires": { - "is-descriptor": "^1.0.0" - } - }, - "is-accessor-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", - "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-data-descriptor": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", - "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", - "dev": true, - "requires": { - "kind-of": "^6.0.0" - } - }, - "is-descriptor": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", - "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", - "dev": true, - "requires": { - "is-accessor-descriptor": "^1.0.0", - "is-data-descriptor": "^1.0.0", - "kind-of": "^6.0.2" - } - } - } - }, - "snapdragon-util": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/snapdragon-util/-/snapdragon-util-3.0.1.tgz", - "integrity": "sha512-mbKkMdQKsjX4BAL4bRYTj21edOf8cN7XHdYUJEe+Zn99hVEYcMvKPct1IqNe7+AZPirn8BCDOQBHQZknqmKlZQ==", - "dev": true, - "requires": { - "kind-of": "^3.2.0" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "socket.io": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/socket.io/-/socket.io-2.1.1.tgz", - "integrity": "sha512-rORqq9c+7W0DAK3cleWNSyfv/qKXV99hV4tZe+gGLfBECw3XEhBy7x85F3wypA9688LKjtwO9pX9L33/xQI8yA==", - "dev": true, - "requires": { - "debug": "~3.1.0", - "engine.io": "~3.2.0", - "has-binary2": "~1.0.2", - "socket.io-adapter": "~1.1.0", - "socket.io-client": "2.1.1", - "socket.io-parser": "~3.2.0" 
- }, - "dependencies": { - "engine.io-client": { - "version": "3.2.1", - "resolved": "https://registry.npmjs.org/engine.io-client/-/engine.io-client-3.2.1.tgz", - "integrity": "sha512-y5AbkytWeM4jQr7m/koQLc5AxpRKC1hEVUb/s1FUAWEJq5AzJJ4NLvzuKPuxtDi5Mq755WuDvZ6Iv2rXj4PTzw==", - "dev": true, - "requires": { - "component-emitter": "1.2.1", - "component-inherit": "0.0.3", - "debug": "~3.1.0", - "engine.io-parser": "~2.1.1", - "has-cors": "1.1.0", - "indexof": "0.0.1", - "parseqs": "0.0.5", - "parseuri": "0.0.5", - "ws": "~3.3.1", - "xmlhttprequest-ssl": "~1.5.4", - "yeast": "0.1.2" - } - }, - "socket.io-client": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/socket.io-client/-/socket.io-client-2.1.1.tgz", - "integrity": "sha512-jxnFyhAuFxYfjqIgduQlhzqTcOEQSn+OHKVfAxWaNWa7ecP7xSNk2Dx/3UEsDcY7NcFafxvNvKPmmO7HTwTxGQ==", - "dev": true, - "requires": { - "backo2": "1.0.2", - "base64-arraybuffer": "0.1.5", - "component-bind": "1.0.0", - "component-emitter": "1.2.1", - "debug": "~3.1.0", - "engine.io-client": "~3.2.0", - "has-binary2": "~1.0.2", - "has-cors": "1.1.0", - "indexof": "0.0.1", - "object-component": "0.0.3", - "parseqs": "0.0.5", - "parseuri": "0.0.5", - "socket.io-parser": "~3.2.0", - "to-array": "0.1.4" - } - }, - "socket.io-parser": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-3.2.0.tgz", - "integrity": "sha512-FYiBx7rc/KORMJlgsXysflWx/RIvtqZbyGLlHZvjfmPTPeuD/I8MaW7cfFrj5tRltICJdgwflhfZ3NVVbVLFQA==", - "dev": true, - "requires": { - "component-emitter": "1.2.1", - "debug": "~3.1.0", - "isarray": "2.0.1" - } - }, - "ws": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/ws/-/ws-3.3.3.tgz", - "integrity": "sha512-nnWLa/NwZSt4KQJu51MYlCcSQ5g7INpOrOMt4XV8j4dqTXdmlUmSHQ8/oLC069ckre0fRsgfvsKwbTdtKLCDkA==", - "dev": true, - "requires": { - "async-limiter": "~1.0.0", - "safe-buffer": "~5.1.0", - "ultron": "~1.1.0" - } - } - } - }, - "socket.io-adapter": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/socket.io-adapter/-/socket.io-adapter-1.1.1.tgz", - "integrity": "sha1-KoBeihTWNyEk3ZFZrUUC+MsH8Gs=", - "dev": true - }, - "socket.io-client": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/socket.io-client/-/socket.io-client-2.2.0.tgz", - "integrity": "sha512-56ZrkTDbdTLmBIyfFYesgOxsjcLnwAKoN4CiPyTVkMQj3zTUh0QAx3GbvIvLpFEOvQWu92yyWICxB0u7wkVbYA==", - "dev": true, - "requires": { - "backo2": "1.0.2", - "base64-arraybuffer": "0.1.5", - "component-bind": "1.0.0", - "component-emitter": "1.2.1", - "debug": "~3.1.0", - "engine.io-client": "~3.3.1", - "has-binary2": "~1.0.2", - "has-cors": "1.1.0", - "indexof": "0.0.1", - "object-component": "0.0.3", - "parseqs": "0.0.5", - "parseuri": "0.0.5", - "socket.io-parser": "~3.3.0", - "to-array": "0.1.4" - } - }, - "socket.io-parser": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-3.3.0.tgz", - "integrity": "sha512-hczmV6bDgdaEbVqhAeVMM/jfUfzuEZHsQg6eOmLgJht6G3mPKMxYm75w2+qhAQZ+4X+1+ATZ+QFKeOZD5riHng==", - "dev": true, - "requires": { - "component-emitter": "1.2.1", - "debug": "~3.1.0", - "isarray": "2.0.1" - } - }, - "source-map": { - "version": "0.5.7", - "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", - "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=", - "dev": true - }, - "source-map-resolve": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/source-map-resolve/-/source-map-resolve-0.5.2.tgz", - "integrity": 
"sha512-MjqsvNwyz1s0k81Goz/9vRBe9SZdB09Bdw+/zYyO+3CuPk6fouTaxscHkgtE8jKvf01kVfl8riHzERQ/kefaSA==", - "dev": true, - "requires": { - "atob": "^2.1.1", - "decode-uri-component": "^0.2.0", - "resolve-url": "^0.2.1", - "source-map-url": "^0.4.0", - "urix": "^0.1.0" - } - }, - "source-map-url": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/source-map-url/-/source-map-url-0.4.0.tgz", - "integrity": "sha1-PpNdfd1zYxuXZZlW1VEo6HtQhKM=", - "dev": true - }, - "spdx-correct": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.0.tgz", - "integrity": "sha512-lr2EZCctC2BNR7j7WzJ2FpDznxky1sjfxvvYEyzxNyb6lZXHODmEoJeFu4JupYlkfha1KZpJyoqiJ7pgA1qq8Q==", - "dev": true, - "requires": { - "spdx-expression-parse": "^3.0.0", - "spdx-license-ids": "^3.0.0" - } - }, - "spdx-exceptions": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/spdx-exceptions/-/spdx-exceptions-2.2.0.tgz", - "integrity": "sha512-2XQACfElKi9SlVb1CYadKDXvoajPgBVPn/gOQLrTvHdElaVhr7ZEbqJaRnJLVNeaI4cMEAgVCeBMKF6MWRDCRA==", - "dev": true - }, - "spdx-expression-parse": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-3.0.0.tgz", - "integrity": "sha512-Yg6D3XpRD4kkOmTpdgbUiEJFKghJH03fiC1OPll5h/0sO6neh2jqRDVHOQ4o/LMea0tgCkbMgea5ip/e+MkWyg==", - "dev": true, - "requires": { - "spdx-exceptions": "^2.1.0", - "spdx-license-ids": "^3.0.0" - } - }, - "spdx-license-ids": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.4.tgz", - "integrity": "sha512-7j8LYJLeY/Yb6ACbQ7F76qy5jHkp0U6jgBfJsk97bwWlVUnUWsAgpyaCvo17h0/RQGnQ036tVDomiwoI4pDkQA==", - "dev": true - }, - "split-string": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/split-string/-/split-string-3.1.0.tgz", - "integrity": "sha512-NzNVhJDYpwceVVii8/Hu6DKfD2G+NrQHlS/V/qgv763EYudVwEcMQNxd2lh+0VrUByXN/oJkl5grOhYWvQUYiw==", - "dev": true, - "requires": { - "extend-shallow": "^3.0.0" - } - }, - "static-extend": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz", - "integrity": "sha1-YICcOcv/VTNyJv1eC1IPNB8ftcY=", - "dev": true, - "requires": { - "define-property": "^0.2.5", - "object-copy": "^0.1.0" - }, - "dependencies": { - "define-property": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", - "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", - "dev": true, - "requires": { - "is-descriptor": "^0.1.0" - } - } - } - }, - "statuses": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.3.1.tgz", - "integrity": "sha1-+vUbnrdKrvOzrPStX2Gr8ky3uT4=", - "dev": true - }, - "stream-throttle": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/stream-throttle/-/stream-throttle-0.1.3.tgz", - "integrity": "sha1-rdV8jXzHOoFjDTHNVdOWHPr7qcM=", - "dev": true, - "requires": { - "commander": "^2.2.0", - "limiter": "^1.0.5" - } - }, - "string-width": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", - "integrity": "sha1-EYvfW4zcUaKn5w0hHgfisLmxB9M=", - "dev": true, - "requires": { - "code-point-at": "^1.0.0", - "is-fullwidth-code-point": "^1.0.0", - "strip-ansi": "^3.0.0" - } - }, - "string.prototype.padend": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/string.prototype.padend/-/string.prototype.padend-3.0.0.tgz", - "integrity": "sha1-86rvfBcZ8XDF6rHDK/eA2W4h8vA=", - 
"dev": true, - "requires": { - "define-properties": "^1.1.2", - "es-abstract": "^1.4.3", - "function-bind": "^1.0.2" - } - }, - "string_decoder": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", - "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", - "dev": true, - "requires": { - "safe-buffer": "~5.1.0" - } - }, - "strip-ansi": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", - "integrity": "sha1-ajhfuIU9lS1f8F0Oiq+UJ43GPc8=", - "dev": true, - "requires": { - "ansi-regex": "^2.0.0" - } - }, - "strip-bom": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-2.0.0.tgz", - "integrity": "sha1-YhmoVhZSBJHzV4i9vxRHqZx+aw4=", - "dev": true, - "requires": { - "is-utf8": "^0.2.0" - } - }, - "supports-color": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", - "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", - "dev": true - }, - "symbol-observable": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/symbol-observable/-/symbol-observable-1.0.1.tgz", - "integrity": "sha1-g0D8RwLDEi310iKI+IKD9RPT/dQ=", - "dev": true - }, - "tfunk": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/tfunk/-/tfunk-3.1.0.tgz", - "integrity": "sha1-OORBT8ZJd9h6/apy+sttKfgve1s=", - "dev": true, - "requires": { - "chalk": "^1.1.1", - "object-path": "^0.9.0" - } - }, - "to-array": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/to-array/-/to-array-0.1.4.tgz", - "integrity": "sha1-F+bBH3PdTz10zaek/zI46a2b+JA=", - "dev": true - }, - "to-object-path": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/to-object-path/-/to-object-path-0.3.0.tgz", - "integrity": "sha1-KXWIt7Dn4KwI4E5nL4XB9JmeF68=", - "dev": true, - "requires": { - "kind-of": "^3.0.2" - }, - "dependencies": { - "kind-of": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", - "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", - "dev": true, - "requires": { - "is-buffer": "^1.1.5" - } - } - } - }, - "to-regex": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/to-regex/-/to-regex-3.0.2.tgz", - "integrity": "sha512-FWtleNAtZ/Ki2qtqej2CXTOayOH9bHDQF+Q48VpWyDXjbYxA4Yz8iDB31zXOBUlOHHKidDbqGVrTUvQMPmBGBw==", - "dev": true, - "requires": { - "define-property": "^2.0.2", - "extend-shallow": "^3.0.2", - "regex-not": "^1.0.2", - "safe-regex": "^1.1.0" - } - }, - "to-regex-range": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-2.1.1.tgz", - "integrity": "sha1-fIDBe53+vlmeJzZ+DU3VWQFB2zg=", - "dev": true, - "requires": { - "is-number": "^3.0.0", - "repeat-string": "^1.6.1" - } - }, - "toidentifier": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz", - "integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw==", - "dev": true - }, - "ua-parser-js": { - "version": "0.7.17", - "resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-0.7.17.tgz", - "integrity": "sha512-uRdSdu1oA1rncCQL7sCj8vSyZkgtL7faaw9Tc9rZ3mGgraQ7+Pdx7w5mnOSF3gw9ZNG6oc+KXfkon3bKuROm0g==", - "dev": true - }, - "ultron": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/ultron/-/ultron-1.1.1.tgz", - "integrity": 
"sha512-UIEXBNeYmKptWH6z8ZnqTeS8fV74zG0/eRU9VGkpzz+LIJNs8W/zM/L+7ctCkRrgbNnnR0xxw4bKOr0cW0N0Og==", - "dev": true - }, - "union-value": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/union-value/-/union-value-1.0.0.tgz", - "integrity": "sha1-XHHDTLW61dzr4+oM0IIHulqhrqQ=", - "dev": true, - "requires": { - "arr-union": "^3.1.0", - "get-value": "^2.0.6", - "is-extendable": "^0.1.1", - "set-value": "^0.4.3" - }, - "dependencies": { - "extend-shallow": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", - "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", - "dev": true, - "requires": { - "is-extendable": "^0.1.0" - } - }, - "set-value": { - "version": "0.4.3", - "resolved": "https://registry.npmjs.org/set-value/-/set-value-0.4.3.tgz", - "integrity": "sha1-fbCPnT0i3H945Trzw79GZuzfzPE=", - "dev": true, - "requires": { - "extend-shallow": "^2.0.1", - "is-extendable": "^0.1.1", - "is-plain-object": "^2.0.1", - "to-object-path": "^0.3.0" - } - } - } - }, - "universalify": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz", - "integrity": "sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==", - "dev": true - }, - "unpipe": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", - "integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw=", - "dev": true - }, - "unset-value": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/unset-value/-/unset-value-1.0.0.tgz", - "integrity": "sha1-g3aHP30jNRef+x5vw6jtDfyKtVk=", - "dev": true, - "requires": { - "has-value": "^0.3.1", - "isobject": "^3.0.0" - }, - "dependencies": { - "has-value": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/has-value/-/has-value-0.3.1.tgz", - "integrity": "sha1-ex9YutpiyoJ+wKIHgCVlSEWZXh8=", - "dev": true, - "requires": { - "get-value": "^2.0.3", - "has-values": "^0.1.4", - "isobject": "^2.0.0" - }, - "dependencies": { - "isobject": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", - "integrity": "sha1-8GVWEJaj8dou9GJy+BXIQNh+DIk=", - "dev": true, - "requires": { - "isarray": "1.0.0" - } - } - } - }, - "has-values": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/has-values/-/has-values-0.1.4.tgz", - "integrity": "sha1-bWHeldkd/Km5oCCJrThL/49it3E=", - "dev": true - }, - "isarray": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", - "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=", - "dev": true - } - } - }, - "upath": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/upath/-/upath-1.1.2.tgz", - "integrity": "sha512-kXpym8nmDmlCBr7nKdIx8P2jNBa+pBpIUFRnKJ4dr8htyYGJFokkr2ZvERRtUN+9SY+JqXouNgUPtv6JQva/2Q==", - "dev": true - }, - "urix": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/urix/-/urix-0.1.0.tgz", - "integrity": "sha1-2pN/emLiH+wf0Y1Js1wpNQZ6bHI=", - "dev": true - }, - "use": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/use/-/use-3.1.1.tgz", - "integrity": "sha512-cwESVXlO3url9YWlFW/TA9cshCEhtu7IKJ/p5soJ/gGpj7vbvFrAY/eIioQ6Dw23KjZhYgiIo8HOs1nQ2vr/oQ==", - "dev": true - }, - "util-deprecate": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=", - "dev": true - }, - "utils-merge": { - "version": "1.0.1", - "resolved": 
"https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", - "integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM=", - "dev": true - }, - "validate-npm-package-license": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz", - "integrity": "sha512-DpKm2Ui/xN7/HQKCtpZxoRWBhZ9Z0kqtygG8XCgNQ8ZlDnxuQmWhj566j8fN4Cu3/JmbhsDo7fcAJq4s9h27Ew==", - "dev": true, - "requires": { - "spdx-correct": "^3.0.0", - "spdx-expression-parse": "^3.0.0" - } - }, - "which": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz", - "integrity": "sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ==", - "dev": true, - "requires": { - "isexe": "^2.0.0" - } - }, - "which-module": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/which-module/-/which-module-1.0.0.tgz", - "integrity": "sha1-u6Y8qGGUiZT/MHc2CJ47lgJsKk8=", - "dev": true - }, - "window-size": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/window-size/-/window-size-0.2.0.tgz", - "integrity": "sha1-tDFbtCFKPXBY6+7okuE/ok2YsHU=", - "dev": true - }, - "wrap-ansi": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", - "integrity": "sha1-2Pw9KE3QV5T+hJc8rs3Rz4JP3YU=", - "dev": true, - "requires": { - "string-width": "^1.0.1", - "strip-ansi": "^3.0.1" - } - }, - "wrappy": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=", - "dev": true - }, - "ws": { - "version": "6.1.4", - "resolved": "https://registry.npmjs.org/ws/-/ws-6.1.4.tgz", - "integrity": "sha512-eqZfL+NE/YQc1/ZynhojeV8q+H050oR8AZ2uIev7RU10svA9ZnJUddHcOUZTJLinZ9yEfdA2kSATS2qZK5fhJA==", - "dev": true, - "requires": { - "async-limiter": "~1.0.0" - } - }, - "xmlhttprequest-ssl": { - "version": "1.5.5", - "resolved": "https://registry.npmjs.org/xmlhttprequest-ssl/-/xmlhttprequest-ssl-1.5.5.tgz", - "integrity": "sha1-wodrBhaKrcQOV9l+gRkayPQ5iz4=", - "dev": true - }, - "y18n": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/y18n/-/y18n-3.2.2.tgz", - "integrity": "sha512-uGZHXkHnhF0XeeAPgnKfPv1bgKAYyVvmNL1xlKsPYZPaIHxGti2hHqvOCQv71XMsLxu1QjergkqogUnms5D3YQ==", - "dev": true - }, - "yargs": { - "version": "6.4.0", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-6.4.0.tgz", - "integrity": "sha1-gW4ahm1VmMzzTlWW3c4i2S2kkNQ=", - "dev": true, - "requires": { - "camelcase": "^3.0.0", - "cliui": "^3.2.0", - "decamelize": "^1.1.1", - "get-caller-file": "^1.0.1", - "os-locale": "^1.4.0", - "read-pkg-up": "^1.0.1", - "require-directory": "^2.1.1", - "require-main-filename": "^1.0.1", - "set-blocking": "^2.0.0", - "string-width": "^1.0.2", - "which-module": "^1.0.0", - "window-size": "^0.2.0", - "y18n": "^3.2.1", - "yargs-parser": "^4.1.0" - } - }, - "yargs-parser": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-4.2.1.tgz", - "integrity": "sha1-KczqwNxPA8bIe0qfIX3RjJ90hxw=", - "dev": true, - "requires": { - "camelcase": "^3.0.0" - } - }, - "yeast": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/yeast/-/yeast-0.1.2.tgz", - "integrity": "sha1-AI4G2AlDIMNy28L47XagymyKxBk=", - "dev": true - } - } -} diff --git a/docs/package.json b/docs/package.json deleted file mode 100644 index 839cab7e2d..0000000000 --- a/docs/package.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": "kedro-docs", - "version": 
"1.0.0", - "main": "build/html/index.html", - "scripts": { - "serve": "browser-sync start --server 'build/html' --files 'build/html/_static/css/*.css'", - "watch": "copy-and-watch --watch source/css/*.css build/html/_static/css", - "start": "npm-run-all -p serve watch" - }, - "author": "Richard Westenra ", - "devDependencies": { - "browser-sync": "^2.26.7", - "copy-and-watch": "^0.1.2", - "npm-run-all": "^4.1.5" - } -} diff --git a/docs/source/01_introduction/01_introduction.md b/docs/source/01_introduction/01_introduction.md deleted file mode 100644 index 41d8c30579..0000000000 --- a/docs/source/01_introduction/01_introduction.md +++ /dev/null @@ -1,26 +0,0 @@ -# What is Kedro? - -Kedro is an open-source Python framework for creating reproducible, maintainable and modular data science code. It borrows concepts from software engineering best-practice and applies them to machine-learning code; applied concepts include modularity, separation of concerns and versioning. - -For the source code, take a look at the [Kedro repository on Github](https://github.com/quantumblacklabs/kedro). - -## Learn how to use Kedro - -In the next few chapters, you will learn how to [install Kedro](../02_get_started/01_prerequisites.md) and set up your own production-ready data pipelines. - -Once you are set up, we suggest working through our examples, including: - -- A typical "Hello World" example, for an [entry-level description of the main Kedro concepts](https://kedro.readthedocs.io/en/stable/02_get_started/03_hello_kedro.html) -- An [introduction to the project template](https://kedro.readthedocs.io/en/stable/02_get_started/05_example_project.html) using the Iris dataset -- A more detailed [spaceflights tutorial](https://kedro.readthedocs.io/en/stable/03_tutorial/02_tutorial_template.html) to give you hands-on experience - -We also recommend the [frequently asked questions](../12_faq/01_faq.md) and the [API reference documentation](/kedro.rst) for additional information. - -## Assumptions - -We have designed the documentation and the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) for anyone new to Kedro. The more knowledge of Python you have, the easier you will find the learning curve. - -```eval_rst -.. note:: There are a number of excellent online resources for learning Python, but you should choose those that reference Python 3, as Kedro is built for Python 3.7+. There are many curated lists of online resources, such as the `official Python programming language website `_ and `this list of free programming books and tutorials `_. - -``` diff --git a/docs/source/02_get_started/01_prerequisites.md b/docs/source/02_get_started/01_prerequisites.md deleted file mode 100644 index 83552d26d9..0000000000 --- a/docs/source/02_get_started/01_prerequisites.md +++ /dev/null @@ -1,99 +0,0 @@ -# Installation prerequisites - -- Kedro supports macOS, Linux and Windows (7 / 8 / 10 and Windows Server 2016+). If you encounter any problems on these platforms, please check the [frequently asked questions](../12_faq/01_faq.md), [GitHub Discussions](https://github.com/quantumblacklabs/kedro/discussions) or the [Discord Server](https://discord.gg/akJDeVaxnB). - -- To work with Kedro, we highly recommend that you [download and install Anaconda](https://www.anaconda.com/products/individual#Downloads) (Python 3.x version). - -- If you are using PySpark, you will also need to [install Java](https://www.oracle.com/java/technologies/javase-downloads.html). 
If you are a Windows user, you will need admin rights to complete the installation. - -## Virtual environments - -A Python virtual environment gives a project an isolated environment with its own dependencies, independent of any other project. We recommend that you create a new virtual environment for *each* new Kedro project you create. - -> [Read more about Python Virtual Environments](https://realpython.com/python-virtual-environments-a-primer/). - -Depending on your preferred Python installation, you can create virtual environments for working with Kedro as follows: - -- With [`conda`](#conda), a package and environment manager bundled with Anaconda - -- Without Anaconda, using [`venv`](#venv-instead-of-conda) or [`pipenv`](#pipenv-instead-of-conda) - -### `conda` - -[Install `conda`](https://docs.conda.io/projects/conda/en/latest/user-guide/install/) on your computer. - -Create a new Python virtual environment, called `kedro-environment`, using `conda`: - -```bash -conda create --name kedro-environment python=3.7 -y -``` - -This will create an isolated Python 3.7 environment. To activate it: - -```bash -conda activate kedro-environment -``` - -To exit `kedro-environment`: - -```bash -conda deactivate -``` - -```eval_rst -.. note:: The ``conda`` virtual environment is not dependent on your current working directory and can be activated from any directory. -``` - -### `venv` (instead of `conda`) - -If you are using Python 3, the `venv` module is already included in the standard library. Create a directory in which to work with Kedro and to hold your virtual environment: - -```bash -mkdir kedro-environment && cd kedro-environment -``` - -This will create a `kedro-environment` directory in your current working directory. Then create a new virtual environment in this directory by running: - -```bash -python -m venv env/kedro-environment # macOS / Linux -python -m venv env\kedro-environment # Windows -``` - -Activate this virtual environment: - -```bash -source env/kedro-environment/bin/activate # macOS / Linux -.\env\kedro-environment\Scripts\activate # Windows -``` - -To exit the environment: - -```bash -deactivate -``` - -### `pipenv` (instead of `conda`) - -You will need to install `pipenv` as follows: - -```bash -pip install pipenv -``` - -Create a directory for the virtual environment and change to that directory: - -```bash -mkdir kedro-environment && cd kedro-environment -``` - -Install the dependencies you need with `pipenv install`; the first install also creates a virtual environment for the directory. Once all the dependencies are installed, start a session with the correct virtual environment activated: - -```bash -pipenv shell -``` - -To exit the shell session: - -```bash -exit -``` diff --git a/docs/source/02_get_started/02_install.md b/docs/source/02_get_started/02_install.md deleted file mode 100644 index c9082cce4d..0000000000 --- a/docs/source/02_get_started/02_install.md +++ /dev/null @@ -1,36 +0,0 @@ -# Install Kedro - -To install Kedro from the Python Package Index (PyPI), simply run: - -```bash -pip install kedro -``` - -```eval_rst -.. note:: It is also possible to install Kedro using ``conda``, as follows, but we recommend using ``pip`` at this point to avoid any potential dependency issues: -``` - -```bash -conda install -c conda-forge kedro -``` - -Both `pip` and `conda` install the core Kedro module, which includes the CLI tool, project template, pipeline abstraction, framework, and support for configuration.
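As a minimal sketch, assuming nothing more than a standard `pip` or `conda` installation, the building blocks named above can be imported directly from Python:

```python
# Minimal sanity-check sketch: the imports below exist only to confirm that the
# core Kedro building blocks are available after installation.
import kedro
from kedro.io import DataCatalog, MemoryDataSet  # data management / configuration
from kedro.pipeline import node, Pipeline  # pipeline abstraction
from kedro.runner import SequentialRunner  # execution

print(f"Kedro {kedro.__version__} is installed and importable.")
```

If this runs without an `ImportError`, the installation is usable from Python; the `kedro info` command in the next section provides a similar confirmation from the command line.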
- -## Verify a successful installation - -To check that Kedro is installed: - -```bash -kedro info -``` - -You should see an ASCII art graphic and the Kedro version number. For example: - -![](../meta/images/kedro_graphic.png) - -If you do not see the graphic displayed, or have any issues with your installation, see the [frequently asked questions](../12_faq/01_faq.md), check out [GitHub Discussions](https://github.com/quantumblacklabs/kedro/discussions) or talk to the community on the [Discord Server](https://discord.gg/akJDeVaxnB) -. - -## Install a development version - -You can try out a development version of Kedro direct from the [Kedro Github repository](https://github.com/quantumblacklabs/kedro) by following [these steps](../12_faq/01_faq.md#how-can-i-use-a-development-version-of-kedro). diff --git a/docs/source/02_get_started/03_hello_kedro.md b/docs/source/02_get_started/03_hello_kedro.md deleted file mode 100644 index 0719bf0f0e..0000000000 --- a/docs/source/02_get_started/03_hello_kedro.md +++ /dev/null @@ -1,124 +0,0 @@ -# A "Hello World" example - -It is time to introduce the most basic elements of Kedro. We have split a small example into sections to discuss each of the concepts with code. - -You can copy the example as one chunk of code from the bottom of this page. - -```eval_rst -.. note:: We do not create a Kedro project in this first example, but illustrate the concepts within a single ``.py`` file. -``` - -## Node - -A `node` is a Kedro concept. It is a wrapper for a Python function that names the inputs and outputs of that function. It is the building block of a pipeline. Nodes can be linked when the output of one node is the input of another. - -Here, the `return_greeting` function is wrapped by a node called `return_greeting_node`, which has no inputs, and names a single output (`my_salutation`): - -```python -from kedro.pipeline import node - - -# Prepare first node -def return_greeting(): - return "Hello" - - -return_greeting_node = node(func=return_greeting, inputs=None, outputs="my_salutation") -``` - -The `join_statements` function is wrapped by a node called `join_statements_node`, which names a single input (`my_salutation`) and a single output (`my_message`): - -```python -# Prepare second node -def join_statements(greeting): - return f"{greeting} Kedro!" - - -join_statements_node = node( - join_statements, inputs="my_salutation", outputs="my_message" -) -``` - -Note that `my_salutation` is the output of `return_greeting_node` and also the input of `join_statements_node`. - -## Pipeline - -A pipeline organises the dependencies and execution order of a collection of nodes, and connects inputs and outputs while keeping your code modular. The pipeline determines the node execution order by resolving dependencies and does *not* necessarily run the nodes in the order in which they are passed in. - -In this example the pipeline executes `return_greeting_node` before it executes `join_statements_node`: - -```python -from kedro.pipeline import Pipeline - -# Assemble nodes into a pipeline -pipeline = Pipeline([return_greeting_node, join_statements_node]) -``` - -## DataCatalog - -A `DataCatalog` is a Kedro concept. It is the registry of all data sources that the project can use. It maps the names of node inputs and outputs as keys in a `DataSet`, which is a Kedro class that can be specialised for different types of data storage. Kedro uses a `MemoryDataSet` for data that is simply stored in-memory. 
- -```python -from kedro.io import DataCatalog, MemoryDataSet - -# Prepare a data catalog -data_catalog = DataCatalog({"my_salutation": MemoryDataSet()}) -``` - -Kedro provides a [number of different built-in datasets](/kedro.extras.datasets) for different file types and file systems so you don’t have to write the logic for reading/writing data. - -## Runner - -The Runner is an object that runs the pipeline. Kedro resolves the order in which the nodes are executed: - -1. Kedro first executes `return_greeting_node`. This runs `return_greeting`, which takes no input but outputs the string "Hello". -2. The output string is stored in the `MemoryDataSet` named `my_salutation`. -3. Kedro then executes the second node, `join_statements_node`. This loads the `my_salutation` dataset and injects it into the `join_statements` function. -4. The function joins the input salutation with "Kedro!" to form the output string "Hello Kedro!" -5. The output of the pipeline is returned in a dictionary with key `my_message`. - -## Hello Kedro! - -It's now time to stitch the code together. Here is the full example: - -```python -"""Contents of hello_kedro.py""" -from kedro.io import DataCatalog, MemoryDataSet -from kedro.pipeline import node, Pipeline -from kedro.runner import SequentialRunner - -# Prepare a data catalog -data_catalog = DataCatalog({"my_salutation": MemoryDataSet()}) - -# Prepare first node -def return_greeting(): - return "Hello" - - -return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation") - -# Prepare second node -def join_statements(greeting): - return f"{greeting} Kedro!" - - -join_statements_node = node( - join_statements, inputs="my_salutation", outputs="my_message" -) - -# Assemble nodes into a pipeline -pipeline = Pipeline([return_greeting_node, join_statements_node]) - -# Create a runner to run the pipeline -runner = SequentialRunner() - -# Run the pipeline -print(runner.run(pipeline, data_catalog)) -``` -Then open a terminal and run the following command: - -```bash -python hello_kedro.py -``` - -You should see `{'my_message': 'Hello Kedro!'}` printed to the console. diff --git a/docs/source/02_get_started/04_new_project.md b/docs/source/02_get_started/04_new_project.md deleted file mode 100644 index 6c224439b8..0000000000 --- a/docs/source/02_get_started/04_new_project.md +++ /dev/null @@ -1,62 +0,0 @@ -# Create a new project - -Once you have [installed Kedro](./02_install.md), you can create a new, empty project by answering a series of questions, or by using settings recorded in a configuration file. - -If you want to create a Kedro project that is populated with some template or example code, you can use Kedro starters by specifying the `--starter` flag. Read the guide to [creating new projects with Kedro Starters](./06_starters.md) for more information. - -## Create a new project interactively - -Create a new project in your current working directory: - -```bash -kedro new -``` - -You will be asked to enter each of the following variables in turn. 
Once you have entered text for the first option (the project's name), you will be offered a default choice for the other options: - -```eval_rst -+------------------------+---------------------+----------------------------------------------------------------------------+ -| Option | Example | Description | -+========================+=====================+============================================================================+ -| :code:`project_name` | :code:`Get Started` | A human-readable name for your new project | -+------------------------+---------------------+----------------------------------------------------------------------------+ -| :code:`repo_name` | :code:`get-started` | Directory that holds your project repository | -+------------------------+---------------------+----------------------------------------------------------------------------+ -| :code:`python_package` | :code:`get_started` | A name for the Python package name in your project (short, all-lowercase) | -+------------------------+---------------------+----------------------------------------------------------------------------+ -``` - - -The output lists the directory in which to find the project. - -## Create a new project from a configuration file - -You can create a new project from a configuration file if you prefer. The file must contain: - -- `output_dir` The path in which to create the project directory -- `project_name` -- `repo_name` -- `python_package` - -The `output_dir` can be set to wherever you want to create the project. For example, `~` for your home directory, or `.` for the current working directory. Here is an example `config.yml`, which assumes that a directory named `~/code` already exists: - -```yaml -output_dir: ~/code -project_name: Get Started -repo_name: get-started -python_package: get_started -``` - -To create the new project: - -```bash -kedro new --config config.yml -``` - -## Initialise a `git` repository - -Having created your new project, if you are using `git`, you may want to set up a new repository by calling: - -```bash -git init -``` diff --git a/docs/source/02_get_started/05_example_project.md b/docs/source/02_get_started/05_example_project.md deleted file mode 100644 index 338c90c7dc..0000000000 --- a/docs/source/02_get_started/05_example_project.md +++ /dev/null @@ -1,150 +0,0 @@ -# Iris dataset example project - -In this chapter we describe the directory structure of a typical Kedro project. We will use an example based on the familiar [Iris dataset](https://www.kaggle.com/uciml/iris). - -The dataset was generated in 1936 by the British statistician and biologist Ronald Fisher. It contains 150 samples in total, comprising 50 samples of 3 different species of Iris plant (Iris Setosa, Iris Versicolour and Iris Virginica). For each sample, the flower measurements are recorded for the sepal length, sepal width, petal length and petal width, as illustrated in the following graphic. - -![](../meta/images/iris_measurements.png) - -The Iris dataset can be used by a machine learning model to illustrate classification (a method used to determine the type of an object by comparison with similar objects that have previously been categorised). Once trained on known data, the machine learning model can make a predictive classification by comparing a test object to the output of its training data. - -## Create the example project - -You must first [create a project](./04_new_project.md). 
Feel free to name your project as you like, but here we will assume the project's repository name is `get-started`. - -```bash -kedro new --starter=pandas-iris -``` - -### Project directory structure - -This example project illustrates a convenient starting point and some best-practices. It follows the default Kedro project template and uses folders to store datasets, notebooks, configuration and source code. When you create your own projects, you can adapt the folder structure if you need to. - -The example project directory is set out as follows: - -``` -get-started # Parent directory of the template -├── conf # Project configuration files -├── data # Local project data (not committed to version control) -├── docs # Project documentation -├── logs # Project output logs (not committed to version control) -├── notebooks # Project related Jupyter notebooks (can be used for experimental code before moving the code to src) -├── README.md # Project README -├── setup.cfg # Configuration options for `pytest` when doing `kedro test` and for the `isort` utility when doing `kedro lint` -└── src # Project source code -``` - -Kedro also creates the following hidden files and folders: - -``` -get-started -├── .coveragerc # Configuration file for the coverage reporting when doing `kedro test` -├── .gitignore # Prevent staging of unnecessary files to `git` -├── .ipython # IPython startup scripts -└── pyproject.toml # Identifies the project root and [contains configuration information](https://kedro.readthedocs.io/en/latest/11_faq/02_architecture_overview.html#kedro-yml) -``` - -#### `conf/` - -Within the `conf` folder, there are two subfolders for storing configuration information: `base` and `local`. - -##### `conf/base/` - -For project-specific settings to share across different installations (for example, with different users) you should use the `base` subfolder of `conf`. - -The folder contains three files for the example, but you can add others as you require: - -- `catalog.yml` - [Configures the Data Catalog](../05_data/01_data_catalog.md#using-the-data-catalog-within-kedro-configuration) with the file paths and load/save configuration required for different datasets -- `logging.yml` - Uses Python's default [`logging`](https://docs.python.org/3/library/logging.html) library to set up logging -- `parameters.yml` - Allows you to define parameters for machine learning experiments e.g. train / test split and number of iterations - -##### `conf/local/` - -The `local` subfolder of `conf` is used for **settings that should not be shared**, such as access credentials, custom editor configuration, personal IDE configuration and other sensitive or personal content. It is specific to user and installation. The contents of `conf/local/` is ignored by `git` (through inclusion in `.gitignore`). By default, Kedro creates one file, `credentials.yml`, in `conf/local`. - -#### `data` - -The `data` folder contains a number of subfolders to store project data. We recommend that you put raw data into `raw` and move processed data to other subfolders according to [data engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention). - -The example project has a single file, `iris.csv`, that contains the Iris dataset. The subfolders of `data` are ignored by `git` through inclusion in `.gitignore` since data is more frequently stored elsewhere, such as in an S3 bucket. 
However, if you are familiar with [`.gitignore`](https://docs.github.com/en/github/using-git/ignoring-files) you can edit it, if you are confident that you need to manage your data in `git`. - -#### `src` - -This subfolder contains the project's source code. It contains 2 subfolders: - -- `get_started/` This is the Python package for your project -- `tests/` The subfolder for unit tests for your project. Projects are preconfigured to run tests using `pytest` when you call `kedro test` from the project's root directory - -### What best practice should I follow to avoid leaking confidential data? - -* Avoid committing data to version control. -* Avoid committing notebook output cells (data can easily sneak into notebooks when you don't delete output cells). -* Avoid committing credentials in `conf/`. Only the `conf/local/` folder should be used for sensitive information like access credentials. - -```eval_rst -.. note:: By default any file inside the ``conf/`` folder (and its subfolders) containing ``credentials`` in its name will be ignored via ``.gitignore``. -``` - - -## Run the example project - -Once you have created the project, to run project-specific Kedro commands, you need to navigate to the directory in which it has been created. - -Call `kedro install` to install the project's dependencies. Next, call `kedro run`: - -```bash -cd getting-started -kedro install -kedro run -``` - -When the command completes, you should see a log message similar to the following in your console: - -```bash -2019-02-13 16:59:26,293 - kedro.runner.sequential_runner - INFO - Completed 4 out of 4 tasks -2019-02-13 16:59:26,293 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. -``` - -## Under the hood: Pipelines and nodes - -The example project contains two modular pipelines: - -- A `data_engineering` pipeline (`src/get_started/pipelines/data_engineering/pipeline.py`) responsible for splitting the data into training and testing samples - -- A `data_science` pipeline (`src/get_started/pipelines/data_science/pipeline.py`) responsible for model training, predictions and accuracy-reporting - - -**Data engineering node** - -This is the data engineering node function within `src/get_started/pipelines/data_engineering/nodes.py`: - -```eval_rst -+-----------------+----------------------------------------------------------------+--------------------------+ -| Node | Description | Node Function Name | -+=================+================================================================+==========================+ -| Split data | Splits the example | :code:`split_data` | -| | `Iris dataset ` | | -| | into train and test samples | | -+-----------------+----------------------------------------------------------------+--------------------------+ -``` - -**Data science node** - -These are the data science node functions within `pipelines/data_science/nodes.py`: - -```eval_rst -+-----------------+----------------------------------------------------------------+--------------------------+ -| Node | Description | Node Function Name | -+=================+================================================================+==========================+ -| Train model | Trains a simple multi-class logistic regression model | :code:`train_model` | -+-----------------+----------------------------------------------------------------+--------------------------+ -| Predict | Makes class predictions given a pre-trained model and a test | :code:`predict` | -| | set | | 
-+-----------------+----------------------------------------------------------------+--------------------------+ -| Report accuracy | Reports the accuracy of the predictions performed by the | :code:`report_accuracy` | -| | previous node | | -+-----------------+----------------------------------------------------------------+--------------------------+ -``` - - -The file `src/pipeline_registry.py` creates and collates the project's modular pipelines into a single pipeline, resolving node execution order from the input and output data dependencies between the nodes. diff --git a/docs/source/02_get_started/06_starters.md b/docs/source/02_get_started/06_starters.md deleted file mode 100644 index fcf297ac47..0000000000 --- a/docs/source/02_get_started/06_starters.md +++ /dev/null @@ -1,86 +0,0 @@ -# Kedro starters - -Kedro starters are used to create projects that contain code to run as-is, or to adapt and extend. They provide pre-defined example code and configuration that can be reused, for example: - -* As example code for a typical Kedro project -* To add a `docker-compose` setup to launch Kedro next to a monitoring stack -* To add deployment scripts and CI/CD setup for your targeted infrastructure - -A Kedro starter is a [Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/) template that contains the boilerplate code for a Kedro project. You can create your own starters for reuse within a project or team, as described in the documentation about [how to create a Kedro starter](../07_extend_kedro/05_create_kedro_starters.md). - -## How to use Kedro starters - -To create a Kedro project using a starter, apply the `--starter` flag to `kedro new` as follows: - -```bash -kedro new --starter= -``` - -```eval_rst -.. note:: ``path-to-starter`` could be a local directory or a VCS repository, as long as it is supported by `Cookiecutter `_. -``` - -To create a project using the `PySpark` starter: - -```bash -kedro new --starter=pyspark -``` - -If no starter is provided to `kedro new`, the default Kedro template will be used, as documented in ["Creating a new project"](./04_new_project.md). - -### Starter aliases - -We provide aliases for common starters maintained by the Kedro team so that users don't have to specify the full path. For example, to create a project using the `PySpark` starter: - -```bash -kedro new --starter=pyspark -``` - -To list all the aliases we support: - -```bash -kedro starter list -``` - -## List of official starters - -The Kedro team maintains the following starters to bootstrap new Kedro projects: - -* [Alias `astro-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/astro-iris): The [Kedro Iris dataset example project](https://kedro.readthedocs.io/en/stable/02_get_started/05_example_project.html) with a minimal setup for deploying the pipeline on Airflow with [Astronomer](https://www.astronomer.io/). -* [Alias `standalone-datacatalog`](https://github.com/quantumblacklabs/kedro-starters/tree/master/standalone-datacatalog): A minimum setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../05_data/01_data_catalog.md), which is a core component of Kedro. This starter is of use in the exploratory phase of a project. For more information, read the guide to [standalone use of the `DataCatalog`](../02_get_started/07_standalone_use_of_datacatalog.md). This starter was formerly known as `mini-kedro`. 
-* [Alias `pandas-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pandas-iris): The [Kedro Iris dataset example project](./05_example_project.md)
-* [Alias `pyspark-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pyspark-iris): An alternative Kedro Iris dataset example, using [PySpark](../11_tools_integration/01_pyspark.md)
-* [Alias `pyspark`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pyspark): The configuration and initialisation code for a [Kedro pipeline using PySpark](../11_tools_integration/01_pyspark.md)
-* [Alias `spaceflights`](https://github.com/quantumblacklabs/kedro-starters/tree/master/spaceflights): The [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) example code
-
-## Starter versioning
-
-By default, Kedro will use the latest version available in the repository, but if you want to use a specific version of a starter, you can pass a `--checkout` argument to the command as follows:
-
-```bash
-kedro new --starter=pyspark --checkout=0.1.0
-```
-
-The `--checkout` value points to a branch, tag or commit in the starter repository.
-
-Under the hood, the value will be passed to the [`--checkout` flag in Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/usage.html#works-directly-with-git-and-hg-mercurial-repos-too).
-
-## Use a starter in interactive mode
-
-By default, when you create a new project using a starter, `kedro new` launches an interactive session that [asks a few questions](./04_new_project.md#create-a-new-project-interactively). You will be prompted to provide the following variables:
-
-* `project_name` - A human-readable name for your new project
-* `repo_name` - A name for the directory that holds your project repository
-* `python_package` - A Python package name for your project package (see [Python package naming conventions](https://www.python.org/dev/peps/pep-0008/#package-and-module-names))
-
-This mode assumes that the starter doesn't require any additional configuration variables.
-
-## Use a starter with a configuration file
-
-Kedro also allows you to [specify a configuration file](./04_new_project.md#Create-a-new-project-from-a-configuration-file) to create a project. Use the `--config` flag alongside the starter as follows:
-
-```bash
-kedro new --config=my_kedro_pyspark_project.yml --starter=pyspark
-```
-
-This option is useful when the starter requires more configuration than the interactive mode asks for.
diff --git a/docs/source/02_get_started/07_standalone_use_of_datacatalog.md b/docs/source/02_get_started/07_standalone_use_of_datacatalog.md
deleted file mode 100644
index fdd97e7c0a..0000000000
--- a/docs/source/02_get_started/07_standalone_use_of_datacatalog.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# Standalone use of the `DataCatalog`
-
-## Introduction
-
-To make it easier to share a Jupyter notebook with others, you need to avoid hard-coded file paths used to load or save data. One way to explore data within a shareable Jupyter notebook is to take advantage of Kedro's [`DataCatalog`](https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html), but in the early phases of a project, you may not want to use any other Kedro features.
-
-The Kedro starter with alias `standalone-datacatalog` (formerly known as `mini-kedro`) provides this minimal functionality. You can specify the sources required to load and save data using a YAML API.
-For example:
-
-```yaml
-# conf/base/catalog.yml
-example_dataset_1:
-  type: pandas.CSVDataSet
-  filepath: folder/filepath.csv
-
-example_dataset_2:
-  type: spark.SparkDataSet
-  filepath: s3a://your_bucket/data/01_raw/example_dataset_2*
-  credentials: dev_s3
-  file_format: csv
-  save_args:
-    if_exists: replace
-```
-
-This makes it possible to interact with data within your Jupyter notebook, with code much like this:
-
-```python
-df = catalog.load("example_dataset_1")
-catalog.save("example_dataset_2", df)
-```
-
-## Usage
-
-Create a new project using the [`standalone-datacatalog` starter](https://github.com/quantumblacklabs/kedro-starters/tree/master/standalone-datacatalog):
-
-```bash
-$ kedro new --starter=standalone-datacatalog
-```
-
-## Content
-
-The starter comprises a minimal setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../05_data/01_data_catalog.md).
-
-The starter contains:
-
-* A `conf/` directory, which contains an example `DataCatalog` configuration (`catalog.yml`)
-* A `data/` directory, which contains an example dataset identical to the one used by the [`pandas-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pandas-iris) starter
-* An example notebook showing how to instantiate the `DataCatalog` and interact with the example dataset
-* A blank `README.md` which points to this page of documentation
-
-## Create a full Kedro project
-
-When you later wish to build a full pipeline, you can use the same configuration, with the following steps:
-
-***1. Create a new empty Kedro project in a new directory***
-
-Let's assume that the new project is created at `/path/to/your/project`:
-
-```bash
-kedro new
-```
-
-***2. Copy the `conf/` and `data/` directories from your `standalone-datacatalog` starter project over to your new project***
-
-```bash
-cp -fR {conf,data} /path/to/your/project
-```
diff --git a/docs/source/03_tutorial/01_spaceflights_tutorial.md b/docs/source/03_tutorial/01_spaceflights_tutorial.md
deleted file mode 100644
index fd07a35fca..0000000000
--- a/docs/source/03_tutorial/01_spaceflights_tutorial.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Kedro spaceflights tutorial
-
-**Scenario**: *It is 2160 and the space tourism industry is booming. Globally, there are thousands of space shuttle companies taking tourists to the Moon and back. You have been able to source amenities offered in each space shuttle, customer reviews and company information.*
-
-**Project**: *You want to construct a model for predicting the price for each trip to the Moon and the corresponding return flight.*
-
-In this tutorial, we illustrate the typical Kedro workflow and the steps necessary to convert an empty Kedro project template into a working project.
-
-In the text, we assume that you create an empty project and follow the flow of the tutorial by copying and pasting the example code into the project as we describe. This tutorial will take approximately 2 hours, and you will learn each step of the Kedro project development workflow by working on an example to construct nodes and pipelines for the price-prediction model.
-
-```eval_rst
-.. note:: You may prefer to get up and running more swiftly, so we provide the full spaceflights example project as a `Kedro starter <../02_get_started/06_starters.md>`_. To create the project, run ``kedro new --starter=spaceflights``. When prompted for a project name, enter ``Kedro Tutorial``.
Subsequently, accept the default suggestions for ``repo_name`` and ``python_package`` by pressing enter. This will generate a project from the `Kedro starter for the spaceflights tutorial `_ so you can follow the tutorial without any of the copy/pasting. -``` - -## Kedro project development workflow - -When building a Kedro project, you will typically follow a standard development workflow: - -![](../meta/images/typical_workflow.png) - -### 1. Set up the project template - -* Create a new project with `kedro new` -* Install project dependencies with `kedro install` -* Configure the following in the `conf` folder: - * Logging - * Credentials - * Any other sensitive / personal content - -### 2. Set up the data - -* Add data to the `data/` folder -* Reference all datasets for the project in `conf/base/catalog.yml` - -### 3. Create the pipeline - -* Create the data transformation steps as Python functions -* Construct the pipeline by adding your functions as nodes -* Choose how to run the pipeline: sequentially or in parallel - -### 4. Package the project - - * Build the project documentation - * Package the project for distribution - -## Optional: Git workflow - -### Create a project repository - -We recommend that you use `git` for source control, but Kedro doesn't require it, and can work without any source control management system. This section is optional if you choose not to use a `git` repository. - -```eval_rst -.. note:: If you are unfamiliar with a typical ``git`` workflow, you can follow one of the most popular, known as `Gitflow `_. -``` - -If you don't have a local `git` repository for your project already, navigate to the project directory and create one: - -```bash -git init -git remote add origin https://github.com/ -``` - -### Submit your changes to GitHub - -As you work on a project, you will periodically save your changes. In a team, we suggest that you each develop your code on a branch and create pull requests to submit it to the `develop` or `main` branches: - -```bash -# create a new feature branch called 'feature/project-template' -git checkout -b feature/project-template -# stage all the files you have changed -git add . -# commit changes to git with an instructive message -git commit -m 'Create project template' -# push changes to remote branch -git push origin feature/project-template -``` - -It isn't necessary to branch, but if everyone in a team works on the same branch (e.g. `main`), you may have to resolve merge conflicts more often. Here is an example of working directly on `main`: - -```bash -# stage all files -git add . -# commit changes to git with an instructive message -git commit -m 'Create project template' -# push changes to remote main -git push origin main -``` diff --git a/docs/source/03_tutorial/02_tutorial_template.md b/docs/source/03_tutorial/02_tutorial_template.md deleted file mode 100644 index 7918319440..0000000000 --- a/docs/source/03_tutorial/02_tutorial_template.md +++ /dev/null @@ -1,83 +0,0 @@ -# Set up the spaceflights project - -In this section, we discuss the project set-up phase, which is the first part of the [standard development workflow](./01_spaceflights_tutorial.md#kedro-project-development-workflow). 
-The set-up steps are as follows:
-
-* Create a new project
-* Install dependencies
-* Configure the project
-
-## Create a new project
-
-Navigate to your chosen working directory and run the following to [create a new empty Kedro project](../02_get_started/04_new_project.md#create-a-new-project-interactively) using the default interactive prompts:
-
-```bash
-kedro new
-```
-
-When prompted for a project name, enter `Kedro Tutorial`. Subsequently, accept the default suggestions for `repo_name` and `python_package` by pressing enter.
-
-## Install project dependencies with `kedro install`
-
-To install the project-specific dependencies, navigate to the root directory of the project and run:
-
-```bash
-kedro install
-```
-
-### More about project dependencies
-
-Up to this point, we haven't discussed project dependencies, so now is a good time to examine them. We use Kedro to specify a project's dependencies and make it easier for others to run your project. It avoids version conflicts because Kedro ensures that you use the same Python packages and versions.
-
-The generic project template bundles some typical dependencies in `src/requirements.txt`. Here's a typical example, although you may find that the version numbers are slightly different depending on the version of Kedro that you are using:
-
-```text
-black==21.5b1 # Used for formatting code with `kedro lint`
-flake8>=3.7.9, <4.0 # Used for linting code with `kedro lint`
-ipython==7.0 # Used for an IPython session with `kedro ipython`
-isort~=5.0 # Used for linting code with `kedro lint`
-jupyter~=1.0 # Used to open a Kedro-session in Jupyter Notebook & Lab
-jupyter_client>=5.1.0, <7.0 # Used to open a Kedro-session in Jupyter Notebook & Lab
-jupyterlab~=3.0 # Used to open a Kedro-session in Jupyter Lab
-kedro==0.17.4
-nbstripout~=0.4 # Strips the output of a Jupyter Notebook and writes the outputless version to the original file
-pytest-cov~=2.5 # Produces test coverage reports
-pytest-mock>=1.7.1, <2.0 # Wrapper around the mock package for easier use with pytest
-pytest~=6.2 # Testing framework for Python code
-wheel>=0.35, <0.37 # The reference implementation of the Python wheel packaging standard
-```
-
-```eval_rst
-.. note:: If your project has ``conda`` dependencies, you can create a ``src/environment.yml`` file and list them there.
-```
-
-### Add and remove project-specific dependencies
-
-The dependencies above may be sufficient for some projects, but for the spaceflights project, you need to add a requirement for `pandas` because you are working with CSV and Excel files. You can add the necessary dependencies for these file types as follows:
-
-```bash
-pip install "kedro[pandas.CSVDataSet,pandas.ExcelDataSet]"
-```
-
-Alternatively, if you need to, you can edit `src/requirements.txt` directly to modify your list of dependencies by replacing the requirement `kedro==0.17.4` with the following (your version of Kedro may be different):
-
-```text
-kedro[pandas.CSVDataSet,pandas.ExcelDataSet]==0.17.4
-```
-
-Then run the following:
-
-```bash
-kedro build-reqs
-```
-
-You can find out more about [how to work with project dependencies](../04_kedro_project_setup/01_dependencies.md) in the Kedro project documentation. In a [later step of this tutorial](./04_create_pipelines.md#update-dependencies), we will modify the project's dependencies to illustrate how, once you have installed project-specific dependencies, you can update them.
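-
-If you want a quick check that the optional `pandas` dataset dependencies were picked up, one option is to import the dataset classes from a Python session inside the project's environment. This is only a sketch, not a tutorial step, and the module path assumes the `kedro.extras.datasets` layout used by Kedro 0.17.x:
-
-```python
-# Optional check: these classes are importable once the pandas dataset
-# dependencies installed above are available in the environment.
-from kedro.extras.datasets.pandas import CSVDataSet, ExcelDataSet
-
-print(CSVDataSet.__name__, ExcelDataSet.__name__)
-```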
- - -## Configure the project - -You may optionally add in any credentials to `conf/local/credentials.yml` that you would need to load specific data sources like usernames and passwords. Some examples are given within the file to illustrate how you store credentials. Additional information can be found in the [advanced documentation on configuration](../04_kedro_project_setup/02_configuration.md). - -At this stage of the workflow, you may also want to [set up logging](../08_logging/01_logging.md), but we do not use it in this tutorial. diff --git a/docs/source/03_tutorial/03_set_up_data.md b/docs/source/03_tutorial/03_set_up_data.md deleted file mode 100644 index 71220d6455..0000000000 --- a/docs/source/03_tutorial/03_set_up_data.md +++ /dev/null @@ -1,144 +0,0 @@ -# Set up the data - -In this section, we discuss the data set-up phase, which is the second part of the [standard development workflow](./01_spaceflights_tutorial.md#kedro-project-development-workflow). The steps are as follows: - -* Add datasets to your `data/` folder, according to [data engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention) -* Register the datasets with the Data Catalog in `conf/base/catalog.yml`, which is the registry of all data sources available for use by the project. This ensures that your code is reproducible when it references datasets in different locations and/or environments. - -You can find further information about [the Data Catalog](../05_data/01_data_catalog.md) in specific documentation covering advanced usage. - - -## Add your datasets to `data` - -The spaceflights tutorial makes use of fictional datasets of companies shuttling customers to the Moon and back. You will use the data to train a model to predict the price of shuttle hire. However, before you get to train the model, you will need to prepare the data for model building by creating a model input table. - -The spaceflight tutorial has three files and uses two data formats: `.csv` and `.xlsx`. Download and save the files to the `data/01_raw/` folder of your project directory: - -* [reviews.csv](https://quantumblacklabs.github.io/kedro/reviews.csv) -* [companies.csv](https://quantumblacklabs.github.io/kedro/companies.csv) -* [shuttles.xlsx](https://quantumblacklabs.github.io/kedro/shuttles.xlsx) - -Here are some examples of how you can [download the files from GitHub](https://www.quora.com/How-do-I-download-something-from-GitHub) to the `data/01_raw` directory inside your project: - -Using [cURL in a Unix terminal](https://curl.se/download.html): - -
-Click to expand - -```bash -# reviews -curl -o data/01_raw/reviews.csv https://quantumblacklabs.github.io/kedro/reviews.csv -# companies -curl -o data/01_raw/companies.csv https://quantumblacklabs.github.io/kedro/companies.csv -# shuttles -curl -o data/01_raw/shuttles.xlsx https://quantumblacklabs.github.io/kedro/shuttles.xlsx -``` -
- -Using [cURL for Windows](https://curl.se/windows/): - -
-Click to expand - -```bat -curl -o data\01_raw\reviews.csv https://quantumblacklabs.github.io/kedro/reviews.csv -curl -o data\01_raw\companies.csv https://quantumblacklabs.github.io/kedro/companies.csv -curl -o data\01_raw\shuttles.xlsx https://quantumblacklabs.github.io/kedro/shuttles.xlsx -``` -
- -Using [Wget in a Unix terminal](https://www.gnu.org/software/wget/): - -
-Click to expand - -```bash -# reviews -wget -O data/01_raw/reviews.csv https://quantumblacklabs.github.io/kedro/reviews.csv -# companies -wget -O data/01_raw/companies.csv https://quantumblacklabs.github.io/kedro/companies.csv -# shuttles -wget -O data/01_raw/shuttles.xlsx https://quantumblacklabs.github.io/kedro/shuttles.xlsx -``` -
- -Using [Wget for Windows](https://eternallybored.org/misc/wget/): - -
-Click to expand - -```bat -wget -O data\01_raw\reviews.csv https://quantumblacklabs.github.io/kedro/reviews.csv -wget -O data\01_raw\companies.csv https://quantumblacklabs.github.io/kedro/companies.csv -wget -O data\01_raw\shuttles.xlsx https://quantumblacklabs.github.io/kedro/shuttles.xlsx -``` -
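-
-If you prefer to stay in Python, the following sketch downloads the same three files using only the standard library (the URLs are the ones listed above; run it from the project root):
-
-```python
-# Download the tutorial data into data/01_raw/ with the Python standard library.
-from pathlib import Path
-from urllib.request import urlretrieve
-
-BASE_URL = "https://quantumblacklabs.github.io/kedro"
-Path("data/01_raw").mkdir(parents=True, exist_ok=True)
-for filename in ("reviews.csv", "companies.csv", "shuttles.xlsx"):
-    urlretrieve(f"{BASE_URL}/{filename}", f"data/01_raw/{filename}")
-```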
- -## Register the datasets - -You now need to register the datasets so they can be loaded by Kedro. All Kedro projects have a `conf/base/catalog.yml` file, and you register each dataset by adding a named entry into the `.yml` file. The entry should include the following: - -* File location (path) -* Parameters for the given dataset -* Type of data -* Versioning - -Kedro supports a number of different data types, and those supported can be found in the API documentation. Kedro uses [`fssspec`](https://filesystem-spec.readthedocs.io/en/latest/) to read data from a variety of data stores including local file systems, network file systems, cloud object stores and HDFS. - - -### `csv` - -For the spaceflights data, first register the `csv` datasets by adding this snippet to the end of the `conf/base/catalog.yml` file: - -```yaml -companies: - type: pandas.CSVDataSet - filepath: data/01_raw/companies.csv - -reviews: - type: pandas.CSVDataSet - filepath: data/01_raw/reviews.csv -``` - -To check whether Kedro can load the data correctly, open a `kedro ipython` session and run: - -```python -companies = catalog.load("companies") -companies.head() -``` - -The command loads the dataset named `companies` (as per top-level key in `catalog.yml`) from the underlying filepath `data/01_raw/companies.csv` into the variable `companies`, which is of type `pandas.DataFrame`. The `head` method from `pandas` then displays the first five rows of the DataFrame. - -When you have finished, close `ipython` session as follows: - -```python -exit() -``` - -### `xlsx` - -Now register the `xlsx` dataset by adding this snippet to the end of the `conf/base/catalog.yml` file: - -```yaml -shuttles: - type: pandas.ExcelDataSet - filepath: data/01_raw/shuttles.xlsx -``` - -To test that everything works as expected, load the dataset within a _new_ `kedro ipython` session and display its first five rows: - -```python -shuttles = catalog.load("shuttles") -shuttles.head() -``` -When you have finished, close `ipython` session as follows: - -```python -exit() -``` - -## Custom data - -Kedro supports a number of [datasets](/kedro.extras.datasets) out of the box, but you can also add support for any proprietary data format or filesystem in your pipeline. - -You can find further information about [how to add support for custom datasets](../07_extend_kedro/03_custom_datasets.md) in specific documentation covering advanced usage. diff --git a/docs/source/03_tutorial/04_create_pipelines.md b/docs/source/03_tutorial/04_create_pipelines.md deleted file mode 100644 index 259f5d2cf0..0000000000 --- a/docs/source/03_tutorial/04_create_pipelines.md +++ /dev/null @@ -1,619 +0,0 @@ -# Create a pipeline - -This section covers the third part of the [standard development workflow](./01_spaceflights_tutorial.md#kedro-project-development-workflow), and covers the following: - -* How to create each [node](../13_resources/02_glossary.md#node) required by the example -* How to set up a [pipeline](../13_resources/02_glossary.md#pipeline) - - -## Data processing pipeline - -You previously registered the raw datasets for your Kedro project, so you can now create nodes to pre-process two of the datasets, [companies.csv](https://github.com/quantumblacklabs/kedro-starters/blob/master/spaceflights/%7B%7B%20cookiecutter.repo_name%20%7D%7D/data/01_raw/companies.csv) and [shuttles.xlsx](https://github.com/quantumblacklabs/kedro-starters/blob/master/spaceflights/%7B%7B%20cookiecutter.repo_name%20%7D%7D/data/01_raw/shuttles.xlsx), to prepare the data for modelling. 
- -### Node functions - -Create a file `src/kedro_tutorial/pipelines/data_processing/nodes.py`, adding the subfolders too if necessary. - -Add the code below, which provides two functions (`preprocess_companies` and `preprocess_shuttles`) that each input a raw dataframe and output a dataframe containing pre-processed data: - -
-Click to expand - -```python -import pandas as pd - - -def _is_true(x): - return x == "t" - - -def _parse_percentage(x): - x = x.str.replace("%", "") - x = x.astype(float) / 100 - return x - - -def _parse_money(x): - x = x.str.replace("$", "").str.replace(",", "") - x = x.astype(float) - return x - - -def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame: - """Preprocesses the data for companies. - - Args: - companies: Raw data. - Returns: - Preprocessed data, with `company_rating` converted to a float and - `iata_approved` converted to boolean. - """ - companies["iata_approved"] = _is_true(companies["iata_approved"]) - companies["company_rating"] = _parse_percentage(companies["company_rating"]) - return companies - - -def preprocess_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame: - """Preprocesses the data for shuttles. - - Args: - shuttles: Raw data. - Returns: - Preprocessed data, with `price` converted to a float and `d_check_complete`, - `moon_clearance_complete` converted to boolean. - """ - shuttles["d_check_complete"] = _is_true(shuttles["d_check_complete"]) - shuttles["moon_clearance_complete"] = _is_true(shuttles["moon_clearance_complete"]) - shuttles["price"] = _parse_money(shuttles["price"]) - return shuttles -``` -
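-
-Before wiring these functions into a pipeline, you can sanity-check them in isolation. The snippet below is only a sketch: the two-row frame is made up and contains just the columns `preprocess_companies` touches.
-
-```python
-import pandas as pd
-
-from kedro_tutorial.pipelines.data_processing.nodes import preprocess_companies
-
-# Made-up sample containing only the columns the function transforms.
-sample = pd.DataFrame({"iata_approved": ["t", "f"], "company_rating": ["90%", "80%"]})
-print(preprocess_companies(sample))
-# iata_approved becomes a boolean; company_rating becomes a float between 0 and 1.
-```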
- -### Assemble nodes into the data processing pipeline - -The next steps are to create a [node](../13_resources/02_glossary.md#node) for each function, and to create a [modular pipeline](../13_resources/02_glossary.md#modular-pipeline) for data processing: - -Add the following to `src/kedro_tutorial/pipelines/data_processing/pipeline.py`, so the `create_pipeline()` function looks as follows: - -
-Click to expand - -```python -def create_pipeline(**kwargs): - return Pipeline( - [ - node( - func=preprocess_companies, - inputs="companies", - outputs="preprocessed_companies", - name="preprocess_companies_node", - ), - node( - func=preprocess_shuttles, - inputs="shuttles", - outputs="preprocessed_shuttles", - name="preprocess_shuttles_node", - ), - ] - ) -``` -
- -```eval_rst -.. note:: ``companies`` and ``shuttles`` refer to the datasets defined in ``conf/base/catalog.yml``. These are inputs to the ``preprocess_companies`` and ``preprocess_shuttles`` functions. The named node inputs (and outputs) are used by the pipeline to determine interdependencies between the nodes, and hence, their execution order. -``` - -Be sure to import `node`, and your functions by adding them to the beginning of `pipeline.py`: - -```python -from kedro.pipeline import Pipeline, node - -from .nodes import preprocess_companies, preprocess_shuttles -``` - -You should also create a file `src/kedro_tutorial/pipelines/data_processing/__init__.py` containing the following: - -```python -from .pipeline import create_pipeline # NOQA -``` -This file ensures that the `data_processing` folder is a Python package, in accordance with the [standard format for a modular pipeline](../06_nodes_and_pipelines/03_modular_pipelines.md#how-do-i-create-a-modular-pipeline). - -### Update the project pipeline - -Now update the project's pipeline in `src/kedro_tutorial/pipeline_registry.py` to add the [modular pipeline](../13_resources/02_glossary.md#modular-pipeline) for data processing: - -
-Click to expand - -```python -from typing import Dict - -from kedro.pipeline import Pipeline - -from kedro_tutorial.pipelines import data_processing as dp - - -def register_pipelines() -> Dict[str, Pipeline]: - """Register the project's pipeline. - - Returns: - A mapping from a pipeline name to a ``Pipeline`` object. - - """ - data_processing_pipeline = dp.create_pipeline() - - return { - "__default__": data_processing_pipeline, - "dp": data_processing_pipeline, - } -``` -
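-
-If you want to double-check what has been registered before running anything, the sketch below lists the node names in the `dp` pipeline. Run it somewhere the project package is importable, for example from a `kedro ipython` session:
-
-```python
-# Inspect the registered pipelines without triggering a run.
-from kedro_tutorial.pipeline_registry import register_pipelines
-
-pipelines = register_pipelines()
-print([node.name for node in pipelines["dp"].nodes])
-# e.g. ['preprocess_companies_node', 'preprocess_shuttles_node']
-```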
- -### Test the example - -Run the following command in your terminal window to test the node named `preprocess_companies_node`: - -```bash -kedro run --node=preprocess_companies_node -``` - -You should see output similar to the below: - -```bash -2019-08-19 10:44:33,112 - root - INFO - ** Kedro project kedro-tutorial -2019-08-19 10:44:33,123 - kedro.io.data_catalog - INFO - Loading data from `companies` (CSVDataSet)... -2019-08-19 10:44:33,161 - kedro.pipeline.node - INFO - Running node: preprocess_companies_node: preprocess_companies([companies]) -> [preprocessed_companies] -2019-08-19 10:44:33,206 - kedro.io.data_catalog - INFO - Saving data to `preprocess_companies_node` (MemoryDataSet)... -2019-08-19 10:44:33,471 - kedro.runner.sequential_runner - INFO - Completed 1 out of 1 tasks -2019-08-19 10:44:33,471 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. - -``` - -To test the entire data processing pipeline: - -```bash -kedro run -``` - -You should see output similar to the following: - -```bash -kedro run - -2019-08-19 10:50:39,950 - root - INFO - ** Kedro project kedro-tutorial -2019-08-19 10:50:39,957 - kedro.io.data_catalog - INFO - Loading data from `shuttles` (ExcelDataSet)... -2019-08-19 10:50:48,521 - kedro.pipeline.node - INFO - Running node: preprocess_shuttles_node: preprocess_shuttles([shuttles]) -> [preprocessed_shuttles] -2019-08-19 10:50:48,587 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_shuttles` (MemoryDataSet)... -2019-08-19 10:50:49,133 - kedro.runner.sequential_runner - INFO - Completed 1 out of 2 tasks -2019-08-19 10:50:49,133 - kedro.io.data_catalog - INFO - Loading data from `companies` (CSVDataSet)... -2019-08-19 10:50:49,168 - kedro.pipeline.node - INFO - Running node: preprocess_companies_node: preprocess_companies([companies]) -> [preprocessed_companies] -2019-08-19 10:50:49,212 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_companies` (MemoryDataSet)... -2019-08-19 10:50:49,458 - kedro.runner.sequential_runner - INFO - Completed 2 out of 2 tasks -2019-08-19 10:50:49,459 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. - -``` - - -### Persist pre-processed data - -The nodes above each output a new dataset (`preprocessed_companies` and `preprocessed_shuttles`). When Kedro ran the pipeline, it determined that neither datasets had been registered in the data catalog (`conf/base/catalog.yml`). If a dataset is not registered, Kedro stores it in memory as a Python object using the [MemoryDataSet](/kedro.io.MemoryDataSet) class. Once all nodes depending on it have been executed, the `MemoryDataSet` is cleared and its memory released by the Python garbage collector. - -You can persist the preprocessed data by adding the following to `conf/base/catalog.yml`: - -```yaml -preprocessed_companies: - type: pandas.CSVDataSet - filepath: data/02_intermediate/preprocessed_companies.csv - -preprocessed_shuttles: - type: pandas.CSVDataSet - filepath: data/02_intermediate/preprocessed_shuttles.csv -``` - -The code above declares explicitly that [pandas.CSVDataSet](/kedro.extras.datasets.pandas.CSVDataSet) should be used instead of [`MemoryDataSet`](/kedro.io.MemoryDataSet). - -The [Data Catalog](../13_resources/02_glossary.md#data-catalog) will take care of saving the datasets automatically (in this case as CSV data) to the path specified next time the pipeline is run. There is no need to change any code in your preprocessing functions to accommodate this change. 
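-
-After the next `kedro run`, one way to confirm that the intermediate files were written is to load them back through the catalog, for example from a `kedro ipython` session. This is a quick check, not a required step:
-
-```python
-# Persisted intermediate data can be reloaded by name, exactly like the raw datasets.
-preprocessed_companies = catalog.load("preprocessed_companies")
-preprocessed_companies.head()
-```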
- -In this tutorial, we chose `pandas.CSVDataSet` for its simplicity, but you can use any other available dataset implementation class, for example, a database table, cloud storage (like [AWS S3](https://aws.amazon.com/s3/), [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/), etc.) or others. If you cannot find the dataset implementation you need, you can implement your own [custom dataset](../07_extend_kedro/03_custom_datasets.md). - -### Extend the data processing pipeline - -The next step in the tutorial is to add another node for a function to join together the three dataframes into a single model input table. First, add the `create_model_input_table()` function from the snippet below to `src/kedro_tutorial/pipelines/data_processing/nodes.py`. - -
-Click to expand - -```python -def create_model_input_table( - shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame -) -> pd.DataFrame: - """Combines all data to create a model input table. - - Args: - shuttles: Preprocessed data for shuttles. - companies: Preprocessed data for companies. - reviews: Raw data for reviews. - Returns: - model input table. - - """ - rated_shuttles = shuttles.merge(reviews, left_on="id", right_on="shuttle_id") - model_input_table = rated_shuttles.merge( - companies, left_on="company_id", right_on="id" - ) - model_input_table = model_input_table.dropna() - return model_input_table -``` -
- - -Add the function to the data processing pipeline in `src/kedro_tutorial/pipelines/data_processing/pipeline.py` as a node: - -```python -node( - func=create_model_input_table, - inputs=["preprocessed_shuttles", "preprocessed_companies", "reviews"], - outputs="model_input_table", - name="create_model_input_table_node", -), -``` - -The code above informs Kedro that the function `create_model_input_table` should be called with the data loaded from datasets `preprocessed_shuttles`, `preprocessed_companies`, and `reviews` and the output should be saved to dataset `model_input_table`. - -Add an import statement for `create_model_input_table` at the top of the file: - -```python -from .nodes import create_model_input_table, preprocess_companies, preprocess_shuttles -``` - -If you want the model input table data to be saved to file rather than used in-memory, add an entry to `conf/base/catalog.yml`: - -```yaml -model_input_table: - type: pandas.CSVDataSet - filepath: data/03_primary/model_input_table.csv -``` - -### Test the example - -To test the progress of the example: - -```bash -kedro run -``` - -You should see output similar to the following: - -```bash -2019-08-19 10:55:47,534 - root - INFO - ** Kedro project kedro-tutorial -2019-08-19 10:55:47,541 - kedro.io.data_catalog - INFO - Loading data from `shuttles` (ExcelDataSet)... -2019-08-19 10:55:55,670 - kedro.pipeline.node - INFO - Running node: preprocess_shuttles_node: preprocess_shuttles([shuttles]) -> [preprocessed_shuttles] -2019-08-19 10:55:55,736 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_shuttles` (CSVDataSet)... -2019-08-19 10:55:56,284 - kedro.runner.sequential_runner - INFO - Completed 1 out of 3 tasks -2019-08-19 10:55:56,284 - kedro.io.data_catalog - INFO - Loading data from `companies` (CSVDataSet)... -2019-08-19 10:55:56,318 - kedro.pipeline.node - INFO - Running node: preprocess_companies_node: preprocess_companies([companies]) -> [preprocessed_companies] -2019-08-19 10:55:56,361 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_companies` (CSVDataSet)... -2019-08-19 10:55:56,610 - kedro.runner.sequential_runner - INFO - Completed 2 out of 3 tasks -2019-08-19 10:55:56,610 - kedro.io.data_catalog - INFO - Loading data from `preprocessed_shuttles` (CSVDataSet)... -2019-08-19 10:55:56,715 - kedro.io.data_catalog - INFO - Loading data from `preprocessed_companies` (CSVDataSet)... -2019-08-19 10:55:56,750 - kedro.io.data_catalog - INFO - Loading data from `reviews` (CSVDataSet)... -2019-08-19 10:55:56,812 - kedro.pipeline.node - INFO - Running node: create_model_input_table_node: create_model_input_table([preprocessed_companies,preprocessed_shuttles,reviews]) -> [model_input_table] -2019-08-19 10:55:58,679 - kedro.io.data_catalog - INFO - Saving data to `model_input_table` (CSVDataSet)... -2019-08-19 10:56:09,991 - kedro.runner.sequential_runner - INFO - Completed 3 out of 3 tasks -2019-08-19 10:56:09,991 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. -``` - - -## Data science pipeline - -We have created a modular pipeline for data processing, which merges three input datasets to create a model input table. Now we will create the data science pipeline for price prediction, which uses the [`LinearRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html) -implementation from the [scikit-learn](https://scikit-learn.org/stable/) library. 
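-
-If you have not used scikit-learn before, the model-fitting API the pipeline relies on is small. Below is a standalone sketch with made-up numbers; in the tutorial, the features and target come from the model input table:
-
-```python
-import numpy as np
-from sklearn.linear_model import LinearRegression
-
-X = np.array([[1.0], [2.0], [3.0]])  # made-up feature column
-y = np.array([10.0, 20.0, 30.0])  # made-up prices
-regressor = LinearRegression().fit(X, y)
-print(regressor.predict(np.array([[4.0]])))  # ~[40.]
-```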
- -### Update dependencies -We now need to add `scikit-learn` to the project's dependencies. This is a slightly different process from the initial change we made early in the tutorial. - -To **update** the project's dependencies, you should modify `src/requirements.in` to add the following. Note that you do not need to update ``src/requirements.txt`` as you did previously in the tutorial before you built the project's requirements with ``kedro build-reqs``: - -```text -scikit-learn==0.23.1 -``` - -Then, re-run `kedro install` with a flag telling Kedro to recompile the requirements: - -```bash -kedro install --build-reqs -``` - -You can find out more about [how to work with project dependencies](../04_kedro_project_setup/01_dependencies) in the Kedro project documentation. - -### Create a data science node - -Create a file `src/kedro_tutorial/pipelines/data_science/nodes.py`, adding the subfolders too if necessary. Add the following code to the file: - -
-Click to expand - -```python -import logging -from typing import Dict, Tuple - -import pandas as pd -from sklearn.linear_model import LinearRegression -from sklearn.metrics import r2_score -from sklearn.model_selection import train_test_split - - -def split_data(data: pd.DataFrame, parameters: Dict) -> Tuple: - """Splits data into features and targets training and test sets. - - Args: - data: Data containing features and target. - parameters: Parameters defined in parameters.yml. - Returns: - Split data. - """ - X = data[parameters["features"]] - y = data["price"] - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=parameters["test_size"], random_state=parameters["random_state"] - ) - return X_train, X_test, y_train, y_test - - -def train_model(X_train: pd.DataFrame, y_train: pd.Series) -> LinearRegression: - """Trains the linear regression model. - - Args: - X_train: Training data of independent features. - y_train: Training data for price. - - Returns: - Trained model. - """ - regressor = LinearRegression() - regressor.fit(X_train, y_train) - return regressor - - -def evaluate_model( - regressor: LinearRegression, X_test: pd.DataFrame, y_test: pd.Series -): - """Calculates and logs the coefficient of determination. - - Args: - regressor: Trained model. - X_test: Testing data of independent features. - y_test: Testing data for price. - """ - y_pred = regressor.predict(X_test) - score = r2_score(y_test, y_pred) - logger = logging.getLogger(__name__) - logger.info("Model has a coefficient R^2 of %.3f on test data.", score) -``` - -
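-
-As with the data processing nodes, these functions are plain Python and can be smoke-tested outside Kedro. The following sketch uses made-up, perfectly linear data; in the real run, Kedro supplies the model input table and the values from `parameters.yml`:
-
-```python
-import pandas as pd
-
-from kedro_tutorial.pipelines.data_science.nodes import evaluate_model, split_data, train_model
-
-toy = pd.DataFrame({"engines": range(10), "price": [2.0 * i + 1.0 for i in range(10)]})
-params = {"features": ["engines"], "test_size": 0.2, "random_state": 3}
-
-X_train, X_test, y_train, y_test = split_data(toy, params)
-regressor = train_model(X_train, y_train)
-evaluate_model(regressor, X_test, y_test)
-# Logs an R^2 of 1.0 on this synthetic data (logging must be configured at INFO level to see it).
-```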
- - -### Configure the input parameters - -Add the following to `conf/base/parameters.yml`: - -```yaml -test_size: 0.2 -random_state: 3 -features: - - engines - - passenger_capacity - - crew - - d_check_complete - - moon_clearance_complete - - iata_approved - - company_rating - - review_scores_rating -``` - -These are the parameters fed into the `DataCatalog` when the pipeline is executed. More information about [parameters](../04_kedro_project_setup/02_configuration.md#Parameters) is available in later documentation for advanced usage. Here, the parameters `test_size` and `random_state` are used as part of the train-test split, and `features` gives the names of columns in the model input table to use as features. - -### Register the dataset -The next step is to register the dataset that will save the trained model, by adding the following definition to `conf/base/catalog.yml`: - -```yaml -regressor: - type: pickle.PickleDataSet - filepath: data/06_models/regressor.pickle - versioned: true -``` - -Versioning is enabled for `regressor`, which means that the pickled output of the `regressor` will be versioned and saved every time the pipeline is run. This allows us to keep the history of the models built using this pipeline. Further details can be found in the [Versioning](../05_data/02_kedro_io.md#versioning) section. - -### Assemble the data science pipeline -To create a modular pipeline for the price prediction model, add the following to the top of `src/kedro_tutorial/pipelines/data_science/pipeline.py`: - -```python -from kedro.pipeline import Pipeline, node - -from .nodes import evaluate_model, split_data, train_model -``` - -And add the following pipeline definition to the same file: - -```python -def create_pipeline(**kwargs): - return Pipeline( - [ - node( - func=split_data, - inputs=["model_input_table", "parameters"], - outputs=["X_train", "X_test", "y_train", "y_test"], - name="split_data_node", - ), - node( - func=train_model, - inputs=["X_train", "y_train"], - outputs="regressor", - name="train_model_node", - ), - node( - func=evaluate_model, - inputs=["regressor", "X_test", "y_test"], - outputs=None, - name="evaluate_model_node", - ), - ] - ) -``` - -As with the data processing modular pipeline, you should ensure that the data science modular pipeline is a Python package. To do so, create the file `src/kedro_tutorial/pipelines/data_science/__init__.py` containing the following: - -```python -from .pipeline import create_pipeline # NOQA -``` - -### Update the project pipeline - -Add the data science pipeline to the project by replacing the code in `register_pipelines` in `src/kedro_tutorial/pipeline_registry.py` with the following: - -```python -def register_pipelines() -> Dict[str, Pipeline]: - """Register the project's pipeline. - - Returns: - A mapping from a pipeline name to a ``Pipeline`` object. - - """ - data_processing_pipeline = dp.create_pipeline() - data_science_pipeline = ds.create_pipeline() - - return { - "__default__": data_processing_pipeline + data_science_pipeline, - "dp": data_processing_pipeline, - "ds": data_science_pipeline, - } -``` - -Include the import at the top of the file: - -```python -from kedro_tutorial.pipelines import data_science as ds -``` - -The two modular pipelines are merged together into a project default pipeline by the `__default__` key used in `"__default__": data_processing_pipeline + data_science_pipeline`. 
-The `data_processing_pipeline` will preprocess the data, and `data_science_pipeline` will create features, train and evaluate the model. - -```eval_rst -.. note:: The order in which you add the pipelines together is not significant and ``data_science_pipeline + data_processing_pipeline`` will result in the same pipeline, since Kedro automatically detects the correct execution order for all the nodes in the resulting pipeline. -``` - -### Test the pipelines -Execute the default pipeline: - -```bash -kedro run -``` -You should see output similar to the following: - -
-Click to expand - -```bash -2019-08-19 10:51:46,501 - root - INFO - ** Kedro project kedro-tutorial -2019-08-19 10:51:46,510 - kedro.io.data_catalog - INFO - Loading data from `companies` (CSVDataSet)... -2019-08-19 10:51:46,547 - kedro.pipeline.node - INFO - Running node: preprocess_companies_node: preprocess_companies([companies]) -> [preprocessed_companies] -2019-08-19 10:51:46,597 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_companies` (CSVDataSet)... -2019-08-19 10:51:46,906 - kedro.runner.sequential_runner - INFO - Completed 1 out of 6 tasks -2019-08-19 10:51:46,906 - kedro.io.data_catalog - INFO - Loading data from `shuttles` (ExcelDataSet)... -2019-08-19 10:51:55,324 - kedro.pipeline.node - INFO - Running node: preprocess_shuttles_node: preprocess_shuttles([shuttles]) -> [preprocessed_shuttles] -2019-08-19 10:51:55,389 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_shuttles` (CSVDataSet)... -2019-08-19 10:51:55,932 - kedro.runner.sequential_runner - INFO - Completed 2 out of 6 tasks -2019-08-19 10:51:55,932 - kedro.io.data_catalog - INFO - Loading data from `preprocessed_shuttles` (CSVDataSet)... -2019-08-19 10:51:56,042 - kedro.io.data_catalog - INFO - Loading data from `preprocessed_companies` (CSVDataSet)... -2019-08-19 10:51:56,078 - kedro.io.data_catalog - INFO - Loading data from `reviews` (CSVDataSet)... -2019-08-19 10:51:56,139 - kedro.pipeline.node - INFO - Running node: create_model_input_table_node: create_model_input_table([preprocessed_companies,preprocessed_shuttles,reviews]) -> [model_input_table] -2019-08-19 10:51:58,037 - kedro.io.data_catalog - INFO - Saving data to `model_input_table` (CSVDataSet)... -2019-08-19 10:52:09,133 - kedro.runner.sequential_runner - INFO - Completed 3 out of 6 tasks -2019-08-19 10:52:09,133 - kedro.io.data_catalog - INFO - Loading data from `model_input_table` (CSVDataSet)... -2019-08-19 10:52:10,941 - kedro.io.data_catalog - INFO - Loading data from `parameters` (MemoryDataSet)... -2019-08-19 10:52:10,941 - kedro.pipeline.node - INFO - Running node: split_data_node: split_data([model_input_table,parameters]) -> [X_test,X_train,y_test,y_train] -2019-08-19 10:52:11,343 - kedro.io.data_catalog - INFO - Saving data to `X_train` (MemoryDataSet)... -2019-08-19 10:52:11,372 - kedro.io.data_catalog - INFO - Saving data to `X_test` (MemoryDataSet)... -2019-08-19 10:52:11,380 - kedro.io.data_catalog - INFO - Saving data to `y_train` (MemoryDataSet)... -2019-08-19 10:52:11,381 - kedro.io.data_catalog - INFO - Saving data to `y_test` (MemoryDataSet)... -2019-08-19 10:52:11,443 - kedro.runner.sequential_runner - INFO - Completed 4 out of 6 tasks -2019-08-19 10:52:11,443 - kedro.io.data_catalog - INFO - Loading data from `X_train` (MemoryDataSet)... -2019-08-19 10:52:11,472 - kedro.io.data_catalog - INFO - Loading data from `y_train` (MemoryDataSet)... -2019-08-19 10:52:11,474 - kedro.pipeline.node - INFO - Running node: train_model_node: train_model([X_train,y_train]) -> [regressor] -2019-08-19 10:52:11,704 - kedro.io.data_catalog - INFO - Saving data to `regressor` (PickleDataSet)... -2019-08-19 10:52:11,776 - kedro.runner.sequential_runner - INFO - Completed 5 out of 6 tasks -2019-08-19 10:52:11,776 - kedro.io.data_catalog - INFO - Loading data from `regressor` (PickleDataSet)... -2019-08-19 10:52:11,776 - kedro.io.data_catalog - INFO - Loading data from `X_test` (MemoryDataSet)... -2019-08-19 10:52:11,784 - kedro.io.data_catalog - INFO - Loading data from `y_test` (MemoryDataSet)... 
-2019-08-19 10:52:11,785 - kedro.pipeline.node - INFO - Running node: evaluate_model_node: evaluate_model([X_test,regressor,y_test]) -> None -2019-08-19 10:52:11,830 - kedro_tutorial.pipelines.data_science.nodes - INFO - Model has a coefficient R^2 of 0.462 on test data. -2019-08-19 10:52:11,869 - kedro.runner.sequential_runner - INFO - Completed 6 out of 6 tasks -2019-08-19 10:52:11,869 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. -``` -
- -## Kedro runners - -There are three different Kedro runners that can run the pipeline: - -* `SequentialRunner` - runs your nodes sequentially; once a node has completed its task then the next one starts. -* `ParallelRunner` - runs your nodes in parallel; independent nodes are able to run at the same time, which is more efficient when there are independent branches in your pipeline and allows you to take advantage of multiple CPU cores. -* `ThreadRunner` - runs your nodes in parallel, similarly to `ParallelRunner`, but uses multithreading instead of multiprocessing. - -By default, Kedro uses a `SequentialRunner`, which is instantiated when you execute `kedro run` from the command line. If you decide to use `ParallelRunner`, provide an additional flag when running the pipeline from the command line: - -```bash -kedro run --parallel -``` - -If you want to run using `ThreadRunner` or a custom runner, you can do so by running: - -```bash -kedro run --runner=ThreadRunner -kedro run --runner=module.path.to.my.runner -``` - -```eval_rst -.. note:: ``ParallelRunner`` performs task parallelisation, which is different from data parallelisation as seen in PySpark. -``` - -You can find out more about the runners Kedro provides, and how to create your own, in the [pipeline documentation about runners](../06_nodes_and_pipelines/04_run_a_pipeline.md). - -## Slice a pipeline - -In some cases you may want to run just part of a pipeline. For example, you may need to only run the data science pipeline to tune the hyperparameters of the price prediction model and skip data processing execution. You can 'slice' the pipeline and specify just the portion you want to run by using the `--pipeline` command line option. For example, to only run the pipeline named `ds` (as labelled in `register_pipelines`), execute the following command: - -```bash -kedro run --pipeline=ds -``` - -See the [pipeline slicing documentation](../06_nodes_and_pipelines/05_slice_a_pipeline.md) for other ways to run sections of your pipeline. - -```eval_rst -.. note:: To successfully run the pipeline, you need to make sure that all required input datasets already exist, otherwise you may get an error similar to this: -``` - -```bash -kedro run --pipeline=ds - -2019-10-04 12:36:12,135 - root - INFO - ** Kedro project kedro-tutorial -2019-10-04 12:36:12,158 - kedro.io.data_catalog - INFO - Loading data from `model_input_table` (CSVDataSet)... -2019-10-04 12:36:12,158 - kedro.runner.sequential_runner - WARNING - There are 3 nodes that have not run. -You can resume the pipeline run with the following command: -kedro run -Traceback (most recent call last): - ... - File "pandas/_libs/parsers.pyx", line 382, in pandas._libs.parsers.TextReader.__cinit__ - File "pandas/_libs/parsers.pyx", line 689, in pandas._libs.parsers.TextReader._setup_parser_source -FileNotFoundError: [Errno 2] File b'data/03_primary/model_input_table.csv' does not exist: b'data/03_primary/model_input_table.csv' - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - ... - raise DataSetError(message) from exc -kedro.io.core.DataSetError: Failed while loading data from data set CSVDataSet(filepath=data/03_primary/model_input_table.csv, save_args={'index': False}). 
-[Errno 2] File b'data/03_primary/model_input_table.csv' does not exist: b'data/03_primary/model_input_table.csv' -``` diff --git a/docs/source/03_tutorial/05_package_a_project.md b/docs/source/03_tutorial/05_package_a_project.md deleted file mode 100644 index 40defbac59..0000000000 --- a/docs/source/03_tutorial/05_package_a_project.md +++ /dev/null @@ -1,45 +0,0 @@ -# Package a project - -This section explains how to build your project documentation, and how to bundle your project into a Python package. - -## Add documentation to your project - -You can generate project-specific documentation by running `kedro build-docs` in the project's root directory. Kedro builds the resulting HTML files in `docs/build/html/`. To browse the documentation generated, open `docs/build/html/index.html` in your browser, or run `kedro build-docs --open` to automatically open the documentation after building. - -The `build-docs` command creates documentation based on the code structure of your project. Documentation includes any [`docstrings`](https://www.datacamp.com/community/tutorials/docstrings-python) defined in your code. - -Kedro uses the [Sphinx](https://www.sphinx-doc.org) framework, so if you want to customise your documentation, please refer to `docs/source/conf.py` and the [corresponding section of the Sphinx documentation](https://www.sphinx-doc.org/en/master/usage/configuration.html). - - -## Package your project - -To package your project, run the following in your project's root directory: - -```bash -kedro package -``` - -Kedro builds the package into the `dist/` folder of your project, and creates one `.egg` file and one `.whl` file, which are [Python packaging formats for binary distribution](https://packaging.python.org/). - -The resulting package only contains the Python source code of your Kedro pipeline, not any of the `conf/`, `data/` and `logs/` subfolders. This means that you can distribute the project to run elsewhere, such as on a separate computer with different configuration, data and logging. When distributed, the packaged project must be run from within a directory that contains the `conf/` subfolder (and `data/` and `logs/` if your pipeline loads/saves local data or uses logging). - -Recipients of the `.egg` and `.whl` files need to have Python and `pip` on their machines, but do not need to have Kedro installed. The project is installed to the root of a folder with the relevant `conf/`, `data/` and `logs/` subfolders, by navigating to the root and calling: - -```bash -pip install -``` - -For example, having installed project `kedro-spaceflights` and package `kedro_spaceflights`, a recipient can run the Kedro project as follows from the root of the project: - -```bash -python -m kedro_spaceflights.run -``` - -An executable, `kedro-spaceflights`, is also placed in the `bin/` subfolder of the Python installation location. - - -### Docker and Airflow - -We support the [Kedro-Docker](https://github.com/quantumblacklabs/kedro-docker) plugin for packaging and shipping Kedro projects within [Docker](https://www.docker.com/) containers. - -We also support [Kedro-Airflow](https://github.com/quantumblacklabs/kedro-airflow) to convert your Kedro project into an [Airflow](https://airflow.apache.org/) project. 
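-
-As a rough sketch of the Docker route, the workflow looks like the following. The `kedro docker` subcommands come from the Kedro-Docker plugin, so check the plugin's README for the exact options available in the version you install:
-
-```bash
-pip install kedro-docker
-kedro docker build   # build a Docker image for the current project
-kedro docker run     # execute `kedro run` inside the container
-```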
diff --git a/docs/source/03_tutorial/06_visualise_pipeline.md b/docs/source/03_tutorial/06_visualise_pipeline.md deleted file mode 100644 index 35482a73b3..0000000000 --- a/docs/source/03_tutorial/06_visualise_pipeline.md +++ /dev/null @@ -1,94 +0,0 @@ -# Visualise pipelines - -[Kedro-Viz](https://github.com/quantumblacklabs/kedro-viz) displays data and machine-learning pipelines in an informative way, emphasising the connections between datasets and nodes. It shows the structure of your Kedro pipeline. This exercise assumes that you have been following the [Spaceflights tutorial](01_spaceflights_tutorial.md). - -## Install Kedro-Viz - -You can install Kedro-Viz by running: -```bash -pip install kedro-viz -``` - -## Visualise a whole pipeline - -You should be in your project root directory, and once Kedro-Viz is installed you can visualise your pipeline by running: -```bash -kedro viz -``` - -This command will run a server on http://127.0.0.1:4141 that will open up your visualisation on a browser. You should - be able to see the following: - -![](../meta/images/pipeline_visualisation.png) - -If a visualisation panel opens up and a pipeline is not visible then please check that your [pipeline definition](04_create_pipelines.md) is complete. All other errors can be logged as GitHub Issues on the [Kedro-Viz repository](https://github.com/quantumblacklabs/kedro-viz). - -## Exit an open visualisation - -You exit this visualisation by closing the open browser and entering **Ctrl+C** or **Cmd+C** in your terminal. - -## Visualise layers - -A pipeline can be broken up into different layers according to how data is processed, and using a convention for layers makes it easier to collaborate. For example, the [data engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention) shown here labels datasets according to the stage of the pipeline (e.g. whether the data has been cleaned). - -Kedro-Viz makes it easy to visualise these data processing stages by adding a `layer` attribute to the datasets in the Data Catalog. We will be modifying `catalog.yml` with the following: - -```yaml -companies: - type: pandas.CSVDataSet - filepath: data/01_raw/companies.csv - layer: raw - -reviews: - type: pandas.CSVDataSet - filepath: data/01_raw/reviews.csv - layer: raw - -shuttles: - type: pandas.ExcelDataSet - filepath: data/01_raw/shuttles.xlsx - layer: raw - -preprocessed_companies: - type: pandas.CSVDataSet - filepath: data/02_intermediate/preprocessed_companies.csv - layer: intermediate - -preprocessed_shuttles: - type: pandas.CSVDataSet - filepath: data/02_intermediate/preprocessed_shuttles.csv - layer: intermediate - -model_input_table: - type: pandas.CSVDataSet - filepath: data/03_primary/model_input_table.csv - layer: primary - -regressor: - type: pickle.PickleDataSet - filepath: data/06_models/regressor.pickle - versioned: true - layer: models -``` - -Run kedro-viz again with `kedro viz` and observe how your visualisation has changed to indicate the layers: - -![](../meta/images/pipeline_visualisation_with_layers.png) - -## Share a pipeline - -Visualisations from Kedro-Viz are made shareable by using functionality that allows you to save the visualisation as a JSON file. - -To save a visualisation, run: -``` -kedro viz --save-file my_shareable_pipeline.json -``` - -This command will save a pipeline visualisation of your primary `__default__` pipeline as a JSON file called `my_shareable_pipeline.json`. 
- -To visualise a saved pipeline, run: -``` -kedro viz --load-file my_shareable_pipeline.json -``` - -And this will visualise the pipeline visualisation saved as `my_shareable_pipeline.json`. diff --git a/docs/source/04_kedro_project_setup/01_dependencies.md b/docs/source/04_kedro_project_setup/01_dependencies.md deleted file mode 100644 index e1a3eb7c8c..0000000000 --- a/docs/source/04_kedro_project_setup/01_dependencies.md +++ /dev/null @@ -1,81 +0,0 @@ -# Dependencies - -## Project-specific dependencies -When we introduced Kedro, we touched briefly on how to specify a project's dependencies to make it easier for others to run your project and avoid version conflicts downstream. - -You can add or remove dependencies. For a new project, edit `src/requirements.txt` and then run the following: - -```bash -kedro build-reqs -``` - -The `build-reqs` command will: - -1. Generate `src/requirements.in` from the contents of `src/requirements.txt` -2. [pip compile](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) the requirements listed in `src/requirements.in` -3. Regenerate `src/requirements.txt` to specify a list of pinned project dependencies (those with a strict version) - -```eval_rst -.. note:: ``src/requirements.in`` contains "source" requirements, while ``src/requirements.txt`` contains the compiled version of those and requires no manual updates. -``` - -To further update the project requirements, you should modify `src/requirements.in` (not `src/requirements.txt`) and re-run `kedro build-reqs`. - - -## `kedro install` - -To install the project-specific dependencies, navigate to the root directory of the project and run: - -```bash -kedro install -``` - -`kedro install` automatically compiles project dependencies by running `kedro build-reqs` behind the scenes if the `src/requirements.in` file doesn't exist. - -To skip the compilation step and install requirements as-is from `src/requirements.txt`, run the following: -```bash -kedro install --no-build-reqs -``` - -This takes the latest version of a dependency that is available within the range specified. It allows flexibility in the version of the dependency that `pip` installs. For example, if `ipython>=7.0.0,<8.0` is specified, then the most up-to-date version available is installed. - - -To force the compilation, even if `src/requirements.in` already exists, run the following: - -```bash -kedro install --build-reqs -``` - -In some cases, such as a production setting, this is useful to eliminate ambiguity and specify exactly the version of each dependency that is installed. - -## Workflow dependencies - -To install all of the dependencies recorded in Kedro's [`setup.py`](https://github.com/quantumblacklabs/kedro/blob/develop/setup.py) run: - -```bash -pip install "kedro[all]" -``` - -### Install dependencies related to the Data Catalog - -The [Data Catalog](../05_data/01_data_catalog.md) is your way of interacting with different data types in Kedro. The modular dependencies in this category include `pandas`, `numpy`, `pyspark`, `matplotlib`, `pillow`, `dask`, and more. - -#### Install dependencies at a group-level - -Data types are broken into groups e.g. `pandas`, `spark` and `pickle`. Each group has a collection of data types e.g.`pandas.CSVDataSet`, `pandas.ParquetDataSet` and more. You can install dependencies for an entire group of dependencies as follows: - -```bash -pip install "kedro[]" -``` - -This installs Kedro and dependencies related to the data type group. 
An example of this could be a workflow that depends on the data types in `pandas`. Run `pip install "kedro[pandas]"` to install Kedro and the dependencies for the data types in the [`pandas` group](https://github.com/quantumblacklabs/kedro/tree/develop/kedro/extras/datasets/pandas). - -#### Install dependencies at a type-level - -To limit installation to dependencies specific to a data type: - -```bash -pip install "kedro[.]" -``` - -For example, your workflow may require use of the `pandas.ExcelDataSet`, so to install its dependencies, run `pip install "kedro[pandas.ExcelDataSet]"`. diff --git a/docs/source/04_kedro_project_setup/02_configuration.md b/docs/source/04_kedro_project_setup/02_configuration.md deleted file mode 100644 index d4b5ae9ac5..0000000000 --- a/docs/source/04_kedro_project_setup/02_configuration.md +++ /dev/null @@ -1,376 +0,0 @@ -# Configuration - -This section contains detailed information about configuration, for which the relevant API documentation can be found in [kedro.config.ConfigLoader](/kedro.config.ConfigLoader). - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.18.0``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -## Configuration root - -We recommend that you keep all configuration files in the `conf` directory of a Kedro project. However, if you prefer, you may point Kedro to any other directory and change the configuration paths by setting the `CONF_ROOT` variable in `src//settings.py` as follows: -```python -CONF_ROOT = "new_conf" -``` - -## Local and base configuration environments - -Kedro-specific configuration (e.g., `DataCatalog` configuration for IO) is loaded using the `ConfigLoader` class: - -```python -from kedro.config import ConfigLoader - -conf_loader = ConfigLoader(conf_root="conf", env="local") -conf_catalog = conf_loader.get("catalog*", "catalog*/**") -``` - -This recursively scans for configuration files firstly in `conf/base/` (`base` being the default environment) and then in `conf/local/` (`local` being the designated overriding environment) directory according to the following rules: - -* *Either* of the following is true: - * filename starts with `catalog` - * file is located in a sub-directory whose name is prefixed with `catalog` -* *And* file extension is one of the following: `yaml`, `yml`, `json`, `ini`, `pickle`, `xml` or `properties` - -Configuration information from files stored in `base` or `local` that match these rules is merged at runtime and returned in the form of a config dictionary: - -* If any two configuration files located inside the same environment path (`conf/base/` or `conf/local/` in this example) contain the same top-level key, `load_config` will raise a `ValueError` indicating that the duplicates are not allowed. - -* If two configuration files have duplicate top-level keys but are in different environment paths (one in `conf/base/`, another in `conf/local/`, for example) then the last loaded path (`conf/local/` in this case) takes precedence and overrides that key value. `ConfigLoader.get` will not raise any errors, however a `DEBUG` level log message will be emitted with information on the overridden keys. - -Any top-level keys that start with `_` are considered hidden (or reserved) and are ignored after the config is loaded. Those keys will neither trigger a key duplication error nor appear in the resulting configuration dictionary. 
However, you may still use such keys, for example, as [YAML anchors and aliases](https://support.atlassian.com/bitbucket-cloud/docs/yaml-anchors/). - -## Additional configuration environments - -In addition to the two built-in local and base configuration environments, it is possible to create your own. Your project loads `conf/base/` as the bottom-level configuration environment but allows you to overwrite it with any other environments that you create such as `conf/server/` or `conf/test/`. Additional configuration environments are used by running the following command: - -```bash -kedro run --env=test -``` - -If no `env` option is specified, this will default to using the `local` environment to overwrite `conf/base`. - -If, for some reason, your project does not have any other environments apart from `base`, i.e. no `local` environment to default to, you will need to customise `KedroContext` to take `env="base"` in the constructor and then specify your custom `KedroContext` subclass in `src//settings.py` under the `CONTEXT_CLASS` key. - -If you set the `KEDRO_ENV` environment variable to the name of your environment, Kedro will load that environment for your `kedro run`, `kedro ipython`, `kedro jupyter notebook` and `kedro jupyter lab` sessions: - -```bash -export KEDRO_ENV=test -``` - -```eval_rst -.. note:: If you specify both the ``KEDRO_ENV`` environment variable and provide the ``--env`` argument to a CLI command, the CLI argument takes precedence. -``` - -## Template configuration - -Kedro also provides an extension [TemplatedConfigLoader](/kedro.config.TemplatedConfigLoader) class that allows you to template values in configuration files. To apply templating in your project, you will need to set the `CONFIG_LOADER_CLASS` constant in your `src//settings.py`: - -```python -from kedro.config import TemplatedConfigLoader # new import - -... -CONFIG_LOADER_CLASS = TemplatedConfigLoader -... -``` - -Let's assume the project contains a `conf/base/globals.yml` file with the following contents: - -```yaml -bucket_name: "my_s3_bucket" -key_prefix: "my/key/prefix/" - -datasets: - csv: "pandas.CSVDataSet" - spark: "spark.SparkDataSet" - -folders: - raw: "01_raw" - int: "02_intermediate" - pri: "03_primary" - fea: "04_feature" -``` - -The contents of the dictionary resulting from `globals_pattern` get merged with the `globals_dict` dictionary. In case of conflicts, the keys from the `globals_dict` dictionary take precedence. The resulting global dictionary prepared by `TemplatedConfigLoader` will look like this: - -```python -{ - "bucket_name": "another_bucket_name", - "non_string_key": 10, - "key_prefix": "my/key/prefix", - "datasets": {"csv": "pandas.CSVDataSet", "spark": "spark.SparkDataSet"}, - "folders": { - "raw": "01_raw", - "int": "02_intermediate", - "pri": "03_primary", - "fea": "04_feature", - }, -} -``` - -Now the templating can be applied to the configuration. Here is an example of a templated `conf/base/catalog.yml`: - -```yaml -raw_boat_data: - type: "${datasets.spark}" # nested paths into global dict are allowed - filepath: "s3a://${bucket_name}/${key_prefix}/${folders.raw}/boats.csv" - file_format: parquet - -raw_car_data: - type: "${datasets.csv}" - filepath: "s3://${bucket_name}/data/${key_prefix}/${folders.raw}/${filename|cars.csv}" # default to 'cars.csv' if the 'filename' key is not found in the global dict -``` - -Under the hood, `TemplatedConfigLoader` uses [`JMESPath` syntax](https://github.com/jmespath/jmespath.py) to extract elements from the globals dictionary. 
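For completeness, here is a minimal sketch of how the globals used above could be wired up in `settings.py`. This assumes your Kedro version supports passing loader arguments via `CONFIG_LOADER_ARGS` (check the release you are on). `globals_pattern` picks up files such as `conf/base/globals.yml`, while `globals_dict` supplies in-code overrides, which is where `another_bucket_name` and `non_string_key` in the merged dictionary above come from:

```python
# src/<package_name>/settings.py -- illustrative sketch
from kedro.config import TemplatedConfigLoader

CONFIG_LOADER_CLASS = TemplatedConfigLoader
CONFIG_LOADER_ARGS = {
    # files matching this pattern in conf/<env>/ provide the template values
    "globals_pattern": "*globals.yml",
    # keys declared here take precedence over those loaded via globals_pattern
    "globals_dict": {"bucket_name": "another_bucket_name", "non_string_key": 10},
}
```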
- -### Jinja2 support - -From version 0.17.0 `TemplateConfigLoader` also supports [Jinja2](https://palletsprojects.com/p/jinja/) template engine alongside the original template syntax. Below is an example of a `catalog.yml` file that uses both features: - -``` -{% for speed in ['fast', 'slow'] %} -{{ speed }}-trains: - type: MemoryDataSet - -{{ speed }}-cars: - type: pandas.CSVDataSet - filepath: s3://${bucket_name}/{{ speed }}-cars.csv - save_args: - index: true - -{% endfor %} -``` - -When parsing this configuration file, `TemplateConfigLoader` will: - -1. Read the `catalog.yml` and compile it using Jinja2 -2. Use a YAML parser to parse the compiled config into a Python dictionary -3. Expand `${bucket_name}` in `filepath` using the `globals_pattern` and `globals_dict` arguments for the `TemplateConfigLoader` instance as in the previous examples - -The output Python dictionary will look as follows: - -```python -{ - "fast-trains": {"type": "MemoryDataSet"}, - "fast-cars": { - "type": "pandas.CSVDataSet", - "filepath": "s3://my_s3_bucket/fast-cars.csv", - "save_args": {"index": True}, - }, - "slow-trains": {"type": "MemoryDataSet"}, - "slow-cars": { - "type": "pandas.CSVDataSet", - "filepath": "s3://my_s3_bucket/slow-cars.csv", - "save_args": {"index": True}, - }, -} -``` - -```eval_rst -.. warning:: Although Jinja2 is a very powerful and extremely flexible template engine, which comes with a wide range of features, we do not recommend using it to template your configuration unless absolutely necessary. The flexibility of dynamic configuration comes at a cost of significantly reduced readability and much higher maintenance overhead. We believe that, for the majority of analytics projects, dynamically compiled configuration does more harm than good. -``` - - -## Parameters - -### Load parameters - -Parameters project configuration can be loaded with the help of the `ConfigLoader` class: - -```python -from kedro.config import ConfigLoader - -conf_paths = ["conf/base", "conf/local"] -conf_loader = ConfigLoader(conf_paths) -parameters = conf_loader.get("parameters*", "parameters*/**") -``` - -This will load configuration files from `conf/base` and `conf/local` that have a filename starting with `parameters` or are located inside a folder with name starting with `parameters`. - -```eval_rst -.. note:: Since it is loaded after ``conf/base``, the configuration path ``conf/local`` takes precedence in the example above. Hence any overlapping top-level keys from ``conf/base`` will be overwritten by the ones from ``conf/local``. -``` - -Calling `conf_loader.get()` in the example above will throw a `MissingConfigException` error if there are no configuration files matching the given patterns in any of the specified paths. If this is a valid workflow for your application, you can handle it as follows: - -```python -from kedro.config import ConfigLoader, MissingConfigException - -conf_paths = ["conf/base", "conf/local"] -conf_loader = ConfigLoader(conf_paths) - -try: - parameters = conf_loader.get("parameters*", "parameters*/**") -except MissingConfigException: - parameters = {} -``` - -```eval_rst -.. note:: The ``kedro.framework.context.KedroContext`` class uses the approach above to load project parameters. -``` - -Parameters can then be used on their own or fed in as function inputs, as described [below](#use-parameters). - -### Specify parameters at runtime - -Kedro also allows you to specify runtime parameters for the `kedro run` CLI command. 
To do so, you need to use the `--params` command line option and specify a comma-separated list of key-value pairs that will be added to [KedroContext](/kedro.framework.context.KedroContext) parameters and made available to pipeline nodes. Each key-value pair is split on the first colon. For example: - -```bash -kedro run --params param_key1:value1,param_key2:2.0 # this will add {"param_key1": "value1", "param_key2": 2} to parameters dictionary -``` - -Values provided in the CLI take precedence and overwrite parameters specified in configuration files. Parameter keys are _always_ treated as strings. Parameter values are converted to a float or an integer number if the corresponding conversion succeeds; otherwise they are also treated as string. - -If any extra parameter key and/or value contains spaces, you should wrap the whole option contents in quotes: - -```bash -kedro run --params "key1:value with spaces,key2:value" -``` - -Since key-value pairs are split on the first colon, values can contain colons, but keys cannot. This is a valid CLI command: - -```bash -kedro run --params endpoint_url:https://endpoint.example.com -``` - -### Use parameters - -Say you have a set of parameters you're playing around with that specify modelling hyperparameters. You can declare these in one place, for instance `conf/base/parameters.yml`, so that you isolate your changes in one central location. - -```yaml -step_size: 1 -learning_rate: 0.01 -``` - - You may now reference these parameters in the `node` definition, using the `params:` prefix: - -```python -def increase_volume(volume, step): - return volume + step - - -# in pipeline definition -node( - func=increase_volume, - inputs=["input_volume", "params:step_size"], - outputs="output_volume", -) -``` - -You can also group your parameters into nested structures and, using the same method above, load them by top-level key: - -```yaml -step_size: 1 -model_params: - learning_rate: 0.01 - test_data_ratio: 0.2 - number_of_train_iterations: 10000 -``` - - -```python -def train_model(data, model): - lr = model["learning_rate"] - test_data_ratio = model["test_data_ratio"] - iterations = model["number_of_train_iterations"] - ... - - -# in pipeline definition -node( - func=train_model, - inputs=["input_data", "params:model_params"], - outputs="output_data", -) -``` - -Alternatively, you can also pass `parameters` to the node inputs and get access to the entire collection of values inside the node function. - -```python -def increase_volume(volume, params): - step = params["step_size"] - return volume + step - - -# in pipeline definition -node( - func=increase_volume, inputs=["input_volume", "parameters"], outputs="output_volume" -) -``` - -In both cases, under the hood parameters are added to the Data Catalog through the method `add_feed_dict()` in [`DataCatalog`](/kedro.io.DataCatalog), where they live as `MemoryDataSet`s. This method is also what the `KedroContext` class uses when instantiating the catalog. - -```eval_rst -.. note:: You can use ``add_feed_dict()`` to inject any other entries into your ``DataCatalog`` as per your use case. -``` - -## Credentials - -For security reasons, we strongly recommend *not* committing any credentials or other secrets to the Version Control System. Hence, by default any file inside the `conf/` folder (and its subfolders) containing `credentials` in its name will be ignored via `.gitignore` and not committed to your git repository. 
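For reference, the rules that implement this in the default project `.gitignore` look roughly like the following (a sketch; the exact patterns depend on the Kedro version that generated your project, so check your own `.gitignore`):

```
# ignore everything in conf/local/, which is where secrets normally live
conf/local/**

# ignore any configuration file with "credentials" in its name, in any environment
conf/**/*credentials*
```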
- -Credentials configuration can be loaded the same way as any other project configuration using the `ConfigLoader` class: - -```python -from kedro.config import ConfigLoader - -conf_paths = ["conf/base", "conf/local"] -conf_loader = ConfigLoader(conf_paths) -credentials = conf_loader.get("credentials*", "credentials*/**") -``` - -This will load configuration files from `conf/base` and `conf/local` that have filename starting with `credentials` or are located inside a folder with name starting with `credentials`. - -```eval_rst -.. note:: Since it is loaded after ``conf/base``, the configuration path ``conf/local`` takes precedence in the example above. Hence any overlapping top-level keys from ``conf/base`` will be overwritten by the ones from ``conf/local``. -``` - -Calling `conf_loader.get()` in the example above will throw a `MissingConfigException` error if there are no configuration files matching the given patterns in any of the specified paths. If this is a valid workflow for your application, you can handle it as follows: - -```python -from kedro.config import ConfigLoader, MissingConfigException - -conf_paths = ["conf/base", "conf/local"] -conf_loader = ConfigLoader(conf_paths) - -try: - credentials = conf_loader.get("credentials*", "credentials*/**") -except MissingConfigException: - credentials = {} -``` - -```eval_rst -.. note:: The ``kedro.framework.context.KedroContext`` class uses the approach above to load project credentials. -``` - -Credentials configuration can then be used on its own or [fed into the `DataCatalog`](../05_data/01_data_catalog.md#feeding-in-credentials). - -### AWS credentials - -When working with AWS credentials on datasets, you are not required to store AWS credentials in the project configuration files. Instead, you can specify them using environment variables `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and, optionally, `AWS_SESSION_TOKEN`. Please refer to the [official documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html) for more details. - -## Configure `kedro run` arguments - -An extensive list of CLI options for a `kedro run` is available in the [Kedro CLI documentation](../09_development/03_commands_reference.md#run-the-project). However, instead of specifying all the command line options in a `kedro run` via the CLI, you can specify a config file that contains the arguments, say `config.yml` and run: - -```console -$ kedro run --config config.yml -``` - -where `config.yml` is formatted as below (for example): - -```yaml -run: - tag: - - tag1 - - tag2 - - tag3 - pipeline: pipeline1 - parallel: true - node_names: - - node1 - - node2 - env: env1 -``` - -```eval_rst -.. note:: If you provide both a configuration file and a CLI option that clashes with the configuration file, the CLI option will take precedence. -``` diff --git a/docs/source/05_data/01_data_catalog.md b/docs/source/05_data/01_data_catalog.md deleted file mode 100644 index 9c96651075..0000000000 --- a/docs/source/05_data/01_data_catalog.md +++ /dev/null @@ -1,659 +0,0 @@ -# The Data Catalog - - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -This section introduces `catalog.yml`, the project-shareable Data Catalog. The file is located in `conf/base` and is a registry of all data sources available for use by a project; it manages loading and saving of data. 
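As a quick sketch of the idea before the detailed examples below (dataset, path and function names here are illustrative), a catalog entry maps a name to a dataset type and location, and nodes then refer to data purely by that name:

```yaml
# conf/base/catalog.yml
companies:
  type: pandas.CSVDataSet
  filepath: data/01_raw/companies.csv
```

```python
from kedro.pipeline import node

# the node never touches file paths directly; Kedro resolves "companies"
# and "preprocessed_companies" against the catalog (or memory) at run time
preprocess_node = node(
    func=preprocess_companies,  # assumed to be defined in your nodes module
    inputs="companies",
    outputs="preprocessed_companies",
)
```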
- -All supported data connectors are available in [`kedro.extras.datasets`](/kedro.extras.datasets). - -## Using the Data Catalog within Kedro configuration - -Kedro uses configuration to make your code reproducible when it has to reference datasets in different locations and/or in different environments. - -You can copy this file and reference additional locations for the same datasets. For instance, you can use the `catalog.yml` file in `conf/base/` to register the locations of datasets that would run in production while copying and updating a second version of `catalog.yml` in `conf/local/` to register the locations of sample datasets that you are using for prototyping your data pipeline(s). - -There is built-in functionality for `conf/local/` to overwrite `conf/base/` detailed [here](../04_kedro_project_setup/02_configuration.md). This means that a dataset called `cars` could exist in the `catalog.yml` files in `conf/base/` and `conf/local/`. In code, in `src`, you would only call a dataset named `cars` and Kedro would detect which definition of `cars` dataset to use to run your pipeline - `cars` definition from `conf/local/catalog.yml` would take precedence in this case. - -The Data Catalog also works with the `credentials.yml` in `conf/local/`, allowing you to specify usernames and passwords that are required to load certain datasets. - -The are two ways of defining a Data Catalog through the use of YAML configuration, or programmatically using an API. Both methods allow you to specify: - - - Dataset name - - Dataset type - - Location of the dataset using `fsspec`, detailed in the next section - - Credentials needed in order to access the dataset - - Load and saving arguments - - Whether or not you want a [dataset or ML model to be versioned](02_kedro_io.md#versioning) when you run your data pipeline - -## Specifying the location of the dataset - -Kedro relies on [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) for reading and saving data from a variety of data stores including local file systems, network file systems, cloud object stores, and Hadoop. When specifying a storage location in `filepath:`, you should provide a URL using the general form `protocol://path/to/data`. If no protocol is provided, the local file system is assumed (same as ``file://``). - -The following prepends are available: - -- **Local or Network File System**: `file://` - the local file system is default in the absence of any protocol, it also permits relative paths. -- **Hadoop File System (HDFS)**: `hdfs://user@server:port/path/to/data` - Hadoop Distributed File System, for resilient, replicated files within a cluster. -- **Amazon S3**: `s3://my-bucket-name/path/to/data` - Amazon S3 remote binary store, often used with Amazon EC2, - using the library s3fs. -- **S3 Compatible Storage**: `s3://my-bucket-name/path/_to/data` - e.g. Minio, using the s3fs library. -- **Google Cloud Storage**: `gcs://` - Google Cloud Storage, typically used with Google Compute - resource using gcsfs (in development). -- **Azure Blob Storage / Azure Data Lake Storage Gen2**: `abfs://` - Azure Blob Storage, typically used when working on an Azure environment. -- **HTTP(s)**: ``http://`` or ``https://`` for reading data directly from HTTP web servers. - -`fsspec` also provides other file systems, such as SSH, FTP and WebHDFS. See the [documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#implementations) for more information. 
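To make the environment-override behaviour described above concrete, here is a minimal sketch (bucket and file names are illustrative): the same dataset name is declared in both environments, and the `conf/local/` definition takes precedence when both are present:

```yaml
# conf/base/catalog.yml -- the "production" location
cars:
  type: pandas.CSVDataSet
  filepath: s3://my-production-bucket/01_raw/cars.csv
```

```yaml
# conf/local/catalog.yml -- a local sample used while prototyping; overrides the entry above
cars:
  type: pandas.CSVDataSet
  filepath: data/01_raw/cars_sample.csv
```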
- -## Data Catalog `*_args` parameters - -Data Catalog accepts two different groups of `*_args` parameters that serve different purposes: -- `fs_args` -- `load_args` and `save_args` - -The `fs_args` is used to configure the interaction with a filesystem. -All the top-level parameters of `fs_args` (except `open_args_load` and `open_args_save`) will be passed in an underlying filesystem class. - -Example 1: Provide the `project` value to the underlying filesystem class (`GCSFileSystem`) to interact with Google Cloud Storage (GCS) - -```yaml -test_dataset: - type: ... - fs_args: - project: test_project -``` - -The `open_args_load` and `open_args_save` parameters are passed to the filesystem's `open` method to configure how a dataset file (on a specific filesystem) is opened during a load or save operation, respectively. - -Example 2: Load data from a local binary file using `utf-8` encoding - -```yaml -test_dataset: - type: ... - fs_args: - open_args_load: - mode: "rb" - encoding: "utf-8" -``` - -`load_args` and `save_args` configure how a third-party library (e.g. `pandas` for `CSVDataSet`) loads/saves data from/to a file. - -Example 3: Save data to a CSV file without row names (index) using `utf-8` encoding - -```yaml -test_dataset: - type: pandas.CSVDataSet - ... - save_args: - index: False - encoding: "utf-8" -``` - -## Using the Data Catalog with the YAML API - -The YAML API allows you to configure your datasets in a YAML configuration file, `conf/base/catalog.yml` or `conf/local/catalog.yml`. - -Here are some examples of data configuration in a `catalog.yml`: - -Example 1: Loads / saves a CSV file from / to a local file system - -```yaml -bikes: - type: pandas.CSVDataSet - filepath: data/01_raw/bikes.csv -``` - -Example 2: Loads and saves a CSV on a local file system, using specified load and save arguments - -```yaml -cars: - type: pandas.CSVDataSet - filepath: data/01_raw/company/cars.csv - load_args: - sep: ',' - save_args: - index: False - date_format: '%Y-%m-%d %H:%M' - decimal: . 
- -``` - -Example 3: Loads and saves a compressed CSV on a local file system - -```yaml -boats: - type: pandas.CSVDataSet - filepath: data/01_raw/company/boats.csv.gz - load_args: - sep: ',' - compression: 'gzip' - fs_args: - open_args_load: - mode: 'rb' -``` - -Example 4: Loads a CSV file from a specific S3 bucket, using credentials and load arguments - -```yaml -motorbikes: - type: pandas.CSVDataSet - filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.csv - credentials: dev_s3 - load_args: - sep: ',' - skiprows: 5 - skipfooter: 1 - na_values: ['#NA', NA] -``` - -Example 5: Loads / saves a pickle file from / to a local file system - -```yaml -airplanes: - type: pickle.PickleDataSet - filepath: data/06_models/airplanes.pkl - backend: pickle -``` - -Example 6: Loads an excel file from Google Cloud Storage - -```yaml -rockets: - type: pandas.ExcelDataSet - filepath: gcs://your_bucket/data/02_intermediate/company/motorbikes.xlsx - fs_args: - project: my-project - credentials: my_gcp_credentials - save_args: - sheet_name: Sheet1 -``` - -Example 7: Save an image created with Matplotlib on Google Cloud Storage - -```yaml -results_plot: - type: matplotlib.MatplotlibWriter - filepath: gcs://your_bucket/data/08_results/plots/output_1.jpeg - fs_args: - project: my-project - credentials: my_gcp_credentials -``` - -Example 8: Loads / saves an HDF file on local file system storage, using specified load and save arguments - -```yaml -skateboards: - type: pandas.HDFDataSet - filepath: data/02_intermediate/skateboards.hdf - key: name - load_args: - columns: [brand, length] - save_args: - mode: w # Overwrite even when the file already exists - dropna: True -``` - -Example 9: Loads / saves a parquet file on local file system storage, using specified load and save arguments - -```yaml -trucks: - type: pandas.ParquetDataSet - filepath: data/02_intermediate/trucks.parquet - load_args: - columns: [name, gear, disp, wt] - categories: list - index: name - save_args: - compression: GZIP - file_scheme: hive - has_nulls: False - partition_on: [name] -``` - -Example 10: Load / saves a Spark table on S3, using specified load and save arguments - -```yaml -weather: - type: spark.SparkDataSet - filepath: s3a://your_bucket/data/01_raw/weather* - credentials: dev_s3 - file_format: csv - load_args: - header: True - inferSchema: True - save_args: - sep: '|' - header: True -``` - -Example 11: Loads / saves a SQL table using credentials, a database connection, using specified load and save arguments - -```yaml -scooters: - type: pandas.SQLTableDataSet - credentials: scooters_credentials - table_name: scooters - load_args: - index_col: [name] - columns: [name, gear] - save_args: - if_exists: replace -``` - -Example 12: Load a SQL table with credentials, a database connection, and applies a SQL query to the table - -```yaml -scooters_query: - type: pandas.SQLQueryDataSet - credentials: scooters_credentials - sql: select * from cars where gear=4 - load_args: - index_col: [name] -``` - -Example 13: Load data from an API endpoint, example US corn yield data from USDA - -```yaml -us_corn_yield_data: - type: api.APIDataSet - url: https://quickstats.nass.usda.gov - params: - key: SOME_TOKEN - format: JSON - commodity_desc: CORN - statisticcat_des: YIELD - agg_level_desc: STATE - year: 2000 -``` - -When using [`pandas.SQLTableDataSet`](/kedro.extras.datasets.pandas.SQLTableDataSet) or [`pandas.SQLQueryDataSet`](/kedro.extras.datasets.pandas.SQLQueryDataSet) you must provide a database connection string. 
In the example above we pass it using `scooters_credentials` key from the credentials (see the details in [Feeding in credentials](#feeding-in-credentials) section below). `scooters_credentials` must have a top-level key `con` containing [SQLAlchemy compatible](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) connection string. As an alternative to credentials, you could explicitly put `con` into `load_args` and `save_args` (`pandas.SQLTableDataSet` only). - -Example 14: Loading data from Minio (S3 API Compatible Storage) - -```yaml -test: - type: pandas.CSVDataSet - filepath: s3://your_bucket/test.csv # assume `test.csv` is uploaded to the Minio server. - credentials: dev_minio -``` -In `credentials.yml`, define the `key`, `secret` and the `endpoint_url` as follows: - -```yaml -dev_minio: - key: token - secret: key - client_kwargs: - endpoint_url : 'http://localhost:9000' -``` - -```eval_rst -.. note:: The easiest way to setup MinIO is to run a Docker image. After the following command, you can access to Minio server with ``http://localhost:9000`` and create a bucket and add files as if it is on S3. -``` - -`docker run -p 9000:9000 -e "MINIO_ACCESS_KEY=token" -e "MINIO_SECRET_KEY=key" minio/minio server /data` - -Example 15: Loading a model saved as a pickle from Azure Blob Storage - -```yaml -ml_model: - type: pickle.PickleDataSet - filepath: "abfs://models/ml_models.pickle" - versioned: True - credentials: dev_abs -``` -In `credentials.yml`, define the `account_name` and `account_key` as follows: - -```yaml -dev_abs: - account_name: accountname - account_key: key -``` - -## Creating a Data Catalog YAML configuration file via CLI - -You can use [`kedro catalog create` command](../09_development/03_commands_reference.md#create-a-data-catalog-yaml-configuration-file) to create a Data Catalog YAML configuration. - -It creates a `//catalog/.yml` configuration file with `MemoryDataSet` datasets for each dataset in a registered pipeline if it is missing from the `DataCatalog`. - -```yaml -# //catalog/.yml -rockets: - type: MemoryDataSet -scooters: - type: MemoryDataSet -``` - -## Adding parameters - -You can [configure parameters](../04_kedro_project_setup/02_configuration.md#load-parameters) for your project and [reference them](../04_kedro_project_setup/02_configuration.md#use-parameters) in your nodes. Do this using the `add_feed_dict()` method ([API documentation](/kedro.io.DataCatalog)). You can use this method to add any other entry / metadata you wish on the `DataCatalog`. - - -## Feeding in credentials - -Before instantiating the `DataCatalog` Kedro will first attempt to read the credentials from [the project configuration](../04_kedro_project_setup/02_configuration.md#aws-credentials). The resulting dictionary is then passed into `DataCatalog.from_config()` as the `credentials` argument. - -Let's assume that the project contains the file `conf/local/credentials.yml` with the following contents: - -```yaml -dev_s3: - client_kwargs: - aws_access_key_id: key - aws_secret_access_key: secret - -scooters_credentials: - con: sqlite:///kedro.db - -my_gcp_credentials: - id_token: key -``` - -In the example above `catalog.yml` contains references to credentials keys `dev_s3` and `scooters_credentials`. It means that when instantiating `motorbikes` dataset, for example, the `DataCatalog` will attempt to read top-level key `dev_s3` from the received `credentials` dictionary, and then will pass its values into the dataset `__init__` as `credentials` argument. 
This is essentially equivalent to calling this: - -```python -CSVDataSet( - filepath="s3://test_bucket/data/02_intermediate/company/motorbikes.csv", - load_args=dict(sep=",", skiprows=5, skipfooter=1, na_values=["#NA", "NA"]), - credentials=dict(key="token", secret="key"), -) -``` - - -## Loading multiple datasets that have similar configuration - -You may encounter situations where your datasets use the same file format, load and save arguments, and are stored in the same folder. YAML has a [built-in syntax](https://yaml.org/spec/1.2/spec.html#id2765878) for factorising parts of a YAML file, which means that you can decide what is generalisable across your datasets so that you do not have to spend time copying and pasting dataset configurations in `catalog.yml`. - -You can see this in the following example: - -```yaml -_csv: &csv - type: spark.SparkDataSet - file_format: csv - load_args: - sep: ',' - na_values: ['#NA', NA] - header: True - inferSchema: False - -cars: - <<: *csv - filepath: s3a://data/01_raw/cars.csv - -trucks: - <<: *csv - filepath: s3a://data/01_raw/trucks.csv - -bikes: - <<: *csv - filepath: s3a://data/01_raw/bikes.csv - load_args: - header: False -``` - -The syntax `&csv` names the following block `csv` and the syntax `<<: *csv` inserts the contents of the block named `csv`. Locally declared keys entirely override inserted ones as seen in `bikes`. - -```eval_rst -.. note:: It's important that the name of the template entry starts with a ``_`` so Kedro knows not to try and instantiate it as a dataset. -``` - -You can also nest reuseable YAML syntax: - -```yaml -_csv: &csv - type: spark.SparkDataSet - file_format: csv - load_args: &csv_load_args - header: True - inferSchema: False - -airplanes: - <<: *csv - filepath: s3a://data/01_raw/airplanes.csv - load_args: - <<: *csv_load_args - sep: ; -``` - -In this example the default `csv` configuration is inserted into `airplanes` and then the `load_args` block is overridden. Normally that would replace the whole dictionary. In order to extend `load_args` the defaults for that block are then re-inserted. - - -## Transcoding datasets - -You may come across a situation where you would like to read the same file using two different dataset implementations. Use transcoding when you want to load and save the same file, via its specified `filepath`, using different `DataSet` implementations. - -### A typical example of transcoding - -For instance, parquet files can not only be loaded via the `ParquetDataSet` using `pandas`, but also directly by `SparkDataSet`. This conversion is typical when coordinating a `Spark` to `pandas` workflow. - -To enable transcoding, define two `DataCatalog` entries for the same dataset in a common format (Parquet, JSON, CSV, etc.) in your `conf/base/catalog.yml`: - -```yaml -my_dataframe@spark: - type: spark.SparkDataSet - filepath: data/02_intermediate/data.parquet - file_format: parquet - -my_dataframe@pandas: - type: pandas.ParquetDataSet - filepath: data/02_intermediate/data.parquet -``` - -These entries are used in the pipeline like this: - -```python -Pipeline( - [ - node(func=my_func1, inputs="spark_input", outputs="my_dataframe@spark"), - node(func=my_func2, inputs="my_dataframe@pandas", outputs="pipeline_output"), - ] -) -``` - -### How does transcoding work? - -In this example, Kedro understands that `my_dataframe` is the same dataset in its `spark.SparkDataSet` and `pandas.ParquetDataSet` formats and helps resolve the node execution order. 
- -In the pipeline, Kedro uses the `spark.SparkDataSet` implementation for saving and `pandas.ParquetDataSet` -for loading, so the first node should output a `pyspark.sql.DataFrame`, while the second node would receive a `pandas.Dataframe`. - - -## Transforming datasets - -Transformers are used to intercept the load and save operations on Kedro `DataSet`s. Use cases for transformers include: - - - Data validation - - Tracking operation performance - - Data format conversion (although we would recommend [Transcoding](#transcoding-datasets) for this) - -### Applying built-in transformers - -Here we cover the use case of _tracking operation performance_ by applying built-in transformers to monitor the latency of load and save operations. - -Transformers are applied at the `DataCatalog` level. To apply the built-in `ProfileTimeTransformer`, you need to: - -1. Navigate to `src//hooks.py` -2. Apply `ProfileTimeTransformer` in the hook implementation `TransformerHooks.after_catalog_created` -3. Register the hook in your `src//settings.py` - -```python -# src//hooks.py - -from kedro.extras.transformers import ProfileTimeTransformer # new import -from kedro.framework.hooks import hook_impl # new import -from kedro.io import DataCatalog # new import - - -class TransformerHooks: - @hook_impl - def after_catalog_created(self, catalog: DataCatalog) -> None: - catalog.add_transformer(ProfileTimeTransformer()) -``` - -```python -# src//settings.py -from .hooks import TransformerHooks - -HOOKS = (TransformerHooks(),) -``` - -Once complete, rerun the pipeline from the terminal and you should see the following logging output: - -```console -$ kedro run - -... -2019-11-13 15:09:01,784 - kedro.io.data_catalog - INFO - Loading data from `companies` (CSVDataSet)... -2019-11-13 15:09:01,827 - ProfileTimeTransformer - INFO - Loading companies took 0.043 seconds -2019-11-13 15:09:01,828 - kedro.pipeline.node - INFO - Running node: preprocessing_companies: preprocess_companies([companies]) -> [preprocessed_companies] -2019-11-13 15:09:01,880 - kedro_tutorial.nodes.data_engineering - INFO - Running 'preprocess_companies' took 0.05 seconds -2019-11-13 15:09:01,880 - kedro_tutorial.nodes.data_engineering - INFO - Running 'preprocess_companies' took 0.05 seconds -2019-11-13 15:09:01,880 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_companies` (CSVDataSet)... -2019-11-13 15:09:02,112 - ProfileTimeTransformer - INFO - Saving preprocessed_companies took 0.232 seconds -2019-11-13 15:09:02,113 - kedro.runner.sequential_runner - INFO - Completed 1 out of 6 tasks -... -``` - -The `ProfileTimeTransformer - INFO` log messages report the latency of dataset load and save operations. - -### Transformer scope -You can refine the scope of the transformer by specifying an optional list of the datasets it is applied to in `add_transformer`. - -For example, the command `catalog.add_transformer(profile_time, ["dataset1", "dataset2"])` applies the `profile_time` transformer _only_ to the datasets named `dataset1` and `dataset2`. - -This is useful when you need to apply a transformer to just a subset of datasets. - -## Versioning datasets and ML models - -Making a simple addition to your Data Catalog allows you to perform versioning of datasets and machine learning models. 
- -Consider the following versioned dataset defined in the `catalog.yml`: - -```yaml -cars.csv: - type: pandas.CSVDataSet - filepath: data/01_raw/company/cars.csv - versioned: True -``` - -The `DataCatalog` will create a versioned `CSVDataSet` called `cars.csv`. The actual csv file location will look like `data/01_raw/company/cars.csv//cars.csv`, where `` corresponds to a global save version string formatted as `YYYY-MM-DDThh.mm.ss.sssZ`. - -You can run the pipeline with a particular versioned data set with `--load-version` flag as follows: - -```bash -kedro run --load-version="cars.csv:YYYY-MM-DDThh.mm.ss.sssZ" -``` -where `--load-version` is dataset name and version timestamp separated by `:`. - -This section shows just the very basics of versioning, which is described further in the documentation about [Kedro IO](../05_data/02_kedro_io.md#versioning). - -## Using the Data Catalog with the Code API - -The code API allows you to: - -* configure data sources in code -* operate the IO module within notebooks - -### Configuring a Data Catalog - -In a file like `catalog.py`, you can construct a `DataCatalog` object programmatically. In the following, we are using a number of pre-built data loaders documented in the [API reference documentation](/kedro.extras.datasets). - -```python -from kedro.io import DataCatalog -from kedro.extras.datasets.pandas import ( - CSVDataSet, - SQLTableDataSet, - SQLQueryDataSet, - ParquetDataSet, -) - -io = DataCatalog( - { - "bikes": CSVDataSet(filepath="../data/01_raw/bikes.csv"), - "cars": CSVDataSet(filepath="../data/01_raw/cars.csv", load_args=dict(sep=",")), - "cars_table": SQLTableDataSet( - table_name="cars", credentials=dict(con="sqlite:///kedro.db") - ), - "scooters_query": SQLQueryDataSet( - sql="select * from cars where gear=4", - credentials=dict(con="sqlite:///kedro.db"), - ), - "ranked": ParquetDataSet(filepath="ranked.parquet"), - } -) -``` - -When using `SQLTableDataSet` or `SQLQueryDataSet` you must provide a `con` key containing [SQLAlchemy compatible](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) database connection string. In the example above we pass it as part of `credentials` argument. Alternative to `credentials` is to put `con` into `load_args` and `save_args` (`SQLTableDataSet` only). - -### Loading datasets - -You can access each dataset by its name. - -```python -cars = io.load("cars") # data is now loaded as a DataFrame in 'cars' -gear = cars["gear"].values -``` - -#### Behind the scenes - -The following steps happened behind the scenes when `load` was called: - -- The value `cars` was located in the Data Catalog -- The corresponding `AbstractDataSet` object was retrieved -- The `load` method of this dataset was called -- This `load` method delegated the loading to the underlying pandas `read_csv` function - -### Viewing the available data sources - -If you forget what data was assigned, you can always review the `DataCatalog`. - -```python -io.list() -``` - -### Saving data - -You can save data using an API similar to that used to load data. - -```eval_rst -.. caution:: This use is not recommended unless you are prototyping in notebooks. -``` - -#### Saving data to memory - -```python -from kedro.io import MemoryDataSet - -memory = MemoryDataSet(data=None) -io.add("cars_cache", memory) -io.save("cars_cache", "Memory can store anything.") -io.load("car_cache") -``` - -#### Saving data to a SQL database for querying - -At this point we may want to put the data in a SQLite database to run queries on it. 
Let's use that to rank scooters by their mpg. - -```python -import os - -# This cleans up the database in case it exists at this point -try: - os.remove("kedro.db") -except FileNotFoundError: - pass - -io.save("cars_table", cars) -ranked = io.load("scooters_query")[["brand", "mpg"]] -``` - -#### Saving data in Parquet - -Finally we can save the processed data in Parquet format. - -```python -io.save("ranked", ranked) -``` - -```eval_rst -.. attention:: Saving ``None`` to a dataset is not allowed! -``` diff --git a/docs/source/05_data/02_kedro_io.md b/docs/source/05_data/02_kedro_io.md deleted file mode 100644 index 865e96c742..0000000000 --- a/docs/source/05_data/02_kedro_io.md +++ /dev/null @@ -1,645 +0,0 @@ -# Kedro IO - - -In this tutorial, we cover advanced uses of the [Kedro IO](/kedro.io.rst) module to understand the underlying implementation. The relevant API documentation is [kedro.io.AbstractDataSet](/kedro.io.AbstractDataSet) and [kedro.io.DataSetError](/kedro.io.DataSetError). - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -## Error handling - -We have custom exceptions for the main classes of errors that you can handle to deal with failures. - -```python -from kedro.io import * -``` - -```python -io = DataCatalog(data_sets=dict()) # empty catalog - -try: - cars_df = io.load("cars") -except DataSetError: - print("Error raised.") -``` - - -## AbstractDataSet - -To understand what is going on behind the scenes, you should study the [AbstractDataSet interface](/kedro.io.AbstractDataSet). `AbstractDataSet` is the underlying interface that all datasets extend. It requires subclasses to override the `_load` and `_save` and provides `load` and `save` methods that enrich the corresponding private methods with uniform error handling. It also requires subclasses to override `_describe`, which is used in logging the internal information about the instances of your custom `AbstractDataSet` implementation. - -If you have a dataset called `parts`, you can make direct calls to it like so: - -```python -parts_df = parts.load() -``` - -However, we recommend using a `DataCatalog` instead (for more details, see [this section](../05_data/01_data_catalog.md) in the User Guide) as it has been designed to make all datasets available to project members. - -For contributors, if you would like to submit a new dataset, you will have to extend `AbstractDataSet`. For a complete guide, please read [Creating a new dataset](../07_extend_kedro/03_custom_datasets.md). - - -## Versioning - -In order to enable versioning, you need to update the `catalog.yml` config file and set the `versioned` attribute to `true` for the given dataset. If this is a custom dataset, the implementation must also: - 1. extend `kedro.io.core.AbstractVersionedDataSet` AND - 2. add `version` namedtuple as an argument to its `__init__` method AND - 3. call `super().__init__()` with positional arguments `filepath`, `version`, and, optionally, with `glob` and `exists` functions if it uses a non-local filesystem (see [kedro.extras.datasets.pandas.CSVDataSet](/kedro.extras.datasets.pandas.CSVDataSet) as an example) AND - 4. modify its `_describe`, `_load` and `_save` methods respectively to support versioning (see [`kedro.extras.datasets.pandas.CSVDataSet`](/kedro.extras.datasets.pandas.CSVDataSet) for an example implementation) - -```eval_rst -.. 
note:: If a new version of a dataset is created mid-run, for instance by an external system adding new files, it will not interfere in the current run, i.e. the load version stays the same throughout subsequent loads. -``` - -An example dataset could look similar to the below: - -```python -from pathlib import Path, PurePosixPath - -import pandas as pd - -from kedro.io import AbstractVersionedDataSet - - -class MyOwnDataSet(AbstractVersionedDataSet): - def __init__(self, filepath, version, param1, param2=True): - super().__init__(PurePosixPath(filepath), version) - self._param1 = param1 - self._param2 = param2 - - def _load(self) -> pd.DataFrame: - load_path = self._get_load_path() - return pd.read_csv(load_path) - - def _save(self, df: pd.DataFrame) -> None: - save_path = self._get_save_path() - df.to_csv(save_path) - - def _exists(self) -> bool: - path = self._get_load_path() - return Path(path).exists() - - def _describe(self): - return dict(version=self._version, param1=self._param1, param2=self._param2) -``` - -With `catalog.yml` specifying: - -```yaml -my_dataset: - type: .MyOwnDataSet - filepath: data/01_raw/my_data.csv - versioned: true - param1: # param1 is a required argument - # param2 will be True by default -``` - -### `version` namedtuple - -Versioned dataset `__init__` method must have an optional argument called `version` with a default value of `None`. If provided, this argument must be an instance of [`kedro.io.core.Version`](/kedro.io.Version). Its `load` and `save` attributes must either be `None` or contain string values representing exact load and save versions: - -* If `version` is `None` then the dataset is considered *not versioned*. -* If `version.load` is `None` then the latest available version will be used to load the dataset, otherwise a string representing exact load version must be provided. -* If `version.save` is `None` then a new save version string will be generated by calling `kedro.io.core.generate_timestamp()`, otherwise a string representing exact save version must be provided. - -### Versioning using the YAML API - -The easiest way to version a specific dataset is to change the corresponding entry in the `catalog.yml`. For example, if the following dataset was defined in the `catalog.yml`: - -```yaml -cars: - type: pandas.CSVDataSet - filepath: data/01_raw/company/car_data.csv - versioned: true -``` - -The `DataCatalog` will create a versioned `CSVDataSet` called `cars`. The actual csv file location will look like `data/01_raw/company/car_data.csv//car_data.csv`, where `` corresponds to a global save version string formatted as `YYYY-MM-DDThh.mm.ss.sssZ`. Every time the `DataCatalog` is instantiated, it generates a new global save version, which is propagated to all versioned datasets it contains. - -`catalog.yml` only allows you to version your datasets but it does not allow you to choose which version to load or save. This is deliberate because we have chosen to separate the data catalog from any runtime configuration. If you need to pin a dataset version, you can either [specify the versions in a separate `yml` file and call it at runtime](../04_kedro_project_setup/02_configuration.md#configure-kedro-run-arguments) or [instantiate your versioned datasets using Code API and define a version parameter explicitly](#versioning-using-the-code-api). - -By default, the `DataCatalog` will load the latest version of the dataset. However, it is also possible to specify an exact load version. 
In order to do that, you can pass a dictionary with exact load versions to `DataCatalog.from_config`: - -```python -load_versions = {"cars": "2019-02-13T14.35.36.518Z"} -io = DataCatalog.from_config(catalog_config, credentials, load_versions=load_versions) -cars = io.load("cars") -``` - -The last row in the example above would attempt to load a CSV file from `data/01_raw/company/car_data.csv/2019-02-13T14.35.36.518Z/car_data.csv`: - -* `load_versions` configuration has an effect only if a dataset versioning has been enabled in the catalog config file - see the example above. - -* We recommend that you do not override `save_version` argument in `DataCatalog.from_config` unless strongly required to do so, since it may lead to inconsistencies between loaded and saved versions of the versioned datasets. - -```eval_rst -.. attention:: The ``DataCatalog`` does not re-generate save versions between instantiations. Therefore, if you call ``catalog.save('cars', some_data)`` twice, then the second call will fail, since it tries to overwrite a versioned dataset using the same save version. To mitigate this, reload your data catalog by calling ``%reload_kedro`` line magic. This limitation does not apply to ``load`` operation. -``` - -### Versioning using the Code API - -Although we recommend enabling versioning using the `catalog.yml` config file as described in the section above, you may require more control over load and save versions of a specific dataset. To achieve this you can instantiate `Version` and pass it as a parameter to the dataset initialisation: - -```python -from kedro.io import DataCatalog, Version -from kedro.extras.datasets.pandas import CSVDataSet -import pandas as pd - -data1 = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) -data2 = pd.DataFrame({"col1": [7], "col2": [8], "col3": [9]}) -version = Version( - load=None, # load the latest available version - save=None, # generate save version automatically on each save operation -) - -test_data_set = CSVDataSet( - filepath="data/01_raw/test.csv", save_args={"index": False}, version=version -) -io = DataCatalog({"test_data_set": test_data_set}) - -# save the dataset to data/01_raw/test.csv//test.csv -io.save("test_data_set", data1) -# save the dataset into a new file data/01_raw/test.csv//test.csv -io.save("test_data_set", data2) - -# load the latest version from data/test.csv/*/test.csv -reloaded = io.load("test_data_set") -assert data2.equals(reloaded) -``` - -```eval_rst -.. note:: In the example above we did not fix any versions. If we do, then the behaviour of load and save operations becomes slightly different: -``` - -```python -version = Version( - load="my_exact_version", # load exact version - save="my_exact_version", # save to exact version -) - -test_data_set = CSVDataSet( - filepath="data/01_raw/test.csv", save_args={"index": False}, version=version -) -io = DataCatalog({"test_data_set": test_data_set}) - -# save the dataset to data/01_raw/test.csv/my_exact_version/test.csv -io.save("test_data_set", data1) -# load from data/01_raw/test.csv/my_exact_version/test.csv -reloaded = io.load("test_data_set") -assert data1.equals(reloaded) - -# raises DataSetError since the path -# data/01_raw/test.csv/my_exact_version/test.csv already exists -io.save("test_data_set", data2) -``` - -```eval_rst -.. attention:: Passing exact load and/or save versions to the dataset instantiation is not recommended, since it may lead to inconsistencies between operations. 
For example, if versions for load and save operations do not match, save operation would result in a ``UserWarning`` indicating that save a load versions do not match. Load after save may also return an error if the corresponding load version is not found: -``` - -```python -version = Version( - load="exact_load_version", # load exact version - save="exact_save_version", # save to exact version -) - -test_data_set = CSVDataSet( - filepath="data/01_raw/test.csv", save_args={"index": False}, version=version -) -io = DataCatalog({"test_data_set": test_data_set}) - -io.save("test_data_set", data1) # emits a UserWarning due to version inconsistency - -# raises DataSetError since the data/01_raw/test.csv/exact_load_version/test.csv -# file does not exist -reloaded = io.load("test_data_set") -``` - -### Supported datasets - -Currently the following datasets support versioning: - -- `kedro.extras.datasets.matplotlib.MatplotlibWriter` -- `kedro.extras.datasets.holoviews.HoloviewsWriter` -- `kedro.extras.datasets.networkx.NetworkXDataSet` -- `kedro.extras.datasets.pandas.CSVDataSet` -- `kedro.extras.datasets.pandas.ExcelDataSet` -- `kedro.extras.datasets.pandas.FeatherDataSet` -- `kedro.extras.datasets.pandas.HDFDataSet` -- `kedro.extras.datasets.pandas.JSONDataSet` -- `kedro.extras.datasets.pandas.ParquetDataSet` -- `kedro.extras.datasets.pickle.PickleDataSet` -- `kedro.extras.datasets.pillow.ImageDataSet` -- `kedro.extras.datasets.text.TextDataSet` -- `kedro.extras.datasets.spark.SparkDataSet` -- `kedro.extras.datasets.yaml.YAMLDataSet` -- `kedro.extras.datasets.api.APIDataSet` -- `kedro.extras.datasets.tensorflow.TensorFlowModelDataset` -- `kedro.extras.datasets.json.JSONDataSet` - -```eval_rst -.. note:: Although, HTTPs is a supported file system in the dataset implementations, it does not support versioning. -``` - -## Partitioned dataset - -These days distributed systems play an increasingly important role in ETL data pipelines. They significantly increase the processing throughput, enabling us to work with much larger volumes of input data. However, these benefits sometimes come at a cost. When dealing with the input data generated by such distributed systems, you may encounter a situation where your Kedro node needs to read the data from a directory full of uniform files of the same type (e.g. JSON, CSV, Parquet, etc.) rather than from a single file. Tools like `PySpark` and the corresponding [SparkDataSet](/kedro.extras.datasets.spark.SparkDataSet) cater for such use cases, but the use of Spark is not always feasible. - -This is the reason why Kedro provides a built-in [PartitionedDataSet](/kedro.io.PartitionedDataSet), which has the following features: - -* `PartitionedDataSet` can recursively load all or specific files from a given location. -* Is platform agnostic and can work with any filesystem implementation supported by [fsspec](https://filesystem-spec.readthedocs.io/) including local, S3, GCS, and many more. -* Implements a [lazy loading](https://en.wikipedia.org/wiki/Lazy_loading) approach and does not attempt to load any partition data until a processing node explicitly requests it. - -```eval_rst -.. note:: In this section each individual file inside a given location is called a partition. 
-``` - -### Partitioned dataset definition - -`PartitionedDataSet` definition can be put in your `catalog.yml` like any other regular dataset definition; the definition represents the following structure: - -```yaml -# conf/base/catalog.yml - -my_partitioned_dataset: - type: PartitionedDataSet - path: s3://my-bucket-name/path/to/folder # path to the location of partitions - dataset: pandas.CSVDataSet # shorthand notation for the dataset which will handle individual partitions - credentials: my_credentials - load_args: - load_arg1: value1 - load_arg2: value2 -``` - -```eval_rst -.. note:: As any other dataset ``PartitionedDataSet`` can also be instantiated programmatically in Python: -``` - -```python -from kedro.extras.datasets.pandas import CSVDataSet -from kedro.io import PartitionedDataSet - -my_credentials = {...} # credentials dictionary - -my_partitioned_dataset = PartitionedDataSet( - path="s3://my-bucket-name/path/to/folder", - dataset=CSVDataSet, - credentials=my_credentials, - load_args={"load_arg1": "value1", "load_arg2": "value2"}, -) -``` - -Alternatively, if you need more granular configuration of the underlying dataset, its definition can be provided in full: - -```yaml -# conf/base/catalog.yml - -my_partitioned_dataset: - type: PartitionedDataSet - path: s3://my-bucket-name/path/to/folder - dataset: # full dataset config notation - type: pandas.CSVDataSet - load_args: - delimiter: "," - save_args: - index: false - credentials: my_credentials - load_args: - load_arg1: value1 - load_arg2: value2 - filepath_arg: filepath # the argument of the dataset to pass the filepath to - filename_suffix: ".csv" -``` - -Here is an exhaustive list of the arguments supported by `PartitionedDataSet`: - -```eval_rst -+-------------------------+--------------------------------+--------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Argument | Required | Supported types | Description | -+=========================+================================+==================================================+===========================================================================================================================================================================================================================================================+ -| :code:`path` | Yes | :code:`str` | Path to the folder containing partitioned data. If path starts with the protocol (e.g., :code:`s3://`) then the corresponding :code:`fsspec` concrete filesystem implementation will be used. 
If protocol is not specified, local filesystem will be used | -+-------------------------+--------------------------------+--------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| :code:`dataset` | Yes | :code:`str`, :code:`Type[AbstractDataSet]`, | Underlying dataset definition, for more details see the section below | -| | | :code:`Dict[str, Any]` | | -+-------------------------+--------------------------------+--------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| :code:`credentials` | No | :code:`Dict[str, Any]` | Protocol-specific options that will be passed to :code:`fsspec.filesystemcall`, for more details see the section below | -+-------------------------+--------------------------------+--------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| :code:`load_args` | No | :code:`Dict[str, Any]` | Keyword arguments to be passed into :code:`find()` method of the corresponding filesystem implementation | -+-------------------------+--------------------------------+--------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| :code:`filepath_arg` | No | :code:`str` | Argument name of the underlying dataset initializer that will contain a path to an individual partition | -| | (defaults to :code:`filepath`) | | | -+-------------------------+--------------------------------+--------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| :code:`filename_suffix` | No | :code:`str` | If specified, partitions that don't end with this string will be ignored | -| | (defaults to an empty string) | | | -+-------------------------+--------------------------------+--------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -``` - -#### Dataset definition - -Dataset definition should be passed into the `dataset` argument of the `PartitionedDataSet`. The dataset definition is used to instantiate a new dataset object for each individual partition, and use that dataset object for load and save operations. Dataset definition supports shorthand and full notations. 
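Before the two notations are described in detail below, here is a minimal Python sketch contrasting them; the bucket path and dataset arguments are illustrative only, not part of the original documentation.

```python
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io import PartitionedDataSet

# Shorthand notation: pass the dataset class (or its import path as a string).
shorthand_example = PartitionedDataSet(
    path="s3://my-bucket-name/path/to/folder",
    dataset=CSVDataSet,
)

# Full notation: pass a config dictionary for finer control of the underlying dataset.
full_example = PartitionedDataSet(
    path="s3://my-bucket-name/path/to/folder",
    dataset={
        "type": "pandas.CSVDataSet",
        "load_args": {"delimiter": ","},
        "save_args": {"index": False},
    },
    filename_suffix=".csv",
)
```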
- -##### Shorthand notation - -Requires you to only specify a class of the underlying dataset either as a string (e.g. `pandas.CSVDataSet` or a fully qualified class path like `kedro.extras.datasets.pandas.CSVDataSet`) or as a class object that is a subclass of the [AbstractDataSet](/kedro.io.AbstractDataSet). - -##### Full notation - -Full notation allows you to specify a dictionary with the full underlying dataset definition _except_ the following arguments: -* The argument that receives the partition path (`filepath` by default) - if specified, a `UserWarning` will be emitted stating that this value will be overridden by individual partition paths -* `credentials` key - specifying it will result in `DataSetError` being raised; dataset credentials should be passed into `credentials` argument of the `PartitionedDataSet` rather than underlying dataset definition - see [the section below](#partitioned-dataset-credentials) for details -* `versioned` flag - specifying it will result in `DataSetError` being raised; versioning cannot be enabled for the underlying datasets - -#### Partitioned dataset credentials - -```eval_rst -.. note:: Support for ``dataset_credentials`` key in the credentials for ``PartitionedDataSet`` is now deprecated. The dataset credentials should be specified explicitly inside the dataset config. -``` - -Credentials management for `PartitionedDataSet` is somewhat special in a sense that it may contain credentials for both `PartitionedDataSet` itself _and_ the underlying dataset that is used for partition load and save. Top-level credentials are passed to the underlying dataset config (unless such config already has credentials configured), but not the other way around - dataset credentials are never propagated to the filesystem. - -Here is the full list of possible scenarios: - -```eval_rst -+-------------------+--------------------+--------------------------------------------------------------------+----------------------------------------------------------------------------+ -| Top-level | Underlying dataset | Example :code:`PartitionedDataSet` definition | Description | -| credentials | credentials | | | -+===================+====================+====================================================================+============================================================================+ -| Undefined | Undefined | :code:`PartitionedDataSet(path="s3://bucket-name/path/to/folder",` | Credentials are not passed to the underlying dataset or the filesystem | -| | | :code:`dataset="CSVDataSet")` | | -+-------------------+--------------------+--------------------------------------------------------------------+----------------------------------------------------------------------------+ -| Undefined | Specified | :code:`PartitionedDataSet(path="s3://bucket-name/path/to/folder",` | Underlying dataset credentials are passed to the :code:`CSVDataSet` | -| | | :code:`dataset={"type": "CSVDataSet",` | constructor, filesystem is instantiated without credentials | -| | | :code:`"credentials": {"secret": True}})` | | -+-------------------+--------------------+--------------------------------------------------------------------+----------------------------------------------------------------------------+ -| Specified | Undefined | :code:`PartitionedDataSet(path="s3://bucket-name/path/to/folder",` | Top-level credentials are passed to the underlying :code:`CSVDataSet` | -| | | :code:`dataset="CSVDataSet", credentials={"secret": True})` | constructor and the filesystem | 
-+-------------------+--------------------+--------------------------------------------------------------------+----------------------------------------------------------------------------+ -| Specified | :code:`None` | :code:`PartitionedDataSet(path="s3://bucket-name/path/to/folder",` | Top-level credentials are passed to the filesystem, :code:`CSVDataSet` is | -| | | :code:`dataset={"type": "CSVDataSet", "credentials": None},` | instantiated without credentials - this way you can stop the top-level | -| | | :code:`credentials={"dataset_secret": True})` | credentials from propagating into the dataset config | -+-------------------+--------------------+--------------------------------------------------------------------+----------------------------------------------------------------------------+ -| Specified | Specified | :code:`PartitionedDataSet(path="s3://bucket-name/path/to/folder",` | Top-level credentials are passed to the filesystem, underlying dataset | -| | | :code:`dataset={"type": "CSVDataSet",` | credentials are passed to the :code:`CSVDataSet` constructor | -| | | :code:`"credentials":{"dataset_secret": True}},` | | -| | | :code:`credentials={"secret": True})` | | -+-------------------+--------------------+--------------------------------------------------------------------+----------------------------------------------------------------------------+ -``` - -### Partitioned dataset load - -Let's assume that the Kedro pipeline that you are working with contains the node defined as follows: - -```python -from kedro.pipeline import node - -node(concat_partitions, inputs="my_partitioned_dataset", outputs="concatenated_result") -``` - -The underlying node function `concat_partitions` may look like this: - -```python -from typing import Any, Callable, Dict -import pandas as pd - - -def concat_partitions(partitioned_input: Dict[str, Callable[[], Any]]) -> pd.DataFrame: - """Concatenate input partitions into one pandas DataFrame. - - Args: - partitioned_input: A dictionary with partition ids as keys and load functions as values. - - Returns: - Pandas DataFrame representing a concatenation of all loaded partitions. - """ - result = pd.DataFrame() - - for partition_key, partition_load_func in sorted(partitioned_input.items()): - partition_data = partition_load_func() # load the actual partition data - # concat with existing result - result = pd.concat([result, partition_data], ignore_index=True, sort=True) - - return result -``` - -As you can see from the example above, on load `PartitionedDataSet` _does not_ automatically load the data from the located partitions. Instead, `PartitionedDataSet` returns a dictionary with partition IDs as keys and the corresponding load functions as values. It allows the node that consumes the `PartitionedDataSet` to implement the logic that defines what partitions need to be loaded and how this data is going to be processed. - -Partition ID _does not_ represent the whole partition path, but only a part of it that is unique for a given partition _and_ filename suffix: - -* Example 1: if `path=s3://my-bucket-name/folder` and partition is stored in `s3://my-bucket-name/folder/2019-12-04/data.csv` then its Partition ID is `2019-12-04/data.csv`. - - -* Example 2: if `path=s3://my-bucket-name/folder` and `filename_suffix=".csv"` and partition is stored in `s3://my-bucket-name/folder/2019-12-04/data.csv` then its Partition ID is `2019-12-04/data`. 
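Because the load functions are only called on demand, a node is free to materialise just a subset of the partitions. The following is a minimal sketch of that idea (the function name and the `2019-12` prefix are illustrative, not taken from the original documentation):

```python
from typing import Any, Callable, Dict

import pandas as pd


def concat_selected_partitions(
    partitioned_input: Dict[str, Callable[[], Any]]
) -> pd.DataFrame:
    """Concatenate only the partitions whose ID starts with '2019-12'."""
    selected = {
        partition_id: load_func
        for partition_id, load_func in sorted(partitioned_input.items())
        if partition_id.startswith("2019-12")
    }
    if not selected:
        return pd.DataFrame()

    # Only the selected partitions are ever loaded; all other partitions stay untouched.
    return pd.concat(
        [load_func() for load_func in selected.values()], ignore_index=True, sort=True
    )
```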
- -`PartitionedDataSet` implements caching on load operation, which means that if multiple nodes consume the same `PartitionedDataSet`, they will all receive the same partition dictionary even if some new partitions were added to the folder after the first load has been completed. This is done deliberately to guarantee the consistency of load operations between the nodes and avoid race conditions. You can reset the cache by calling `release()` method of the partitioned dataset object. - -### Partitioned dataset save - -`PartitionedDataSet` also supports a save operation. Let's assume the following configuration: - -```yaml -# conf/base/catalog.yml - -new_partitioned_dataset: - type: PartitionedDataSet - path: s3://my-bucket-name - dataset: pandas.CSVDataSet - filename_suffix: ".csv" -``` - -node definition: - -```python -from kedro.pipeline import node - -node(create_partitions, inputs=None, outputs="new_partitioned_dataset") -``` - -and underlying node function `create_partitions`: - -```python -from typing import Any, Dict -import pandas as pd - - -def create_partitions() -> Dict[str, Any]: - """Create new partitions and save using PartitionedDataSet. - - Returns: - Dictionary with the partitions to create. - """ - return { - # create a file "s3://my-bucket-name/part/foo.csv" - "part/foo": pd.DataFrame({"data": [1, 2]}), - # create a file "s3://my-bucket-name/part/bar.csv.csv" - "part/bar.csv": pd.DataFrame({"data": [3, 4]}), - } -``` - -```eval_rst -.. note:: Writing to an existing partition may result in its data being overwritten, if this case is not specifically handled by the underlying dataset implementation. You should implement your own checks to ensure that no existing data is lost when writing to a ``PartitionedDataSet``. The simplest safety mechanism could be to use partition IDs that have a high chance of uniqueness: for example, the current timestamp. -``` - -`PartitionedDataSet` also supports lazy saving, where the partition's data is not materialized until it's time to write. -To use this, simply return `Callable` types in the dictionary: - -```python -from typing import Any, Dict, Callable -import pandas as pd - - -def create_partitions() -> Dict[str, Callable[[], Any]]: - """Create new partitions and save using PartitionedDataSet. - - Returns: - Dictionary of the partitions to create to a function that creates them. - """ - return { - # create a file "s3://my-bucket-name/part/foo.csv" - "part/foo": lambda: pd.DataFrame({"data": [1, 2]}), - # create a file "s3://my-bucket-name/part/bar.csv" - "part/bar": lambda: pd.DataFrame({"data": [3, 4]}), - } -``` - -> *Note:* When using lazy saving the dataset will be written _after_ the `after_node_run` [hook](../07_extend_kedro/02_hooks). - -### Incremental loads with `IncrementalDataSet` - -[IncrementalDataSet](/kedro.io.IncrementalDataSet) is a subclass of `PartitionedDataSet`, which stores the information about the last processed partition in the so-called `checkpoint`. `IncrementalDataSet` addresses the use case when partitions have to be processed incrementally, i.e. each subsequent pipeline run should only process the partitions which were not processed by the previous runs. - -This checkpoint, by default, is persisted to the location of the data partitions. For example, for `IncrementalDataSet` instantiated with path `s3://my-bucket-name/path/to/folder` the checkpoint will be saved to `s3://my-bucket-name/path/to/folder/CHECKPOINT`, unless the checkpoint configuration is [explicitly overwritten](#checkpoint-configuration). 
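Like `PartitionedDataSet`, an `IncrementalDataSet` can be defined in `catalog.yml` or instantiated in Python. A minimal sketch, reusing the illustrative bucket path from the examples above and relying on the default checkpoint location:

```python
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io import IncrementalDataSet

# The checkpoint is stored alongside the partitions by default,
# i.e. at s3://my-bucket-name/path/to/folder/CHECKPOINT in this sketch.
my_incremental_dataset = IncrementalDataSet(
    path="s3://my-bucket-name/path/to/folder",
    dataset=CSVDataSet,
)
```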
- -The checkpoint file is only created _after_ the partitioned dataset is explicitly [confirmed](#incremental-dataset-confirm). - -#### Incremental dataset load - -Loading `IncrementalDataSet` works similarly to [`PartitionedDataSet`](#partitioned-dataset-load) with several exceptions: -1. `IncrementalDataSet` loads the data _eagerly_, so the values in the returned dictionary represent the actual data stored in the corresponding partition, rather than a pointer to the load function. `IncrementalDataSet` considers a partition relevant for processing if its ID satisfies the comparison function, given the checkpoint value. -2. `IncrementalDataSet` _does not_ raise a `DataSetError` if load finds no partitions to return - an empty dictionary is returned instead. An empty list of available partitions is part of a normal workflow for `IncrementalDataSet`. - -#### Incremental dataset save - -`IncrementalDataSet` save operation is identical to the [save operation](#partitioned-dataset-save) of the `PartitionedDataSet`. - -#### Incremental dataset confirm - -```eval_rst -.. note:: The checkpoint value *is not* automatically updated by the fact that a new set of partitions was successfully loaded or saved. -``` - -Partitioned dataset checkpoint update is triggered by an explicit `confirms` instruction in one of the nodes downstream. It can be the same node, which processes the partitioned dataset: - -```python -from kedro.pipeline import node - -# process and then confirm `IncrementalDataSet` within the same node -node( - process_partitions, - inputs="my_partitioned_dataset", - outputs="my_processed_dataset", - confirms="my_partitioned_dataset", -) -``` - -Alternatively, confirmation can be deferred to one of the nodes downstream, allowing you to implement extra validations before the loaded partitions are considered successfully processed: - -```python -from kedro.pipeline import Pipeline, node - -Pipeline( - [ - node( - func=process_partitions, - inputs="my_partitioned_dataset", - outputs="my_processed_dataset", - ), - # do something else - node( - func=confirm_partitions, - # note that the node may not require 'my_partitioned_dataset' as an input - inputs="my_processed_dataset", - outputs=None, - confirms="my_partitioned_dataset", - ), - # ... - node( - func=do_something_else_with_partitions, - # will return the same partitions even though they were already confirmed - inputs=["my_partitioned_dataset", "my_processed_dataset"], - outputs=None, - ), - ] -) -``` - -Important notes about the confirmation operation: - -* Confirming a partitioned dataset does not affect any subsequent loads within the same run. All downstream nodes that input the same partitioned dataset as input will all receive the _same_ partitions. Partitions that are created externally during the run will also not affect the dataset loads and won't appear in the list of loaded partitions until the next run or until the [`release()`](/kedro.io.IncrementalDataSet) method is called on the dataset object. -* A pipeline cannot contain more than one node confirming the same dataset. - - -#### Checkpoint configuration - -`IncrementalDataSet` does not require explicit configuration of the checkpoint unless there is a need to deviate from the defaults. To update the checkpoint configuration, add a `checkpoint` key containing the valid dataset configuration. 
This may be required if, say, the pipeline has read-only permissions to the location of partitions (or write operations are undesirable for any other reason), in such case `IncrementalDataSet` can be configured to save the checkpoint elsewhere. `checkpoint` key also supports partial config updates where only some checkpoint attributes are overwritten, while the defaults are kept for the rest: - -```yaml -my_partitioned_dataset: - type: IncrementalDataSet - path: s3://my-bucket-name/path/to/folder - dataset: pandas.CSVDataSet - checkpoint: - # update the filepath and load_args, but keep the dataset type unchanged - filepath: gcs://other-bucket/CHECKPOINT - load_args: - k1: v1 -``` - -#### Special checkpoint config keys - -Along with the standard dataset attributes, `checkpoint` config also accepts 2 special optional keys: -* `comparison_func` (defaults to `operator.gt`) - fully qualified import path to the function that will be used to compare a partition ID with the checkpoint value, to determine if a partition should be processed. Such function must accept 2 positional string arguments - partition ID and checkpoint value, and return `True` if such partition is considered to be past the checkpoint. Specifying your own `comparison_func` may be useful if you need to customise the checkpoint filtration mechanism - for example, you may want to implement windowed loading, where you always want to load the partitions representing the last calendar month. See the example config specifying a custom comparison function: - -```yaml -my_partitioned_dataset: - type: IncrementalDataSet - path: s3://my-bucket-name/path/to/folder - dataset: pandas.CSVDataSet - checkpoint: - comparison_func: my_module.path.to.custom_comparison_function # the path must be importable -``` - -* `force_checkpoint` - if set, partitioned dataset will use this value as the checkpoint instead of loading the corresponding checkpoint file. This might be useful if you need to rollback the processing steps and reprocess some (or all) of the available partitions. See the example config forcing the checkpoint value: - -```yaml -my_partitioned_dataset: - type: IncrementalDataSet - path: s3://my-bucket-name/path/to/folder - dataset: pandas.CSVDataSet - checkpoint: - force_checkpoint: 2020-01-01/data.csv -``` - -```eval_rst -.. note:: Specification of ``force_checkpoint`` is also supported via the shorthand notation as follows: -``` - -```yaml -my_partitioned_dataset: - type: IncrementalDataSet - path: s3://my-bucket-name/path/to/folder - dataset: pandas.CSVDataSet - checkpoint: 2020-01-01/data.csv -``` - -```eval_rst -.. note:: If you need to force the partitioned dataset to load all available partitions, set ``checkpoint`` to an empty string: -``` - -```yaml -my_partitioned_dataset: - type: IncrementalDataSet - path: s3://my-bucket-name/path/to/folder - dataset: pandas.CSVDataSet - checkpoint: "" -``` diff --git a/docs/source/06_nodes_and_pipelines/01_nodes.md b/docs/source/06_nodes_and_pipelines/01_nodes.md deleted file mode 100644 index 16d0439038..0000000000 --- a/docs/source/06_nodes_and_pipelines/01_nodes.md +++ /dev/null @@ -1,145 +0,0 @@ -# Nodes - -In this section we introduce the concept of a node, for which the relevant API documentation is [kedro.pipeline.node](/kedro.pipeline.node). - -Nodes are the building blocks of pipelines and represent tasks. Pipelines are used to combine nodes to build workflows, which range from simple machine learning workflows to end-to-end (E2E) production workflows. - -```eval_rst -.. 
note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -You will first need to import libraries from Kedro and other standard tools to run the code snippets demonstrated below. - -```python -from kedro.pipeline import * -from kedro.io import * -from kedro.runner import * - -import pickle -import os -``` - -## How to create a node - -A node is created by specifying a function, input variable names and output variable names. Let's consider a simple function that adds two numbers: - -```python -def add(x, y): - return x + y -``` - -The function has two inputs (`x` and `y`) and a single output (the sum of the inputs). - -Here is how a node is created with this function: - -```python -adder_node = node(func=add, inputs=["a", "b"], outputs="sum") -adder_node -``` - -Here is the output: - -```console -Out[1]: Node(add, ['a', 'b'], 'sum', None) -``` - -You can also add labels to nodes, which will be used to describe them in logs: - -```python -adder_node = node(func=add, inputs=["a", "b"], outputs="sum") -print(str(adder_node)) - -adder_node = node(func=add, inputs=["a", "b"], outputs="sum", name="adding_a_and_b") -print(str(adder_node)) -``` - -This gives the following output: - -```console -add([a,b]) -> [sum] -adding_a_and_b: add([a,b]) -> [sum] -``` - -Let's break down the node definition: - -* `add` is the Python function that will execute when the node runs -* `['a', 'b']` specify the input variable names -* `sum` specifies the return variable name. The value returned by `add` will be bound to this variable -* `name` is an optional label for the node, which can be used to provide a description of the business logic it implements - -### Node definition syntax - -There is a syntax to describe function inputs and outputs. This allows different Python functions to be reused in nodes and supports dependency resolution in pipelines.
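For instance, the same function can be wired up to different datasets simply by varying the `inputs` and `outputs` arguments. A short sketch (the dataset names `a`, `b` and `c` are arbitrary); the tables below spell out the full set of options:

```python
from kedro.pipeline import node


def subtract(minuend, subtrahend):
    return minuend - subtrahend


# positional inputs: called as subtract(a, b)
difference_node = node(subtract, inputs=["a", "b"], outputs="a_minus_b")

# keyword inputs: called as subtract(minuend=c, subtrahend=a)
reversed_node = node(
    subtract, inputs=dict(minuend="c", subtrahend="a"), outputs="c_minus_a"
)
```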
- -### Syntax for input variables - -```eval_rst -+----------------------------------+-----------------+-----------------------------+---------------------------------------+ -| Input syntax | Meaning | Example function parameters | How function is called when node runs | -+==================================+=================+=============================+=======================================+ -| :code:`None` | No input | :code:`def f()` | :code:`f()` | -+----------------------------------+-----------------+-----------------------------+---------------------------------------+ -| :code:`'a'` | Single input | :code:`def f(arg1)` | :code:`f(a)` | -+----------------------------------+-----------------+-----------------------------+---------------------------------------+ -| :code:`['a', 'b']` | Multiple inputs | :code:`def f(arg1, arg2)` | :code:`f(a, b)` | -+----------------------------------+-----------------+-----------------------------+---------------------------------------+ -| :code:`dict(arg1='x', arg2='y')` | Keyword inputs | :code:`def f(arg1, arg2)` | :code:`f(arg1=x, arg2=y)` | -+----------------------------------+-----------------+-----------------------------+---------------------------------------+ -``` - -### Syntax for output variables - -```eval_rst -+----------------------------------+-------------------+-------------------------------------+ -| Output syntax | Meaning | Example return statement | -+==================================+===================+=====================================+ -| :code:`None` | No output | Does not return | -+----------------------------------+-------------------+-------------------------------------+ -| :code:`'a'` | Single output | :code:`return a` | -+----------------------------------+-------------------+-------------------------------------+ -| :code:`['a', 'b']` | List output | :code:`return [a, b]` | -+----------------------------------+-------------------+-------------------------------------+ -| :code:`dict(key1='a', key2='b')` | Dictionary output | :code:`return dict(key1=a, key2=b)` | -+----------------------------------+-------------------+-------------------------------------+ -``` - -Any combinations of the above are possible, except nodes of the form `node(f, None, None)` (at least a single input or output needs to be provided). - -## How to tag a node - -Tags may be useful to run part of a pipeline without changing the code. For instance, `kedro run --tag=ds` will only run nodes that have a `ds` tag attached. - -To tag a node, you can simply specify the `tags` argument, as follows: - -```python -node(func=add, inputs=["a", "b"], outputs="sum", name="adding_a_and_b", tags="node_tag") -``` - -Moreover, you can [tag all nodes in a `Pipeline`](./02_pipeline_introduction.md#how-to-tag-a-pipeline). If the pipeline definition contains the `tags=` argument, Kedro will attach the corresponding tag to every node within that pipeline. - -To run a pipeline using a tag: - -```bash -kedro run --tag=pipeline_tag -``` - -This will run only the nodes found within the pipeline tagged with `pipeline_tag` - - -## How to run a node - -To run a node, you need to instantiate its inputs. In this case, the node expects two inputs: - -```python -adder_node.run(dict(a=2, b=3)) -``` - -The output is as follows: - -```console -Out[2]: {'sum': 5} -``` - -```eval_rst -.. note:: It is also possible to call a node as a regular Python function: ``adder_node(dict(a=2, b=3))``. This will call ``adder_node.run(dict(a=2, b=3))`` behind the scenes. 
-``` diff --git a/docs/source/06_nodes_and_pipelines/03_modular_pipelines.md b/docs/source/06_nodes_and_pipelines/03_modular_pipelines.md deleted file mode 100644 index 507ff612cb..0000000000 --- a/docs/source/06_nodes_and_pipelines/03_modular_pipelines.md +++ /dev/null @@ -1,438 +0,0 @@ -# Modular pipelines - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -## What are modular pipelines? - -In many typical Kedro projects, a single (“main”) pipeline increases in complexity as the project evolves. To keep your project fit for purpose, we recommend that you create modular pipelines, which are logically isolated and can be reused. Modular pipelines are easier to develop, test and maintain, and are portable so they can be copied and reused between projects. - -## How do I create a modular pipeline? - -You can use a [project-specific CLI command](../09_development/03_commands_reference.md#kedro-commands) to create a modular pipeline. The pipeline name must adhere to [generic Python module naming rules](https://realpython.com/python-pep8/#naming-conventions): - -* Can only contain alphanumeric characters and underscores (`A-Za-z0-9_`) -* Must start with a letter or underscore -* Must be at least 2 characters long - - -```eval_rst -.. note:: Since ``kedro pipeline`` is a group of project-specific commands, those will only show up when your current working directory is the project root. If you see an error message like ``Error: No such command 'pipeline'``, this indicates that your working directory does not point to a valid Kedro project. -``` - -For the full list of available CLI options, you can always run `kedro pipeline create --help` for more information. - -```bash -kedro pipeline create -``` - -```eval_rst -.. note:: Although Kedro does not enforce the following project structure, we strongly encourage that you use it when you develop your modular pipelines. Future versions of Kedro may assume this structure. -``` - -The `kedro pipeline create ` command creates the following: - -***A modular pipeline in a subfolder*** - -The command creates a modular pipeline in `src//pipelines//`. The folder contains the following files: - -* `__init__.py` to make Python treat the code in the subfolder as a module -* boilerplate `README.md` for you to record information regarding the pipeline's execution -* `nodes.py` as a location for you to add code for the nodes in your new modular pipeline -* `pipeline.py` to expose the `create_pipeline` function at the top-level of the module. Calling `create_pipeline` with no arguments should return an instance of a [Pipeline](/kedro.pipeline.Pipeline): - -```python -from .pipelines import my_modular_pipeline_1 - -pipeline = my_modular_pipeline_1.create_pipeline() -``` - -When you run `kedro pipeline create` it does _not_ automatically add a corresponding entry to `register_pipelines()` in `src//pipeline_registry.py`. - -In order to make your new pipeline runnable (using the `kedro run --pipeline ` CLI command, for example), you need to modify `src//pipeline_registry.py` yourself. - -***Boilerplate configuration files*** - -The `kedro pipeline create ` command also creates a boilerplate parameter configuration file, `.yml`, in `conf//parameters/`, where `` defaults to `base`. 
- -The project configuration from `conf/base/parameters/.yml` is automatically discoverable by [KedroContext](/kedro.framework.context.KedroContext) and requires no manual change. - -***A placeholder folder for unit tests*** - -Finally, `kedro pipeline create ` also creates a placeholder for the pipeline unit tests in `src/tests/pipelines//`. - -## Recommendations -For ease of use and portability, consider these recommendations as you develop a modular pipeline: - -* A modular pipeline should include a `README.md`, with all the information regarding its execution -* A modular pipeline _may_ have external dependencies specified in `requirements.txt`. These dependencies are _not_ - currently installed by the [`kedro install`](../09_development/03_commands_reference.md#install-all-package-dependencies) command, so users of your pipeline would have to run `pip install -r src//pipelines//requirements.txt` before using the pipeline -* To ensure portability, modular pipelines should use relative imports when accessing their own objects and absolute imports otherwise. For example, in `pipeline.py`: - -```python -from external_package import add # importing from external package -from kedro.pipeline import node, Pipeline - -from .nodes import node1_func, node2_func # importing its own node functions - - -def create_pipeline(): - node1 = node(func=node1_func, inputs="a", outputs="b") - node2 = node(func=node2_func, inputs="c", outputs="d") - node3 = node(func=add, inputs=["b", "d"], outputs="sum") - return Pipeline([node1, node2, node3]) -``` - -* Modular pipelines should _not_ depend on the main Python package (`new_kedro_project` in this example) as this would break portability to another project -* Modular pipelines should be registered and stitched together in a main (or `__default__`) pipeline located in `src/new_kedro_project/pipeline_registry.py` - -The following example, illustrates how to import and instantiate two modular pipelines (`modular_pipeline_1` and `modular_pipeline_2`) within `src/new_kedro_project/pipeline_registry.py`: - -```python -from typing import Dict - -from kedro.pipeline import Pipeline - -from new_kedro_project.pipelines import ( - modular_pipeline_1 as mp1, - modular_pipeline_2 as mp2, -) - - -def register_pipelines() -> Dict[str, Pipeline]: - pipeline1 = mp1.create_pipeline() - pipeline2 = mp2.create_pipeline() - pipeline_all = pipeline1 + pipeline2 - return {"mp1": pipeline1, "mp2": pipeline2, "__default__": pipeline_all} -``` - -To run a pipeline by name from the command line: - -```bash -kedro run --pipeline mp2 -``` - -## How to share a modular pipeline - -### Package a modular pipeline -Since Kedro 0.16.4 you can package a modular pipeline by executing `kedro pipeline package ` command, which will generate a new [wheel file](https://pythonwheels.com/) for it. By default, the wheel file will be saved into `dist/` directory inside your project, however this can be changed using the `--destination` (`-d`) option. - -When you package your modular pipeline, Kedro will also automatically package files from 3 locations: - -* All the modular pipeline code in `src//pipelines//` -* Parameter files that match either the glob pattern `conf//parameters*/**/.yml` or `conf//parameters*/**//*`, where `` defaults to `base`. 
If you need to capture the parameters from a different config environment, run `kedro pipeline package --env ` -* Pipeline unit tests in `src/tests/pipelines/` - -Kedro will also include any requirements found in `src//pipelines//requirements.txt` in the modular pipeline wheel file. These requirements will later be taken into account when pulling a pipeline via `kedro pipeline pull`. - -```eval_rst -.. note:: Kedro will not package the catalog config files even if those are present in ``conf//catalog/.yml``. -``` - -If you plan to publish your packaged modular pipeline to some Python package repository like [PyPI](https://pypi.org/), you need to make sure that your modular pipeline name doesn't clash with any of the existing packages in that repository. However, there is no need to rename any of your source files if that is the case. Simply alias your package with a new name by running `kedro pipeline package --alias `. - -In addition to [PyPI](https://pypi.org/), you can also share the packaged wheel file directly, or via a cloud storage such as AWS S3. - -#### Package multiple modular pipelines - -If you are packaging multiple modular pipelines, you have the option to do it in bulk, by defining the specifications in the project's `pyproject.toml`: - -```toml -[tool.kedro.pipeline.package] -first_pipeline = {alias = "aliased_pipeline", destination = "somewhere/else", env = "uat"} -second_pipeline = {} -``` - -Where the keys (e.g. `first_pipeline`, `second_pipeline`) are the modular pipelines' folder names, and the values are the options that `kedro pipeline package ` accepts. - -```eval_rst -.. note:: Make sure `destination` is specified as a POSIX path even when working on a Windows machine. -``` - -### Pull a modular pipeline - -You can pull a modular pipeline from a wheel file by executing `kedro pipeline pull `, where `` is either a package name on PyPI or a path to the wheel file. Kedro will unpack the wheel file, and install the files in following locations in your Kedro project: - -* All the modular pipeline code in `src//pipelines//` -* Configuration files in `conf//parameters/.yml`, where `` defaults to `base`. If you want to place the parameters from a different config environment, run `kedro pipeline pull --env ` -* Pipeline unit tests in `src/tests/pipelines/` - -Kedro will also parse any requirements packaged with the modular pipeline and add them to project level `requirements.in`. It is advised to do `kedro install --build-reqs` to compile and install the updated list of requirements after pulling a modular pipeline. - -```eval_rst -.. note:: If a modular pipeline has embedded requirements and a project `requirements.in` file does not already exist, it will be generated based on the project `requirements.txt` before appending the modular pipeline requirements. -``` - -You can pull a modular pipeline from different locations, including local storage, PyPI and the cloud: - -- Pulling a modular pipeline from a local directory: - -```bash -kedro pipeline pull /dist/-0.1-py3-none-any.whl -``` - -- Pulling a modular pipeline from S3: - -```bash -kedro pipeline pull https://.s3..amazonaws.com/-0.1-py3-none-any.whl -``` - -- Pulling a modular pipeline from PyPI: - -```bash -kedro pipeline pull -``` - -If you are pulling the pipeline from a location that isn't PyPI, Kedro uses [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to locate and pull down your pipeline. 
If you need to provide any `fsspec`-specific arguments (say, if you're pulling your pipeline down from an S3 bucket and want to provide the S3 credentials inline or from a local server that requires tokens in the header) then you can use the `--fs-args` option to point to a YAML (or any `anyconfig`-supported configuration) file that contains the required configuration. - -```bash -kedro pipeline pull https:// --fs-args pipeline_pull_args.yml -``` - -where - -``` -# pipeline_pull_args.yml -client_kwargs: - headers: - Authorization: token -``` - -## A modular pipeline example template - -Here is an example of a modular pipeline which combines all of these concepts within a Kedro project: - -* The modular pipelines: - - `src/new_kedro_project/pipelines/data_engineering` - A pipeline that imputes missing data and discovers outlier data points - - `src/new_kedro_project/pipelines/feature_engineering` - A pipeline that generates temporal features while aggregating data and performs a train/test split on the data - - `src/new_kedro_project/pipelines/modelling` - A pipeline that fits models, does hyperparameter search and reports on model performance -* A main (or `__default__`) pipeline: - - `src/new_kedro_project/pipeline_registry.py` - combines 3 modular pipelines from the above - -
-Click to expand - -```console -new-kedro-project -├── .ipython/ -├── conf/ -├── data/ -├── docs/ -├── logs/ -├── notebooks/ -├── src -│   ├── new_kedro_project -│   │   ├── pipelines -│   │   │   ├── data_engineering -│   │   │   │   ├── __init__.py -│   │   │   │   ├── nodes.py -│   │   │   │   ├── pipeline.py -│   │   │   │   ├── requirements.txt -│   │   │   │   └── README.md -│   │   │   ├── feature_engineering -│   │   │   │   ├── __init__.py -│   │   │   │   ├── nodes.py -│   │   │   │   ├── pipeline.py -│   │   │   │   ├── requirements.txt -│   │   │   │   └── README.md -│   │   │   ├── modelling -│   │   │   │   ├── __init__.py -│   │   │   │   ├── nodes.py -│   │   │   │   ├── pipeline.py -│   │   │   │   ├── requirements.txt -│   │   │   │   └── README.md -│   │   │   └── __init__.py -│   │   ├── __init__.py -│   │   ├── hooks.py -│   │   ├── pipeline_registry.py -│   │   ├── __main__.py -| | └── settings.py -│   ├── tests -│   │   ├── __init__.py -│   │ ├── pipelines -│   │   │   ├── data_engineering -│   │   │   │   ├── __init__.py -│   │   │   │   └── test_pipeline.py -│   │   │   ├── feature_engineering -│   │   │   │   ├── __init__.py -│   │   │   │   └── test_pipeline.py -│   │   │   ├── modelling -│   │   │   │   ├── __init__.py -│   │   │   │   └── test_pipeline.py -│   │   └── test_run.py -│   ├── requirements.txt -│   └── setup.py -├── pyproject.toml -├── README.md -└── setup.cfg -``` -
- -### Configuration - -Nested configuration in modular pipelines is _not_ supported by Kedro. It means that putting config files (like `catalog.yml`) in `src//pipelines//conf` will have no effect on the Kedro project configuration, however you may document it as a custom step that other users must complete as part of setting up your modular pipeline. - -If you plan to manually hand off your modular pipeline to another project, you should document the configuration used by the pipeline in the `README.md` of your modular pipeline. For example, you may copy your configuration into the modular pipeline location before the pipeline hand off and instruct the users to copy `catalog.yml` into their top-level configuration: - -```bash -mkdir conf/base/catalog/ # create a separate folder for the pipeline configs -cp src//pipelines/data_engineering/conf/catalog.yml conf/base/catalog/data_engineering.yml # copy the pipeline configs -``` - -### Datasets - -It is important to keep in mind that Kedro resolves the execution order of your pipeline's node based on their input and output datasets. - -For example, if `node1` outputs the dataset `A`, and `node2` requires the dataset `A` as an input, then `node1` is guaranteed to be executed before `node2` when Kedro runs the pipeline. - -As a modular pipeline developer, you may not know how your pipeline will be integrated in the downstream projects and what data catalog configuration they may have. Therefore, it is crucial to make it clear in the pipeline documentation what datasets (names and types) are required as inputs by your modular pipeline and what datasets it produces as outputs. - -## How to connect existing pipelines - -When two existing pipelines need to work together, they should be connected by the input and output datasets. But the names might be different, requiring manual fixes to be applied to the pipeline itself. An alternative solution would be to use `pipeline()`, the modular pipelines connector. - -You can think of `pipeline()` as an equivalent to `node()`, which accepts an underlying function, inputs and outputs, and returns a `Node` object. Similarly, `pipeline()` accepts the underlying pipeline, inputs and outputs, and returns a `Pipeline` object. - -Consider this example: - -```python -cook_pipeline = Pipeline( - [node(defrost, "frozen_meat", "meat"), node(grill, "meat", "grilled_meat")] -) - -lunch_pipeline = Pipeline([node(eat, "food", None)]) -``` - -A simple `cook_pipeline + lunch_pipeline` doesn't work, because the `grilled_meat` output in the `cook_pipeline` needs to be mapped to the `food` input in the `lunch_pipeline`. This can be done in any of the following three (equivalent) ways: - -```python -from kedro.pipeline import pipeline - -final_pipeline1 = ( - pipeline(cook_pipeline, outputs={"grilled_meat": "food"}) + lunch_pipeline -) - -# or -final_pipeline2 = cook_pipeline + pipeline( - lunch_pipeline, inputs={"food": "grilled_meat"} -) - -# or -final_pipeline3 = pipeline( - cook_pipeline, outputs={"grilled_meat": "new_name"} -) + pipeline(lunch_pipeline, inputs={"food": "new_name"}) -``` - -Remember you can pass `Pipeline` objects in the constructor as well, like in the example below. This approach is cleaner and more idiomatic when you are combining multiple modular pipelines together. - -```python -final_pipeline = Pipeline( - [ - pipeline(cook_pipeline, outputs={"grilled_meat": "new_name"}), - pipeline(lunch_pipeline, inputs={"food": "new_name"}), - node(...), - ..., - ] -) -``` - -```eval_rst -.. 
note:: ``inputs`` should correspond to the pipeline free inputs, while ``outputs`` are either free or intermediary outputs. -``` - - -## How to use a modular pipeline twice -Consider the example: - -```python -cook_pipeline = Pipeline( - [ - node(defrost, "frozen_meat", "meat", name="defrost_node"), - node(grill, "meat", "grilled_meat"), - ] -) - -eat_breakfast_pipeline = Pipeline([node(eat_breakfast, "breakfast_food", None)]) -eat_lunch_pipeline = Pipeline([node(eat_lunch, "lunch_food", None)]) -``` - -Now we need to "defrost" two different types of food and input to different pipelines. But we can't use the `cook_pipeline` twice because the internal dataset names will conflict. We might try to call `pipeline()` and map all datasets, but the conflict from the explicitly set `name="defrost_node"` remains. - -Here is a solution that uses a namespace: - -```python -cook_breakfast_pipeline = pipeline( - cook_pipeline, - inputs="frozen_meat", # inputs stay the same, don't namespace - outputs={"grilled_meat": "breakfast_food"}, - namespace="breakfast", -) -cook_lunch_pipeline = pipeline( - cook_pipeline, - inputs="frozen_meat", # inputs stay the same, don't namespace - outputs={"grilled_meat": "lunch_food"}, - namespace="lunch", -) - -final_pipeline = ( - cook_breakfast_pipeline - + eat_breakfast_pipeline - + cook_lunch_pipeline - + eat_lunch_pipeline -) -``` - -`namespace="lunch"` renames all datasets and nodes, prefixing them with `"lunch."`, except those datasets that we explicitly "freeze" (`frozen_meat`) or remap (`grilled_meat`). - -Remapping free outputs is required since "breakfast_food" and "lunch_food" are the names expected by the `eat_breakfast_pipeline` and `eat_lunch_pipeline` respectively. - -The resulting pipeline now has two separate nodes, `breakfast.defrost_node` and `lunch.defrost_node`. Also two separate datasets `breakfast.meat` and `lunch.meat` connect the nodes inside the pipelines, causing no confusion between them. - -Note that `pipeline()` will also prefix single parameter referenced with `params:` in a node's inputs. However, it won't prefix `parameters`. - -For example: - -```python -raw_pipeline = Pipeline([node(node_func, ["input", "params:x"], "output")]) -final_pipeline = pipeline(raw_pipeline, namespace="new") -# `final_pipeline` will be `Pipeline([node(node_func, ["new.input", "params:new.x"], "new.output")])` -``` - -## How to use a modular pipeline with different parameters - -You can map parameter values in a similar way to inputs and outputs. Let's say you have two almost identical pipelines that differ by one parameter. You want to run the pipelines on the same set of inputs. - -```python -alpha_pipeline = Pipeline( - [ - node(node_func1, ["input1", "input2", "params:alpha"], "intermediary_output"), - node(node_func2, "intermediary_output", "output"), - ] -) -beta_pipeline = pipeline( - alpha_pipeline, - inputs={"input1", "input2"}, - parameters={"alpha": "beta"}, - namespace="beta", -) - -final_pipeline = alpha_pipeline + beta_pipeline -``` - -The value of parameter `alpha` is replaced with the value of parameter `beta`, assuming they both live in your parameters configuration (`parameters.yml`). The namespace ensures that outputs are not overwritten, so intermediate and final outputs are prefixed, i.e. `beta.intermediary_output`, `beta.output`. - -Note that similar to `inputs` and `outputs` namespacing rule, if you supply a `str` or a `Set[str]`, these explicitly listed parameters won't be namespaced. 
- -## How to clean up a modular pipeline -You can manually delete all the files that belong to a modular pipeline. However, Kedro also provides a CLI command to clean up automatically. It deletes the following files when you call `kedro pipeline delete `: - - -* All the modular pipeline code in `src//pipelines//` -* Configuration files `conf//parameters/.yml` and `conf//catalog/.yml`, where `` defaults to `base`. If the files are located in a different config environment, run `kedro pipeline delete --env `. -* Pipeline unit tests in `tests/pipelines//` - - -```eval_rst -.. note:: ``kedro pipeline delete`` won't remove the entry from ``pipeline_registry.py`` if you have imported the modular pipeline there. You must remove it manually to clean up, otherwise it will break your project because the import will raise an error. -``` diff --git a/docs/source/07_extend_kedro/01_common_use_cases.md b/docs/source/07_extend_kedro/01_common_use_cases.md deleted file mode 100644 index f73bce0876..0000000000 --- a/docs/source/07_extend_kedro/01_common_use_cases.md +++ /dev/null @@ -1,132 +0,0 @@ -# Common use cases - -Kedro has a few built-in mechanisms for you to extend its behaviour. This document explains how to select which mechanism to employ for the most common use cases. - -## Use Case 1: How to add extra behaviour to Kedro's execution timeline - -The execution timeline of a Kedro pipeline can be thought of as a sequence of actions performed by various Kedro library components, such as the [DataSets](/kedro.extras.datasets), [DataCatalog](/kedro.io.DataCatalog), [Pipeline](/kedro.pipeline.Pipeline), and [Node](/kedro.pipeline.node.Node). - -At different points in the lifecycle of these components, you may want to add extra behaviour. For example, you could add extra computation for profiling purposes _before_ and _after_ a node runs or _before_ and _after_ the I/O actions of a dataset, namely the `load` and `save` actions. - -Before Kedro 0.17.0, we added a few different APIs to allow you to extend Kedro's behaviour. For example, to allow extra behaviour _before_ and _after_ a node runs, we introduced the [decorators](07_decorators.md) API. Similarly, to allow extra behaviour _before_ and _after_ dataset I/O, we introduced the [transformers](06_transformers.md) API. - -While we addressed some immediate use cases, we have since decided to provide just one, single way to extend Kedro's execution timeline: Hooks. So, from Kedro version 0.17.0, we now deprecate decorators and transformers in favour of [Hooks](./02_hooks.md), which will be the recommended approach when you need to extend Kedro's execution timeline. - -## Use Case 2: How to integrate Kedro with additional data sources - -You can use [DataSets](/kedro.extras.datasets) to interface with various different data sources. If the data source you plan to use is not supported out of the box by Kedro, you can [create a custom dataset](03_custom_datasets.md). - -## Use Case 3: How to add or modify CLI commands - -If you want to customise a built-in Kedro command, such as `kedro run`, for a specific project, add a `cli.py` file that defines a custom `run()` function. You should add the `cli.py` file at the same level as `settings.py`. A template for the `cli.py` file is in the section below. - -
-Click to expand - -``` -"""Command line tools for manipulating a Kedro project. -Intended to be invoked via `kedro`.""" -import click -from kedro.framework.cli.utils import ( - _config_file_callback, - _reformat_load_versions, - _split_params, - env_option, - split_string, CONTEXT_SETTINGS, -) - -from kedro.framework.cli.project import ( - FROM_INPUTS_HELP, TO_OUTPUTS_HELP, FROM_NODES_HELP, TO_NODES_HELP, NODE_ARG_HELP, - RUNNER_ARG_HELP, PARALLEL_ARG_HELP, ASYNC_ARG_HELP, TAG_ARG_HELP, LOAD_VERSION_HELP, - PIPELINE_ARG_HELP, CONFIG_FILE_HELP, PARAMS_ARG_HELP -) - - -@click.group(context_settings=CONTEXT_SETTINGS, name=__file__) -def cli(): - """Command line tools for manipulating a Kedro project.""" - - -@cli.command() -@click.option( - "--from-inputs", type=str, default="", help=FROM_INPUTS_HELP, callback=split_string -) -@click.option( - "--to-outputs", type=str, default="", help=TO_OUTPUTS_HELP, callback=split_string -) -@click.option( - "--from-nodes", type=str, default="", help=FROM_NODES_HELP, callback=split_string -) -@click.option( - "--to-nodes", type=str, default="", help=TO_NODES_HELP, callback=split_string -) -@click.option("--node", "-n", "node_names", type=str, multiple=True, help=NODE_ARG_HELP) -@click.option( - "--runner", "-r", type=str, default=None, multiple=False, help=RUNNER_ARG_HELP -) -@click.option("--parallel", "-p", is_flag=True, multiple=False, help=PARALLEL_ARG_HELP) -@click.option("--async", "is_async", is_flag=True, multiple=False, help=ASYNC_ARG_HELP) -@env_option -@click.option("--tag", "-t", type=str, multiple=True, help=TAG_ARG_HELP) -@click.option( - "--load-version", - "-lv", - type=str, - multiple=True, - help=LOAD_VERSION_HELP, - callback=_reformat_load_versions, -) -@click.option("--pipeline", type=str, default=None, help=PIPELINE_ARG_HELP) -@click.option( - "--config", - "-c", - type=click.Path(exists=True, dir_okay=False, resolve_path=True), - help=CONFIG_FILE_HELP, - callback=_config_file_callback, -) -@click.option( - "--params", type=str, default="", help=PARAMS_ARG_HELP, callback=_split_params -) -def run( - tag, - env, - parallel, - runner, - is_async, - node_names, - to_nodes, - from_nodes, - from_inputs, - to_outputs, - load_version, - pipeline, - config, - params, -): - """Run the pipeline.""" - - == ADD YOUR CUSTOM RUN COMMAND CODE HERE == - -``` -
- -If you want to customise a Kedro command from a command group, such as `kedro pipeline` or `kedro jupyter`, you need to import the corresponding click command group from the Kedro framework `cli`. For `kedro pipeline` commands this would be `from kedro.framework.cli.pipeline import pipeline`, and for `kedro jupyter` commands `from kedro.framework.cli.jupyter import jupyter`. -You can then add or overwrite any command by adding it to the click group, as in the snippet below: -``` -@jupyter.command("notebook") -@env_option( - help="Open a notebook" -) -def notebook_run(...): - == ADD YOUR CUSTOM NOTEBOOK COMMAND CODE HERE == -``` - -To inject additional CLI commands intended to be reused across projects, please refer to our [plugins](./04_plugins.md) system. An example of one such command is the `kedro viz` command introduced by the official [Kedro-Viz](https://github.com/quantumblacklabs/kedro-viz) plugin. This command is intended to work on every Kedro project and therefore must be a standalone plugin. - -```eval_rst -.. note:: Your plugin's implementation can take advantage of other extension mechanisms such as Hooks. -``` - -## Use Case 4: How to customise the initial boilerplate of your project - -Sometimes you might want to tailor the starting boilerplate of a Kedro project to your specific needs. For example, your organisation might have a standard CI script that you want to include in every new Kedro project. To this end, please visit our guide to [create Kedro starters](./05_create_kedro_starters.md) to solve this extension requirement. diff --git a/docs/source/07_extend_kedro/02_hooks.md b/docs/source/07_extend_kedro/02_hooks.md deleted file mode 100644 index 957d860d9d..0000000000 --- a/docs/source/07_extend_kedro/02_hooks.md +++ /dev/null @@ -1,487 +0,0 @@ -# Hooks - -## Introduction - -Hooks are a mechanism to add extra behaviour to Kedro's main execution in an easy and consistent manner. Some examples may include: - -* Adding a transformer after the data catalog is loaded -* Adding data validation to the inputs before a node runs, and to the outputs after a node has run. This makes it possible to integrate with other tools like [Great-Expectations](https://docs.greatexpectations.io/en/latest/) -* Adding machine learning metrics tracking, e.g. using [MLflow](https://mlflow.org/), throughout a pipeline run - -## Concepts - -A Hook is comprised of a Hook specification and Hook implementation. To add Hooks to your project you will need to: - -* Provide a Hook implementation for an existing Kedro-defined Hook specification -* Register your Hook implementation in the `src//settings.py` file under the `HOOKS` key - -### Hook specification - -Kedro distinguishes between 2 main types of Hooks: execution timeline and component registration. - -#### Execution timeline Hooks - -Kedro defines Hook specifications for particular execution points where users can inject additional behaviour. Currently, the following Hook specifications are provided in [kedro.framework.hooks](/kedro.framework.hooks): - -* `after_catalog_created` -* `before_node_run` -* `after_node_run` -* `on_node_error` -* `before_pipeline_run` -* `after_pipeline_run` -* `on_pipeline_error` -* `before_dataset_loaded` -* `after_dataset_loaded` -* `before_dataset_saved` -* `after_dataset_saved` - -The naming convention for non-error Hooks is `__`, in which: - -* `` and `` refers to when the Hook executed, e.g. `before was run` or `after was created`. 
-* `` refers to the relevant component in the Kedro execution timeline for which this Hook adds extra behaviour, e.g. `catalog`, `node` and `pipeline`. - -The naming convention for error hooks is `on__error`, in which: - -* `` refers to the relevant component in the Kedro execution timeline that throws the error. - -[kedro.framework.hooks](/kedro.framework.hooks) lists the full specifications for which you can inject additional behaviours by providing an implementation. - - -#### Registration Hooks - -In addition, Kedro defines Hook specifications to register certain library components to be used with the project. This is where users can define their custom class implementations. Currently, the following Hook specifications are provided: - -* `register_pipelines` -* `register_catalog` - -The naming convention for registration hooks is `register_`. - -### Hook implementation - -You should provide an implementation for the specification that describes the point at which you want to inject additional behaviour. The Hook implementation should have the same name as the specification. The Hook must provide a concrete implementation with a subset of the corresponding specification's parameters (you do not need to use them all). - -To declare a Hook implementation, use the `@hook_impl` decorator. - -For example, the full signature of the [`after_data_catalog_created`](/kedro.framework.hooks.specs.DataCatalogSpecs) Hook specification is: - -```python -@hook_spec -def after_catalog_created( - self, - catalog: DataCatalog, - conf_catalog: Dict[str, Any], - conf_creds: Dict[str, Any], - save_version: str, - load_versions: Dict[str, str], - run_id: str, -) -> None: - pass -``` - -However, if you just want to use this Hook to add transformer for a data catalog after it is created, your Hook implementation can be as simple as: - -```python -# /src//hooks.py -from kedro.extras.transformers.time_profiler import ProfileTimeTransformer -from kedro.framework.hooks import hook_impl -from kedro.io import DataCatalog - - -class TransformerHooks: - @hook_impl - def after_catalog_created(self, catalog: DataCatalog) -> None: - catalog.add_transformer(ProfileTimeTransformer()) -``` - -```eval_rst -.. note:: The name of a module that contains Hooks implementation is arbitrary and is not restricted to ``hooks.py``. -``` - -We recommend that you group related Hook implementations under a namespace, preferably a class, within a `hooks.py` file in your project. - -#### Registering your Hook implementations with Kedro - -Hook implementations should be registered with Kedro using the `/src//settings.py` file under the `HOOKS` key. - -You can register more than one implementation for the same specification. They will be called in LIFO (last-in, first-out) order. - -The following example sets up a Hook so that the `after_data_catalog_created` implementation is called every time after a data catalog is created. - -```python -# /src//settings.py -from .hooks import ProjectHooks, TransformerHooks - -HOOKS = (ProjectHooks(), TransformerHooks()) -``` - -Kedro also has auto-discovery enabled by default. This means that any installed plugins that declare a Hooks entry-point will be registered. To learn more about how to enable this for your custom plugin, see our [plugin development guide](04_plugins.md#hooks). - -```eval_rst -.. note:: Auto-discovered Hooks will run *first*, followed by the ones specified in `settings.py`. 
-``` - - -#### Disable auto-registered plugins' Hooks - -Auto-registered plugins' Hooks can be disabled via `settings.py` as follows: - -```python -# /src//settings.py - -DISABLE_HOOKS_FOR_PLUGINS = ("",) -``` - -where `` is the name of an installed plugin for which the auto-registered Hooks must be disabled. - -## Common use cases - -### Use Hooks to extend a node's behaviour - -Prior to Kedro 0.16, to add extra behaviour before and after a node's execution, we recommended using [decorators](07_decorators.md) on individual nodes. We also exposed a convenience method to apply decorators to [all nodes in a `Pipeline`](07_decorators.md#how-to-apply-a-decorator-to-nodes). - -However, after the introduction of Hooks in 0.16, this capability is readily available through the [`before_node_run` and `after_node_run` Hooks](/kedro.framework.hooks.specs.NodeSpecs). Furthermore, you can apply extra behaviour to not only an individual node or an entire Kedro pipeline, but also to a _subset_ of nodes based on their tags or namespaces. For example, let's say we want to add the following extra behaviours to a node: - -```python -from kedro.pipeline.node import Node - - -def say_hello(node: Node): - """An extra behaviour for a node to say hello before running.""" - print(f"Hello from {node.name}") -``` - -Then you can either add it to a single node based on the node's name: - -```python -# /src//hooks.py - -from kedro.framework.hooks import hook_impl -from kedro.pipeline.node import Node - - -class ProjectHooks: - @hook_impl - def before_node_run(self, node: Node): - # adding extra behaviour to a single node - if node.name == "hello": - say_hello(node) -``` - -Or add it to a group of nodes based on their tags: - - -```python -# /src//hooks.py - -from kedro.framework.hooks import hook_impl -from kedro.pipeline.node import Node - - -class ProjectHooks: - @hook_impl - def before_node_run(self, node: Node): - if "hello" in node.tags: - say_hello(node) -``` - -Or add it to all nodes in the entire pipeline: - -```python -# /src//hooks.py - -from kedro.framework.hooks import hook_impl -from kedro.pipeline.node import Node - - -class ProjectHooks: - @hook_impl - def before_node_run(self, node: Node): - # adding extra behaviour to all nodes in the pipeline - say_hello(node) -``` - -If your use case takes advantage of a decorator, for example to retry a node's execution using a library such as [tenacity](https://tenacity.readthedocs.io/en/latest/), you can still decorate the node's function directly: - -```python -from tenacity import retry - - -@retry -def my_flaky_node_function(): - ... -``` - -Or applying it in the `before_node_run` Hook as follows: - -```python -# /src//hooks.py -from tenacity import retry - -from kedro.framework.hooks import hook_impl -from kedro.pipeline.node import Node - - -class ProjectHooks: - @hook_impl - def before_node_run(self, node: Node): - # adding retrying behaviour to nodes tagged as flaky - if "flaky" in node.tags: - node.func = retry(node.func) -``` -### Use Hooks to customise the dataset load and save methods -From Kedro 0.18.0 [Transformers](06_transformers.md) will be deprecated and we recommend using the `before_dataset_loaded`/`after_dataset_loaded` and `before_dataset_saved`/`after_dataset_saved` Hooks to customise the dataset `load` and `save` methods where appropriate. 
- -For example, you can add logging about the dataset load runtime as follows: - -```python -@property -def _logger(self): - return logging.getLogger(self.__class__.__name__) - - -@hook_impl -def before_dataset_loaded(self, dataset_name: str) -> None: - start = time.time() - logging.info("Loading dataset %s started at %0.3f", dataset_name, start) - - -@hook_impl -def after_dataset_loaded(self, dataset_name: str, data: Any) -> None: - end = time.time() - logging.info("Loading dataset %s ended at %0.3f", dataset_name, end) -``` - -## Under the hood - -Under the hood, we use [pytest's pluggy](https://pluggy.readthedocs.io/en/latest/) to implement Kedro's Hook mechanism. We recommend reading their documentation if you have more questions about the underlying implementation. - -## Hooks examples - -### Add data validation - -This example adds data validation to node inputs and outputs using [Great Expectations](https://docs.greatexpectations.io/en/latest/). - -* Install dependencies: - -```console -pip install great-expectations -``` - -* Implement `before_node_run` and `after_node_run` Hooks to validate inputs and outputs data respectively leveraging `Great Expectations`: - -```python -# /src//hooks.py -from typing import Any, Dict - -from kedro.framework.hooks import hook_impl -from kedro.io import DataCatalog - -import great_expectations as ge - - -class DataValidationHooks: - - # Map expectation to dataset - DATASET_EXPECTATION_MAPPING = { - "companies": "raw_companies_dataset_expectation", - "preprocessed_companies": "preprocessed_companies_dataset_expectation", - } - - @hook_impl - def before_node_run( - self, catalog: DataCatalog, inputs: Dict[str, Any], run_id: str - ) -> None: - """Validate inputs data to a node based on using great expectation - if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``. - """ - self._run_validation(catalog, inputs, run_id) - - @hook_impl - def after_node_run( - self, catalog: DataCatalog, outputs: Dict[str, Any], run_id: str - ) -> None: - """Validate outputs data from a node based on using great expectation - if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``. - """ - self._run_validation(catalog, outputs, run_id) - - def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any], run_id: str): - for dataset_name, dataset_value in data.items(): - if dataset_name not in self.DATASET_EXPECTATION_MAPPING: - continue - - dataset = catalog._get_dataset(dataset_name) - dataset_path = str(dataset._filepath) - expectation_suite = self.DATASET_EXPECTATION_MAPPING[dataset_name] - - expectation_context = ge.data_context.DataContext() - batch = expectation_context.get_batch( - {"path": dataset_path, "datasource": "files_datasource"}, - expectation_suite, - ) - expectation_context.run_validation_operator( - "action_list_operator", assets_to_validate=[batch], run_id=run_id - ) -``` - -* Register Hooks implementation, as described [above](#registering-your-hook-implementations-with-kedro) and run Kedro. - -`Great Expectations` example report: - -![](../meta/images/data_validation.png) - -### Add observability to your pipeline - -This example adds observability to your pipeline using [statsd](https://statsd.readthedocs.io/en/v3.3/configure.html) and makes it possible to visualise dataset size and node execution time using [Grafana](https://grafana.com/). 
- -* Install dependencies: - -```console -pip install statsd -``` - -* Implement `before_node_run` and `after_node_run` Hooks to collect metrics (DataSet size and node execution time): - -```python -# /src//hooks.py -import sys -from typing import Any, Dict - -import statsd -from kedro.framework.hooks import hook_impl -from kedro.pipeline.node import Node - - -class PipelineMonitoringHooks: - def __init__(self): - self._timers = {} - self._client = statsd.StatsClient(prefix="kedro") - - @hook_impl - def before_node_run(self, node: Node) -> None: - node_timer = self._client.timer(node.name) - node_timer.start() - self._timers[node.short_name] = node_timer - - @hook_impl - def after_node_run(self, node: Node, inputs: Dict[str, Any]) -> None: - self._timers[node.short_name].stop() - for dataset_name, dataset_value in inputs.items(): - self._client.gauge(dataset_name + "_size", sys.getsizeof(dataset_value)) - - @hook_impl - def after_pipeline_run(self): - self._client.incr("run") -``` - -* Register Hooks implementation, as described [above](#registering-your-hook-implementations-with-kedro) and run Kedro. - -`Grafana` example page: - -![](../meta/images/pipeline_observability.png) - -### Add metrics tracking to your model - -This examples adds metrics tracking using [MLflow](https://mlflow.org/). - -* Install dependencies: - -```console -pip install mlflow -``` - -* Implement `before_pipeline_run`, `after_pipeline_run` and `after_node_run` Hooks to collect metrics using `MLflow`: - -```python -# /src//hooks.py -from typing import Any, Dict - -import mlflow -import mlflow.sklearn -from kedro.framework.hooks import hook_impl -from kedro.pipeline.node import Node - - -class ModelTrackingHooks: - """Namespace for grouping all model-tracking hooks with MLflow together.""" - - @hook_impl - def before_pipeline_run(self, run_params: Dict[str, Any]) -> None: - """Hook implementation to start an MLflow run - with the same run_id as the Kedro pipeline run. - """ - mlflow.start_run(run_name=run_params["run_id"]) - mlflow.log_params(run_params) - - @hook_impl - def after_node_run( - self, node: Node, outputs: Dict[str, Any], inputs: Dict[str, Any] - ) -> None: - """Hook implementation to add model tracking after some node runs. - In this example, we will: - * Log the parameters after the data splitting node runs. - * Log the model after the model training node runs. - * Log the model's metrics after the model evaluating node runs. - """ - if node._func_name == "split_data": - mlflow.log_params( - {"split_data_ratio": inputs["params:example_test_data_ratio"]} - ) - - elif node._func_name == "train_model": - model = outputs["example_model"] - mlflow.sklearn.log_model(model, "model") - mlflow.log_params(inputs["parameters"]) - - @hook_impl - def after_pipeline_run(self) -> None: - """Hook implementation to end the MLflow run - after the Kedro pipeline finishes. - """ - mlflow.end_run() -``` - -* Register Hooks implementation, as described [above](#registering-your-hook-implementations-with-kedro) and run Kedro. - -`MLflow` example page: - -![](../meta/images/mlflow.png) - -### Modify node inputs using `before_node_run` hook - -If the `before_node_run` hook is implemented _and_ returns a dictionary, that dictionary is used to update the corresponding node inputs. 
- -For example, if a pipeline contains a node named `my_node`, which takes 2 inputs: `first_input` and `second_input`, to overwrite the value of `first_input` that is passed to `my_node`, we can implement the following hook: - -```python -from typing import Any, Dict, Optional - -from kedro.framework.hooks import hook_impl -from kedro.pipeline.node import Node -from kedro.io import DataCatalog - - -class NodeInputReplacementHook: - @hook_impl - def before_node_run( - self, node: Node, catalog: DataCatalog - ) -> Optional[Dict[str, Any]]: - """Replace `first_input` for `my_node`""" - if node.name == "my_node": - # return the string filepath to the `first_input` dataset - # instead of the underlying data - dataset_name = "first_input" - filepath = catalog._get_dataset(dataset_name)._filepath - return {"first_input": filepath} # `second_input` is not affected - return None -``` - -Node input overwrites implemented in `before_node_run` affect only a specific node and do not modify the corresponding datasets in the `DataCatalog`. - - -```eval_rst -.. note:: In the example above, the ``before_node_run`` hook implementation must return datasets present in the ``inputs`` dictionary. If they are not in ``inputs``, the node fails with the following error: ``Node expected X input(s) , but got the following Y input(s) instead: ``. -``` - - -To apply the changes once you have implemented a new hook, you need to register it, as described [above](#registering-your-hook-implementations-with-kedro), and then run Kedro. diff --git a/docs/source/07_extend_kedro/04_plugins.md b/docs/source/07_extend_kedro/04_plugins.md deleted file mode 100644 index 2bc63c88c6..0000000000 --- a/docs/source/07_extend_kedro/04_plugins.md +++ /dev/null @@ -1,184 +0,0 @@ -# Kedro plugins - - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -Kedro plugins allow you to create new features for Kedro and inject additional commands into the CLI. Plugins are developed as separate Python packages that exist outside of any Kedro project. - -## Overview - -Kedro uses [`setuptools`](https://setuptools.readthedocs.io/en/latest/setuptools.html), which is a collection of enhancements to the Python `distutils` to allow developers to build and distribute Python packages. Kedro uses various entry points in [`pkg_resources`](https://setuptools.readthedocs.io/en/latest/setuptools.html#dynamic-discovery-of-services-and-plugins) to provide plugin functionality. 
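-To make the discovery mechanism concrete, the snippet below is an illustrative sketch (not Kedro's actual implementation) of how objects registered by installed plugins can be looked up through `pkg_resources` entry points; the `kedro.project_commands` group used here is the same one described for project commands later on this page.
-
-```python
-# Illustrative sketch of entry-point discovery, not Kedro's own code.
-import pkg_resources
-
-for entry_point in pkg_resources.iter_entry_points(group="kedro.project_commands"):
-    # `load()` imports and returns whatever object the plugin registered,
-    # for example a `click` command group.
-    command_group = entry_point.load()
-    print(f"Discovered plugin command group: {entry_point.name} -> {command_group}")
-```
-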
- -## Example of a simple plugin - -Here is a simple example of a plugin that prints the pipeline as JSON: - -`kedrojson/plugin.py` - -```python -import click -from kedro.framework.session import KedroSession - - -@click.group(name="JSON") -def commands(): - """Kedro plugin for printing the pipeline in JSON format""" - pass - - -@commands.command() -@click.pass_obj -def to_json(metadata): - """Display the pipeline in JSON format""" - session = KedroSession.create(metadata.package_name) - context = session.load_context() - print(context.pipeline.to_json()) -``` - -The plugin provides the following `entry_points` config in `setup.py`: - -```python -setup( - entry_points={"kedro.project_commands": ["kedrojson = kedrojson.plugin:commands"]} -) -``` - -Once the plugin is installed, you can run it as follows: -```bash -kedro to_json -``` - -## Working with `click` - -Commands must be provided as [`click` `Groups`](https://click.palletsprojects.com/en/7.x/api/#click.Group) - -The `click Group` will be merged into the main CLI Group. In the process, the options on the group are lost, as is any processing that was done as part of its callback function. - - -## Project context - -When they run, plugins may request information about the current project by creating a session and loading its context: - -```python -from pathlib import Path - -from kedro.framework.startup import _get_project_metadata -from kedro.framework.session import KedroSession - - -project_path = Path.cwd() -metadata = _get_project_metadata(project_path) -session = KedroSession.create(metadata.package_name, project_path) -context = session.load_context() -``` - -## Initialisation - -If the plugin initialisation needs to occur prior to Kedro starting, it can declare the `entry_point` key `kedro.init`. This entry point must refer to a function that currently has no arguments, but for future proofing you should declare it with `**kwargs`. - -## `global` and `project` commands - -Plugins may also add commands to the Kedro CLI, which supports two types of commands: - -* _global_ - available both inside and outside a Kedro project. Global commands use the `entry_point` key `kedro.global_commands`. -* _project_ - available only when a Kedro project is detected in the current directory. Project commands use the `entry_point` key `kedro.project_commands`. - -## Suggested command convention - -We use the following command convention: `kedro `, with `kedro ` acting as a top-level command group. This is our suggested way of structuring your plugin bit it is not necessary for your plugin to work. - -## Hooks - -You can develop hook implementations and have them automatically registered to the project context when the plugin is installed. To enable this for your custom plugin, simply add the following entry in your `setup.py`: - -```python -setup(entry_points={"kedro.hooks": ["plugin_name = plugin_name.plugin:hooks"]}) -``` - -where `plugin.py` is the module where you declare hook implementations: - -```python -import logging - -from kedro.framework.hooks import hook_impl - - -class MyHooks: - @hook_impl - def after_catalog_created(self, catalog): # pylint: disable=unused-argument - logging.info("Reached after_catalog_created hook") - - -hooks = MyHooks() -``` - -```eval_rst -.. note:: ``hooks`` should be an instance of the class defining the Hooks. -``` - -## CLI Hooks - -You can also develop hook implementations to extend Kedro's CLI behaviour in your plugin. 
To find available CLI hooks, please visit [kedro.framework.cli.hooks](/kedro.framework.cli.hooks). To register CLI hooks developed in your plugin with Kedro, add the following entry in your project's `setup.py`: - -```python -setup(entry_points={"kedro.cli_hooks": ["plugin_name = plugin_name.plugin:cli_hooks"]}) -``` - -where `plugin.py` is the module where you declare hook implementations: - -```python -import logging - -from kedro.framework.cli.hooks import cli_hook_impl - - -class MyCLIHooks: - @cli_hook_impl - def before_command_run(self, project_metadata, command_args): - logging.info( - "Command %s will be run for project %s", command_args, project_metadata - ) - - -cli_hooks = MyCLIHooks() -``` - -## Contributing process - -When you are ready to submit your code: - -1. Create a separate repository using our naming convention for `plugin`s (`kedro-`) -2. Choose a command approach: `global` and / or `project` commands: - - All `global` commands should be provided as a single `click` group - - All `project` commands should be provided as another `click` group - - The `click` groups are declared through the [`pkg_resources` entry_point system](https://setuptools.readthedocs.io/en/latest/setuptools.html#dynamic-discovery-of-services-and-plugins) -3. Include a `README.md` describing your plugin's functionality and all dependencies that should be included -4. Use GitHub tagging to tag your plugin as a `kedro-plugin` so that we can find it - -## Supported Kedro plugins - -- [Kedro-Docker](https://github.com/quantumblacklabs/kedro-docker), a tool for packaging and shipping Kedro projects within containers -- [Kedro-Airflow](https://github.com/quantumblacklabs/kedro-airflow), a tool for converting your Kedro project into an Airflow project -- [Kedro-Viz](https://github.com/quantumblacklabs/kedro-viz), a tool for visualising your Kedro pipelines - -## Community-developed plugins - -See the full list of plugins using the GitHub tag [kedro-plugin](https://github.com/topics/kedro-plugin). - - -```eval_rst -.. note:: Your plugin needs to have an `Apache 2.0 compatible license `_ to be considered for this list. 
-``` - -- [Kedro-Pandas-Profiling](https://github.com/BrickFrog/kedro-pandas-profiling), by [Justin Malloy](https://github.com/BrickFrog), uses [Pandas Profiling](https://github.com/pandas-profiling/pandas-profiling) to profile datasets in the Kedro catalog -- [find-kedro](https://github.com/WaylonWalker/find-kedro), by [Waylon Walker](https://github.com/WaylonWalker), automatically constructs pipelines using `pytest`-style pattern matching -- [kedro-static-viz](https://github.com/WaylonWalker/kedro-static-viz), by [Waylon Walker](https://github.com/WaylonWalker), generates a static [Kedro-Viz](https://github.com/quantumblacklabs/kedro-viz) site (HTML, CSS, JS) -- [steel-toes](https://github.com/WaylonWalker/steel-toes), by [Waylon Walker](https://github.com/WaylonWalker), prevents stepping on toes by automatically branching data paths -- [kedro-wings](https://github.com/tamsanh/kedro-wings), by [Tam-Sanh Nguyen](https://github.com/tamsanh), simplifies and speeds up pipeline creation by auto-generating catalog datasets -- [kedro-great](https://github.com/tamsanh/kedro-great), by [Tam-Sanh Nguyen](https://github.com/tamsanh), integrates Kedro with [Great Expectations](https://greatexpectations.io), enabling catalog-based expectation generation and data validation on pipeline run -- [Kedro-Accelerator](https://github.com/deepyaman/kedro-accelerator), by [Deepyaman Datta](https://github.com/deepyaman), speeds up pipelines by parallelizing I/O in the background -- [kedro-dataframe-dropin](https://github.com/mzjp2/kedro-dataframe-dropin), by [Zain Patel](https://github.com/mzjp2), lets you swap out pandas datasets for modin or RAPIDs equivalents for specialised use to speed up your workflows (e.g on GPUs) -- [kedro-kubeflow](https://github.com/getindata/kedro-kubeflow), by [Mateusz Pytel](https://github.com/em-pe) and [Mariusz Strzelecki](https://github.com/szczeles), lets you run and schedule pipelines on Kubernetes clusters using [Kubeflow Pipelines](https://www.kubeflow.org/docs/pipelines/overview/pipelines-overview/) -- [kedro-mlflow](https://github.com/Galileo-Galilei/kedro-mlflow), by [Yolan Honoré-Rougé](https://github.com/galileo-galilei), and [Takieddine Kadiri](https://github.com/takikadiri) facilitates [Mlflow](https://www.mlflow.org/) integration inside Kedro projects while enforcing [Kedro's principles](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#what-are-the-primary-advantages-of-kedro). Its main features are modular configuration, automatic parameters tracking, datasets versioning, Kedro pipelines packaging and serving and automatic synchronization between training and inference pipelines for high reproducibility of machine learning experiments and ease of deployment. A tutorial is provided in the [kedro-mlflow-tutorial repo](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial). diff --git a/docs/source/07_extend_kedro/05_create_kedro_starters.md b/docs/source/07_extend_kedro/05_create_kedro_starters.md deleted file mode 100644 index f35394e73b..0000000000 --- a/docs/source/07_extend_kedro/05_create_kedro_starters.md +++ /dev/null @@ -1,90 +0,0 @@ -# Create a Kedro starter - -Kedro starters are used to create projects that contain code to run as-is, or to adapt and extend. A good example is the Iris dataset example of basic Kedro project layout, configuration and initialisation code. A team may find it useful to build Kedro starters to create reusable projects that bootstrap a common base and can be extended. 
- -A Kedro starter is a [Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/) template that contains the boilerplate code for a Kedro project. - -## How to create a Kedro starter - -To create a Kedro starter, you need a base project to convert to a `cookiecutter` template, which forms the boilerplate for all projects that use the Kedro starter. - -Install `cookiecutter` as follows: - -```bash -pip install cookiecutter -``` - -You then need to decide which are: - -* the common, boilerplate parts of the project -* the configurable elements, which need to be replaced by `cookiecutter` strings - -## Configuration variables - -By default, when you create a new project using a Kedro starter, `kedro new` launches in interactive mode. The user is then prompted for the variables that have been set in `prompts.yml`. - -The most basic and empty starter triggered by `kedro new` is set up with the following three variables: - -* `project_name` - A human readable name for the new project -* `repo_name` - A name for the directory that holds the project repository -* `python_package` - A Python package name for the project package (see [Python package naming conventions](https://www.python.org/dev/peps/pep-0008/#package-and-module-names)) - -See the configuration for this basic configuration in [the default starter setup](https://github.com/quantumblacklabs/kedro/blob/master/kedro/templates/project/prompts.yml). - -As the creator of the Kedro starter you can customise the prompts triggered by `kedro new` by adding your own prompts in `prompts.yml`. This is an example of a custom prompt: - -```yaml -custom_prompt: - title: "Prompt title:" - text: | - Prompt description that explains to the user what - information they should provide. -``` - -At the very least, the prompt `title` must be defined for the prompt to be valid. After Kedro gets the user's input for each prompt, we pass the value to [`cookiecutter`](https://cookiecutter.readthedocs.io/en/1.7.2/), so every key in your `prompts.yml` must have a corresponding key in [`cookiecutter.json`](https://cookiecutter.readthedocs.io/en/1.7.2/tutorial1.html#cookiecutter-json). - -If the input to the prompts needs to be **validated**, for example to make sure it only has alphanumeric characters, you can add regex validation rules via the `regex_validator` key. For more complex validation, have a look at [cookiecutter pre/post-generate hooks](https://cookiecutter.readthedocs.io/en/1.7.2/advanced/hooks.html#using-pre-post-generate-hooks-0-7-0). - -If you want `cookiecutter` to provide sensible **defaults** in case a user doesn't provide any input, you can add those to `cookiecutter.json`. See the default starter [`cookiecutter.json`](https://github.com/quantumblacklabs/kedro/blob/master/kedro/templates/project/cookiecutter.json) as example. - -### Example Kedro starter - -To review an example Kedro starter, clone [`pandas-iris`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pandas-iris) from Github. - -When you create an Iris dataset example project by calling `kedro new`, you supply three configuration variables as the documentation in [Create a new project](../02_get_started/04_new_project.md) describes. These variables are `project_name`, `repo_name` and `python_package` and they are supplied interactively or by means of a configuration file. 
You can see how these variables are used by inspecting the template: - -**project_name** - -The human-readable `project-name` variable is used in the [README.md](https://github.com/quantumblacklabs/kedro-starters/tree/master/pandas-iris/README.md) for the new project. - -**repo_name** - -The project structure contains a folder labelled [`{{ cookiecutter.repo_name }}`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pandas-iris/%7B%7B%20cookiecutter.repo_name%20%7D%7D), which forms the top-level folder to contain the Iris dataset example when it is created. The folder storing the example project is represented by `cookiecutter.repo_name`, which is a customisable variable, as you would expect. - -**python_package** - -Within the parent folder, inside the `src` subfolder, is another configurable variable [{{ cookiecutter.python_package }}](https://github.com/quantumblacklabs/kedro-starters/tree/master/pandas-iris/%7B%7B%20cookiecutter.repo_name%20%7D%7D/src/%7B%7B%20cookiecutter.python_package%20%7D%7D) which contains the source code for the example pipelines. The variable is also used within [`__main__.py`](https://github.com/quantumblacklabs/kedro-starters/tree/master/pandas-iris/%7B%7B%20cookiecutter.repo_name%20%7D%7D/src/%7B%7B%20cookiecutter.python_package%20%7D%7D/__main__.py). - -Here is the layout of the project as a Cookiecutter template: - -``` -{{ cookiecutter.repo_name }} # Parent directory of the template -├── conf # Project configuration files -├── data # Local project data (not committed to version control) -├── docs # Project documentation -├── logs # Project output logs (not committed to version control) -├── notebooks # Project related Jupyter notebooks (can be used for experimental code before moving the code to src) -├── README.md # Project README -├── setup.cfg # Configuration options for tools e.g. `pytest` or `flake8` -└── src # Project source code - └── {{ cookiecutter.python_package }} - ├── __init.py__ - ├── hooks.py - ├── pipelines - ├── pipeline_registry.py - ├── __main__.py - └── settings.py - ├── requirements.txt - ├── setup.py - └── tests -``` diff --git a/docs/source/07_extend_kedro/06_transformers.md b/docs/source/07_extend_kedro/06_transformers.md deleted file mode 100644 index 785b790370..0000000000 --- a/docs/source/07_extend_kedro/06_transformers.md +++ /dev/null @@ -1,131 +0,0 @@ -# Dataset transformers (deprecated) - -```eval_rst -.. warning:: The transformer API will be deprecated in 0.18.0. We recommend using the ``before_dataset_loaded``/``after_dataset_loaded`` and ``before_dataset_saved``/``after_dataset_saved`` Hooks to customise the dataset ``load`` and ``save`` methods where appropriate. -``` - -As we describe in the [documentation about how Kedro works with data](../05_data/01_data_catalog.md#transforming-datasets), Kedro transformers intercept the load and save operations on Kedro `DataSet`s. - -Use cases for Kedro transformers include: - - - Data validation - - Operation performance tracking - - Data format conversion (although we would recommend [Transcoding](../05_data/01_data_catalog.md#transcoding-datasets) for this) - -### Develop your own dataset transformer - -To illustrate the use case for operation performance tracking, this section demonstrates how to build a transformer to track memory consumption. In fact, Kedro provides a built-in memory profiler, but this example shows how to build your own, using [memory-profiler](https://github.com/pythonprofilers/memory_profiler). - - -```eval_rst -.. 
note:: To work with this example, you need to ``pip install memory_profiler`` before you start.
-```
-
-A custom transformer should:
-
-* Inherit from the `kedro.io.AbstractTransformer` base class
-* Implement the `load` and `save` methods
-
-Within the project in which you want to use the transformer, create a file in `src//` called `memory_profile.py` and paste the following code into it:
-
-
-Click to expand - -```python -import logging -from typing import Callable, Any - -from kedro.io import AbstractTransformer -from memory_profiler import memory_usage - - -def _normalise_mem_usage(mem_usage): - # memory_profiler < 0.56.0 returns list instead of float - return mem_usage[0] if isinstance(mem_usage, (list, tuple)) else mem_usage - - -class ProfileMemoryTransformer(AbstractTransformer): - """A transformer that logs the maximum memory consumption during load and save calls""" - - @property - def _logger(self): - return logging.getLogger(self.__class__.__name__) - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - mem_usage, data = memory_usage( - (load, [], {}), - interval=0.1, - max_usage=True, - retval=True, - include_children=True, - ) - # memory_profiler < 0.56.0 returns list instead of float - mem_usage = _normalise_mem_usage(mem_usage) - - self._logger.info( - "Loading %s consumed %2.2fMiB memory at peak time", data_set_name, mem_usage - ) - return data - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - mem_usage = memory_usage( - (save, [data], {}), - interval=0.1, - max_usage=True, - retval=False, - include_children=True, - ) - mem_usage = _normalise_mem_usage(mem_usage) - - self._logger.info( - "Saving %s consumed %2.2fMiB memory at peak time", data_set_name, mem_usage - ) -``` -
- -Next, you need to update `TransformerHooks` to apply your custom transformer. Add the following to a `hooks.py` file in your project. - -
-Click to expand - -```python -... -from .memory_profile import ProfileMemoryTransformer # new import - - -class TransformerHooks: - @hook_impl - def after_catalog_created(self, catalog: DataCatalog) -> None: - catalog.add_transformer(ProfileTimeTransformer()) - - # as memory tracking is quite time-consuming, for demonstration purposes - # let's apply profile_memory only to the model_input_table - catalog.add_transformer(ProfileMemoryTransformer(), "model_input_table") -``` -
- -Finally, update `HOOKS` variable in `settings.py` as follows: - -```python -HOOKS = (TransformerHooks(),) -``` - -Then re-run the pipeline: - -```console -$ kedro run -``` - -The output should look similar to the following: - -``` -... -2019-11-13 15:55:01,674 - kedro.io.data_catalog - INFO - Saving data to `model_input_table` (CSVDataSet)... -2019-11-13 15:55:12,322 - ProfileMemoryTransformer - INFO - Saving model_input_table consumed 606.98MiB memory at peak time -2019-11-13 15:55:12,322 - ProfileTimeTransformer - INFO - Saving model_input_table took 10.648 seconds -2019-11-13 15:55:12,357 - kedro.runner.sequential_runner - INFO - Completed 3 out of 6 tasks -2019-11-13 15:55:12,358 - kedro.io.data_catalog - INFO - Loading data from `model_input_table` (CSVDataSet)... -2019-11-13 15:55:13,933 - ProfileMemoryTransformer - INFO - Loading model_input_table consumed 533.05MiB memory at peak time -2019-11-13 15:55:13,933 - ProfileTimeTransformer - INFO - Loading model_input_table took 1.576 seconds -... -``` diff --git a/docs/source/07_extend_kedro/07_decorators.md b/docs/source/07_extend_kedro/07_decorators.md deleted file mode 100644 index 64d9ff37ea..0000000000 --- a/docs/source/07_extend_kedro/07_decorators.md +++ /dev/null @@ -1,126 +0,0 @@ -# Decorators (deprecated) - -```eval_rst -.. warning:: The decorator API will be deprecated in 0.18.0. We recommend using Hooks to extend a node's behaviour. -``` - -A decorator is a computation that runs before and after execution. You can apply [Python decorators](https://wiki.python.org/moin/PythonDecorators) to Kedro nodes or an entire Kedro pipeline. - -## How to apply a decorator to nodes - -This example illustrates decorators that modify the first string argument of a given function: - -```python -from functools import wraps -from typing import Callable - - -def apply_f(func: Callable) -> Callable: - @wraps(func) - def with_f(*args, **kwargs): - return func(*["f({})".format(a) for a in args], **kwargs) - - return with_f - - -def apply_g(func: Callable) -> Callable: - @wraps(func) - def with_g(*args, **kwargs): - return func(*["g({})".format(a) for a in args], **kwargs) - - return with_g - - -def apply_h(func: Callable) -> Callable: - @wraps(func) - def with_h(*args, **kwargs): - return func(*["h({})".format(a) for a in args], **kwargs) - - return with_h -``` - -To make sure that `apply_f` is applied to every function call, including within Kedro nodes: - -```python -@apply_f -def say_hello(name): - print("Hello {}!".format(name)) - - -hello_node = node(say_hello, "name", None) -hello_node.run(dict(name="Kedro")) -``` - -`Output`: - -```console -In [3]: hello_node.run(dict(name="Kedro")) -Hello f(Kedro)! -Out[3]: {} -``` - -If you want to apply an additional decorator to the same function, but just for another node: - -```python -hello_node_wrapped = node(apply_g(say_hello), "name", None) - -hello_node.run(dict(name="Kedro")) -hello_node_wrapped.run(dict(name="Kedro")) -``` - -`Output`: - -```console -Hello f(Kedro)! -Hello f(g(Kedro))! -Out[4]: {} -``` - -## How to apply multiple decorators to nodes - -You can also provide a list of decorators as shown here: - -```python -hello_wrapped = node(apply_g(apply_h(say_hello)), "name", None) -hello_decorated = hello_node.decorate(apply_g, apply_h) - -hello_wrapped.run(dict(name="Kedro")) -hello_decorated.run(dict(name="Kedro")) -``` - -`Output`: - -```console -Hello f(h(g(Kedro)))! -Hello f(h(g(Kedro)))! 
-``` - -## How to apply a decorator to a pipeline - -Decorators can also be useful for monitoring your pipeline. You can apply one or more decorators to an entire pipeline, much as you do for a node. - -For example, if you want to apply the decorator above to all pipeline nodes simultaneously: - -```python -hello_pipeline = Pipeline( - [node(say_hello, "name1", None), node(say_hello, "name2", None)] -).decorate(apply_g, apply_h) - -SequentialRunner().run( - hello_pipeline, DataCatalog({}, dict(name1="Kedro", name2="Python")) -) -``` - -`Output`: - -```console -Hello f(h(g(Kedro)))! -Hello f(h(g(Python)))! -Out[9]: {} -``` - -## Kedro decorators - -Kedro currently has one built-in decorator: `log_time`, which logs the time taken to execute a node. You can find it in [`kedro.pipeline.decorators`](/kedro.pipeline.decorators.log_time). - -Other decorators can be found in [`kedro.extras.decorators`](/kedro.extras.decorators), for which you will need to install the required dependencies. diff --git a/docs/source/08_logging/01_logging.md b/docs/source/08_logging/01_logging.md deleted file mode 100644 index 578a21b813..0000000000 --- a/docs/source/08_logging/01_logging.md +++ /dev/null @@ -1,37 +0,0 @@ -# Logging - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -Kedro uses, and facilitates, the use of Python’s `logging` library by providing a default logging configuration. This can be found in `conf/base/logging.yml` in every project generated using Kedro’s CLI `kedro new` command. - -## Configure logging - -You can customise project logging in `conf//logging.yml` using [standard Kedro mechanisms for handling configuration](../04_kedro_project_setup/02_configuration.md). The configuration should comply with the guidelines from the `logging` library. Find more about it in [the documentation for `logging` module](https://docs.python.org/3/library/logging.html). - -## Use logging - -After reading and applying project logging configuration, `kedro` will start emitting the logs automatically. To log your own code, you are advised to do the following: - -```python -import logging - -log = logging.getLogger(__name__) -log.warning("Issue warning") -log.info("Send information") -``` - -## Logging for `anyconfig` - -By default, [anyconfig](https://github.com/ssato/python-anyconfig) library that is used by `kedro` to read configuration files emits a log message with `INFO` level on every read. To reduce the amount of logs being sent for CLI calls, default project logging configuration in `conf/base/logging.yml` sets the level for `anyconfig` logger to `WARNING`. - -If you would like `INFO` level messages to propagate, you can update `anyconfig` logger level in `conf/base/logging.yml` as follows: - -```yaml -loggers: - anyconfig: - level: INFO # change - handlers: [console, info_file_handler, error_file_handler] - propagate: no -``` diff --git a/docs/source/09_development/03_commands_reference.md b/docs/source/09_development/03_commands_reference.md deleted file mode 100644 index 358d672405..0000000000 --- a/docs/source/09_development/03_commands_reference.md +++ /dev/null @@ -1,418 +0,0 @@ -# Kedro's command line interface - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. 
-``` - -Kedro's command line interface (CLI) is used to give commands to Kedro via a terminal shell (such as the terminal app on macOS, or cmd.exe or PowerShell on Windows). You need to use the CLI to set up a new Kedro project, and to run it. - -### Autocompletion (optional) - -If you are using macOS or Linux, you can set up your shell to autocomplete `kedro` commands. If you don't know the type of shell you are using, first type the following: - -```bash -echo $0 -``` - -
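-The exact output depends on your shell and how it was started, but it typically looks something like the following (Bash shown here as an illustration):
-
-```console
-$ echo $0
-/bin/bash
-```
-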
-If you are using Bash (click to expand) -
-Add the following to your ~/.bashrc (or just run it on the command line): - -```bash -eval "$(_KEDRO_COMPLETE=source kedro)" -``` -
- -
-If you are using Z shell (Zsh) (click to expand)
-
-Add the following to ~/.zshrc: - -```bash -eval "$(_KEDRO_COMPLETE=source_zsh kedro)" -``` -
- -
-If you are using Fish (click to expand) -
-Add the following to ~/.config/fish/completions/foo-bar.fish: - -```bash -eval (env _KEDRO_COMPLETE=source_fish kedro) -``` -
- -### Invoke Kedro CLI from Python (optional) -You can invoke the Kedro CLI as a Python module: - -```bash -python -m kedro -``` - -## Kedro commands -Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. Project-specific commands are called from within a project directory and apply to that particular project. Global commands can be run anywhere and don't apply to any particular project: - -* Global Kedro commands - * [`kedro --help`](#get-help-on-kedro-commands) - * [`kedro --version`](#confirm-the-kedro-version) - * [`kedro docs`](#open-the-kedro-documentation-in-your-browser) - * [`kedro info`](#confirm-kedro-information) - * [`kedro new`](#create-a-new-kedro-project) - -* Project-specific Kedro commands - * [`kedro activate-nbstripout`](#strip-output-cells) - * [`kedro build-docs`](#build-the-project-documentation) - * [`kedro build-reqs`](#build-the-project-s-dependency-tree) - * [`kedro catalog list`](#list-datasets-per-pipeline-per-type) - * [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file) - * [`kedro install`](#install-all-package-dependencies) - * [`kedro ipython`](#notebooks) - * [`kedro jupyter convert`](#copy-tagged-cells) - * [`kedro jupyter lab`](#notebooks) - * [`kedro jupyter notebook`](#notebooks) - * [`kedro lint`](#lint-your-project) - * [`kedro package`](#deploy-the-project) - * [`kedro pipeline create `](#create-a-new-modular-pipeline-in-your-project) - * [`kedro pipeline delete `](#delete-a-modular-pipeline) - * [`kedro pipeline package `](#package-a-modular-pipeline) - * [`kedro pipeline pull `](#pull-a-modular-pipeline) - * [`kedro registry describe `](#describe-a-registered-pipeline) - * [`kedro registry list`](#list-all-registered-pipelines-in-your-project) - * [`kedro run`](#run-the-project) - * [`kedro test`](#test-your-project) - -## Global Kedro commands - -The following are Kedro commands that apply globally and can be run from any directory location. - -```eval_rst -.. note:: You only need to use one of those given below (e.g. specify ``kedro -V`` **OR** ``kedro --version``). -``` - -### Get help on Kedro commands - -```bash -kedro -kedro -h -kedro --help -``` - -### Confirm the Kedro version - -```bash -kedro -V -kedro --version -``` - -### Confirm Kedro information - -```bash -kedro info -``` -Returns output similar to the following, depending on the version of Kedro used and plugins installed. - -``` - _ _ -| | _____ __| |_ __ ___ -| |/ / _ \/ _` | '__/ _ \ -| < __/ (_| | | | (_) | -|_|\_\___|\__,_|_| \___/ -v0.17.4 - -kedro allows teams to create analytics -projects. It is developed as part of -the Kedro initiative at QuantumBlack. - -Installed plugins: -kedro_viz: 3.4.0 (hooks:global,line_magic) - -``` - -### Create a new Kedro project - -```bash -kedro new -``` - -### Open the Kedro documentation in your browser - -```bash -kedro docs -``` - -## Project-specific Kedro commands - -```eval_rst -.. note:: All project related CLI commands should be run from the project’s root directory. -``` - -Kedro's command line interface (CLI) allows you to associate a set of commands and dependencies with a target, which you can then execute from inside the project directory. - -The commands a project supports are specified on the framework side. If you want to customise any of the Kedro commands you can do this either by adding a file called `cli.py` or by injecting commands into it via the [`plugin` framework](../07_extend_kedro/04_plugins.md). 
Find the template for the `cli.py` file [here](../07_extend_kedro/01_common_use_cases.md#use-case-3-how-to-add-or-modify-cli-commands). - -### Project setup - -#### Build the project's dependency tree - -```bash -kedro build-reqs -``` - -This command runs [`pip-compile`](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) on the project's `src/requirements.in` file. If the file doesn't exist, Kedro will create it by copying from `src/requirements.txt`. - -`kedro build-reqs` also accepts and passes through CLI options accepted by `pip-compile`. For example, `kedro build-reqs --generate-hashes` will call `pip-compile --generate-hashes src/requirements.in`. - -#### Install all package dependencies - -The following runs [`pip`](https://github.com/pypa/pip) to install all package dependencies specified in `src/requirements.txt`: - -```bash -kedro install -``` - -For further information, see the [`kedro install` documentation](../04_kedro_project_setup/01_dependencies.md#kedro-install). - - -### Run the project -Call the `run()` method of the `KedroSession` defined in `kedro.framework.session`. - -```bash -kedro run -``` - -`KedroContext` can be extended in `run.py` (`src/project-name/run.py`). In order to use the extended `KedroContext` you need to set `context_path` in [`pyproject.toml`](../12_faq/02_architecture_overview) configuration file. - -#### Modifying a `kedro run` - -Kedro has options to modify pipeline runs. Here is a list of CLI arguments supported out of the box: - -```eval_rst -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| CLI command | Description | Multiple options allowed? | -+===========================================================================+=================================================================================+===========================+ -| :code:`kedro run --from-inputs dataset1,dataset2` | A list of dataset names which should be used as a starting point | No | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --from-nodes node1,node2` | A list of node names which should be used as a starting point | No | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --to-nodes node3,node4` | A list of node names which should be used as an end point | No | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --node debug_me,debug_me_too` | Run only nodes with specified names | Yes | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --runner runner_name` | Run the pipeline with a specific runner. 
Cannot be used together with --parallel| No | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --parallel` | Run the pipeline using the `ParallelRunner`. If not specified, use the | No | -| | `SequentialRunner`. Cannot be used together with --runner | | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --env env_name` | Run the pipeline in the env_name environment. Defaults to local if not provided | No | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --tag some_tag1,some_tag2` | Run only nodes which have any of these tags attached | Yes | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --load-version="some_dataset:YYYY-MM-DDThh.mm.ss.sssZ"` | Specify a particular dataset version (timestamp) for loading | Yes | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --pipeline de` | Run the whole pipeline by its name | No | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --config config.yml` | Specify all command line options in a configuration file called config.yml | No | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -| :code:`kedro run --params param_key1:value1,param_key2:2.0` | Does a parametrised kedro run with {"param_key1": "value1", "param_key2": 2} | Yes | -+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------------------------+ -``` - -You can also combine these options together, so the following command runs all the nodes from `split` to `predict` and `report`: - -```bash -kedro run --from-nodes split --to-nodes predict,report -``` - -This functionality is extended to the `kedro run --config config.yml` command, which allows you to [specify run commands in a configuration file](../04_kedro_project_setup/02_configuration.md#configure-kedro-run-arguments). - -A parameterised run is best used for dynamic parameters, i.e. running the same pipeline with different inputs, for static parameters that do not change we recommend following the [Kedro project setup methodology](../04_kedro_project_setup/02_configuration.md#parameters). - -### Deploy the project - -The following packages your application as one `.egg` file and one `.whl` file within the `dist/` folder of your project: - -```bash -kedro package -``` - -See the Python documentation for [further information about packaging](https://packaging.python.org/overview/). 
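-As a rough sketch of a possible next step (the package name and version below are placeholders, not output from a real run), the generated wheel can be installed into the target environment and, since the generated package includes a `__main__.py`, the project can then be executed as a Python module:
-
-```console
-# Placeholder names -- substitute your own package name and version.
-pip install dist/new_kedro_project-0.1-py3-none-any.whl
-python -m new_kedro_project
-```
-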
- -### Pull a modular pipeline -Since Kedro 0.16.4 you can pull a modular pipeline into your Kedro project as follows: - -```bash -kedro pipeline pull -``` - -The above command will take the bundled `.whl` file and do the following: - -* Place source code in `src//pipelines/` -* Place parameters in `conf/base/parameters/.yml` -* Pull out tests and place in `src/tests/pipelines/` - -`kedro pipeline pull` works with PyPI, local and cloud storage: - -* PyPI: `kedro pipeline pull ` with `` being a package on PyPI -* Local storage: `kedro pipeline pull /dist/-0.1-py3-none-any.whl` -* Cloud storage: `kedro pipeline pull s3:///-0.1-py3-none-any.whl` - -### Project quality - -#### Build the project documentation - -```bash -kedro build-docs -``` - -The `build-docs` command builds [project documentation](../03_tutorial/05_package_a_project.md#add-documentation-to-your-project) using the [Sphinx](https://www.sphinx-doc.org) framework. To further customise your documentation, please refer to `docs/source/conf.py` and the [Sphinx documentation](http://www.sphinx-doc.org/en/master/usage/configuration.html). - - -#### Lint your project - -```bash -kedro lint -``` - -Your project is linted with [`black`](https://github.com/psf/black), [`flake8`](https://gitlab.com/pycqa/flake8) and [`isort`](https://github.com/PyCQA/isort). - - -#### Test your project - -The following runs all `pytest` unit tests found in `src/tests`, including coverage (see the file `.coveragerc`): - -```bash -kedro test -``` - -### Project development - -#### Modular pipelines - -##### Create a new [modular pipeline](../06_nodes_and_pipelines/03_modular_pipelines) in your project - -```bash -kedro pipeline create -``` - -##### Package a modular pipeline -The following command packages all the files related to a modular pipeline into a [wheel file](https://pythonwheels.com/): - -```bash -kedro pipeline package -``` - -Further information is available in the [pipeline documentation](../06_nodes_and_pipelines/03_modular_pipelines.md#package-a-modular-pipeline). - -##### Pull a modular pipeline in your project -The following command pulls all the files related to a modular pipeline from either [Pypi](https://pypi.org/) or a storage location of a [wheel file](https://pythonwheels.com/). - -```bash -kedro pipeline pull (or path to a wheel file) -``` - -Further information is available in the [pipeline documentation](../06_nodes_and_pipelines/03_modular_pipelines.md#pull-a-modular-pipeline). - -##### Delete a modular pipeline -The following command deletes all the files related to a modular pipeline in your Kedro project. - -```bash -kedro pipeline delete -``` - -Further information is available in the [pipeline documentation](../06_nodes_and_pipelines/03_modular_pipelines.md#pull-a-modular-pipeline). - - -#### Registered pipelines - -##### Describe a registered pipeline - -```bash -kedro registry describe -``` -The output includes all the nodes in the pipeline. If no pipeline name is provided, this command returns all nodes in the `__default__` pipeline. - -##### List all registered pipelines in your project - -```bash -kedro registry list -``` - -#### Datasets - -##### List datasets per pipeline per type - -```bash -kedro catalog list -``` -The results include datasets that are/aren't used by a specific pipeline. - -The command also accepts an optional `--pipeline` argument that allows you to specify the pipeline name(s) (comma-separated values) in order to filter datasets used only by those named pipeline(s). 
For example: - -```bash -kedro catalog list --pipeline "ds,de" -``` - -#### Data Catalog - -##### Create a Data Catalog YAML configuration file - -The following command creates a Data Catalog YAML configuration file with `MemoryDataSet` datasets for each dataset in a registered pipeline, if it is missing from the `DataCatalog`. - -```bash -kedro catalog create --pipeline -``` - -The command also accepts an optional `--env` argument that allows you to specify a configuration environment (defaults to `base`). - -The command creates the following file: `//catalog/.yml` - -#### Notebooks - -To start a Jupyter Notebook: - -```bash -kedro jupyter notebook -``` - -To start JupyterLab: - -```bash -kedro jupyter lab -``` - -To start an IPython shell: - -```bash -kedro ipython -``` - -Every time you start or restart a notebook kernel, a startup script (`/.ipython/profile_default/startup/00-kedro-init.py`) will add the following variables in scope: - -- `context`: An instance of `kedro.framework.context.KedroContext` class or custom context class extending `KedroContext` if one was set to `CONTEXT_CLASS` in `settings.py` file (further details of how to use `context` can be found [in the IPython documentation](../11_tools_integration/02_ipython.md)) -- `startup_error` (`Exception`) -- `catalog` - -To reload these variables at any point in your notebook (e.g. if you updated `catalog.yml`) use the [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html#line-magics) `%reload_kedro`, which can be also used to see the error message if any of the variables above are undefined. - -If you get an error message `Module ```` not found. Make sure to install required project dependencies by running ``kedro install`` command first.` when running any of those commands, it indicates that some Jupyter or IPython dependencies are not installed in your environment. To resolve this you will need to do the following: - -1. Make sure the corresponding dependency is present in `src/requirements.in` (`src/requirements.txt` if not compiled) -2. Run [`kedro install`](#install-all-package-dependencies) command from your terminal - -##### Copy tagged cells -To copy the code from cells [tagged](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#cell-tags) with `node` tag into Python files under `src//nodes/` in a Kedro project: - -```bash -kedro jupyter convert --all -``` - -##### Strip output cells -Output cells of Jupyter Notebook should not be tracked by git, especially if they contain sensitive information. To strip them out: - -```bash -kedro activate-nbstripout -``` - -This command adds a `git hook` which clears all notebook output cells before committing anything to `git`. It needs to run only once per local repository. diff --git a/docs/source/10_deployment/01_deployment_guide.md b/docs/source/10_deployment/01_deployment_guide.md deleted file mode 100644 index bab6791dbf..0000000000 --- a/docs/source/10_deployment/01_deployment_guide.md +++ /dev/null @@ -1,23 +0,0 @@ -# Deployment guide - -## Deployment choices - -Your choice of deployment method will depend on a number of factors. In this section we provide a number of guides for different approaches. 
If you decide to deploy your Kedro project on a single machine, you should consult our [guide to single-machine deployment](02_single_machine.md), and decide whether to [use Docker for container-based deployment](./02_single_machine.md#container-based), to use [package-based deployment](./02_single_machine.md#package-based), or to [use the CLI to clone and deploy](./02_single_machine.md#cli-based) your codebase to a server.

If your pipeline is sizeable, you will want to run parts of it on separate machines, so you will need to consult our [guide to distributed deployment](03_distributed.md).

We also provide information to help you deploy to the following:

* [Argo Workflows](04_argo.md)
* [Prefect](05_prefect.md)
* [Kubeflow Workflows](06_kubeflow.md)
* [AWS Batch](07_aws_batch.md)
* [Databricks](08_databricks.md)

In addition, we provide instructions on [how to integrate a Kedro project with Amazon SageMaker](09_aws_sagemaker.md).

![](../meta/images/deployments.png)
diff --git a/docs/source/10_deployment/05_prefect.md b/docs/source/10_deployment/05_prefect.md
deleted file mode 100644
index 4b79cedd41..0000000000
--- a/docs/source/10_deployment/05_prefect.md
+++ /dev/null
@@ -1,135 +0,0 @@
# Deployment with Prefect

This page explains how to run your Kedro pipeline using [Prefect Core](https://www.prefect.io/products/core/), an open source workflow management system.

For this deployment we are interested in [Prefect Server](https://docs.prefect.io/orchestration/server/overview.html#what-is-prefect-server), an open-source backend that extends Prefect Core and makes it easy to monitor and execute your Prefect flows.

```eval_rst
.. note:: Prefect Server ships out-of-the-box with a fully featured user interface.
-``` - -## Prerequisites - -To use Prefect Core and Prefect Server, make sure you have the following prerequisites in place: - -- Prefect Core is [installed](https://docs.prefect.io/core/getting_started/installation.html#installation) on your machine -- [Docker](https://www.docker.com/) and [Docker Compose](https://docs.docker.com/compose/) are installed and Docker Engine is running -- Prefect Server is [up and running](https://docs.prefect.io/core/getting_started/installation.html#running-the-local-server-and-ui) -- `PREFECT__LOGGING__EXTRA_LOGGERS` environment variable is set (it is required to get Kedro logs emitted): - -```console -export PREFECT__LOGGING__EXTRA_LOGGERS="['kedro']" -``` - -## How to run your Kedro pipeline using Prefect - -### Convert your Kedro pipeline to Prefect flow - -To build a Prefect [flow](https://docs.prefect.io/core/concepts/flows.html) for your Kedro pipeline programmatically and register it with the Prefect API, use the following Python script, which should be stored in your project’s root directory: - -```python -# /register_prefect_flow.py -from pathlib import Path - -import click - -from prefect import Client, Flow, Task -from prefect.utilities.exceptions import ClientError - -from kedro.framework.project import pipelines -from kedro.framework.session import KedroSession -from kedro.framework.startup import bootstrap_project -from kedro.io import DataCatalog, MemoryDataSet -from kedro.pipeline.node import Node -from kedro.runner import run_node - - -class KedroTask(Task): - """Kedro node as a Prefect task.""" - - def __init__(self, node: Node, catalog: DataCatalog) -> None: - self._node = node - self._catalog = catalog - super().__init__(name=node.name, tags=node.tags) - - def run(self): - run_node(self._node, self._catalog) - - -@click.command() -@click.option("-p", "--pipeline", "pipeline_name", default=None) -@click.option("--env", "-e", type=str, default=None) -def build_and_register_flow(pipeline_name, env): - """Register a Kedro pipeline as a Prefect flow.""" - project_path = Path.cwd() - metadata = bootstrap_project(project_path) - - session = KedroSession.create(project_path=project_path, env=env) - context = session.load_context() - - catalog = context.catalog - pipeline_name = pipeline_name or "__default__" - pipeline = pipelines.get(pipeline_name) - - unregistered_ds = pipeline.data_sets() - set(catalog.list()) - for ds_name in unregistered_ds: - catalog.add(ds_name, MemoryDataSet()) - - flow = Flow(metadata.project_name) - - tasks = {} - for node, parent_nodes in pipeline.node_dependencies.items(): - if node._unique_key not in tasks: - node_task = KedroTask(node, catalog) - tasks[node._unique_key] = node_task - else: - node_task = tasks[node._unique_key] - - parent_tasks = [] - - for parent in parent_nodes: - if parent._unique_key not in tasks: - parent_task = KedroTask(parent, catalog) - tasks[parent._unique_key] = parent_task - else: - parent_task = tasks[parent._unique_key] - - parent_tasks.append(parent_task) - - flow.set_dependencies(task=node_task, upstream_tasks=parent_tasks) - - client = Client() - try: - client.create_project(project_name=metadata.project_name) - except ClientError: - # `metadata.project_name` project already exists - pass - - # Register the flow with the server - flow.register(project_name=metadata.project_name) - - # Start a local agent that can communicate between the server - # and your flow code - flow.run_agent() - - -if __name__ == "__main__": - build_and_register_flow() -``` - -```eval_rst -.. 
note:: The script launches a `local agent `_. Remember to stop the agent with Ctrl-C when you complete. -``` - - -### Run Prefect flow - -Now, having the flow registered, you can use [Prefect UI](https://docs.prefect.io/orchestration/ui/dashboard.html) to orchestrate and monitor it. - -Navigate to http://localhost:8080/default?flows= to see your registered flow. - -![](../meta/images/prefect_flows.png) - -Click on the flow to open it and then trigger your flow using the "RUN"/"QUICK RUN" button. - -![](../meta/images/prefect_flow_details.png) diff --git a/docs/source/10_deployment/06_kubeflow.md b/docs/source/10_deployment/06_kubeflow.md deleted file mode 100644 index 9efed100a7..0000000000 --- a/docs/source/10_deployment/06_kubeflow.md +++ /dev/null @@ -1,210 +0,0 @@ -# Deployment with Kubeflow Pipelines - -This page explains how to convert your Kedro pipeline to use [Kubeflow Pipelines](https://github.com/kubeflow/pipelines), an open-source toolkit for machine learning (ML). You can use it to deploy ML workflows onto [Kubernetes](https://kubernetes.io/). - -## Why would you use Kubeflow Pipelines? -Kubeflow Pipelines is an end-to-end (E2E) orchestration tool to deploy, scale and manage your machine learning systems within Docker containers. You can schedule and compare runs, and examine detailed reports on each run. - -Here are the main reasons to use Kubeflow Pipelines: - -- It is cloud-agnostic and can run on any Kubernetes cluster -- Kubeflow is tailored towards machine learning workflows for model deployment, experiment tracking, and hyperparameter tuning -- You can re-use components and pipelines to create E2E solutions - -## Prerequisites - -To use Kubeflow Pipelines, make sure you have the following prerequisites in place: - -- Kubeflow Pipelines is [installed](https://www.kubeflow.org/docs/started/getting-started/) on your Kubernetes cluster -- Kubeflow Pipelines SDK is [installed](https://www.kubeflow.org/docs/pipelines/sdk/install-sdk/) locally -- A `name` attribute is set for each Kedro [node](/kedro.pipeline.node) since it is used to trigger runs -- All node input/output DataSets must be configured in `catalog.yml` and refer to an external location (e.g. [AWS S3](../05_data/01_data_catalog.md#using-the-data-catalog-with-the-yaml-api)); you cannot use the `MemoryDataSet` in your workflow - -```eval_rst -.. note:: Each node runs in its own container. -``` - -## How to run your Kedro pipeline using Kubeflow Pipelines - -### Containerise your Kedro project - -First, you need to containerise your Kedro project, using any preferred container solution (e.g. [`Docker`](https://www.docker.com/)), to build an image to use in Kubeflow Pipelines. - -For the purpose of this walk-through, we are going to assume a `Docker` workflow. We recommend the [`Kedro-Docker`](https://github.com/quantumblacklabs/kedro-docker) plugin to streamline the process. [Instructions for Kedro-Docker are in the plugin's README.md](https://github.com/quantumblacklabs/kedro-docker/blob/master/README.md). - -After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./02_single_machine.md#how-to-use-container-registry). - -### Create a workflow spec - -A workflow spec is a yaml file specifying the description of deployment in Kubernetes. 
In order to build a workflow spec for your Kedro pipeline programmatically you can use the following Python script that should be stored in your project’s root directory: - -```python -# /build_kubeflow_pipeline.py -import re -from pathlib import Path -from typing import Dict, Set - -import click - -from kfp import aws, dsl -from kfp.compiler.compiler import Compiler - -from kedro.framework.project import pipelines -from kedro.framework.startup import bootstrap_project -from kedro.pipeline.node import Node - -_PIPELINE = None -_IMAGE = None - - -@click.command() -@click.argument("image", required=True) -@click.option("-p", "--pipeline", "pipeline_name", default=None) -@click.option("--env", "-e", type=str, default=None) -def generate_kfp(image: str, pipeline_name: str, env: str) -> None: - """Generates a workflow spec yaml file from a Kedro pipeline. - - Args: - image: container image name. - pipeline_name: pipeline name to build a workflow spec. - env: Kedro configuration environment name. - - """ - global _PIPELINE - global _IMAGE - _IMAGE = image - - project_path = Path.cwd() - metadata = bootstrap_project(project_path) - project_name = metadata.project_name - - pipeline_name = pipeline_name or "__default__" - _PIPELINE = pipelines.get(pipeline_name) - - Compiler().compile(convert_kedro_pipeline_to_kfp, project_name + ".yaml") - - -@dsl.pipeline(name="Kedro pipeline", description="Kubeflow pipeline for Kedro project") -def convert_kedro_pipeline_to_kfp() -> None: - """Convert from a Kedro pipeline into a kfp container graph.""" - node_dependencies = _PIPELINE.node_dependencies - kfp_ops = _build_kfp_ops(node_dependencies) - for node, dependencies in node_dependencies.items(): - for dependency in dependencies: - kfp_ops[node.name].after(kfp_ops[dependency.name]) - - -def _build_kfp_ops( - node_dependencies: Dict[Node, Set[Node]] -) -> Dict[str, dsl.ContainerOp]: - """Build kfp container graph from Kedro node dependencies.""" - kfp_ops = {} - - for node in node_dependencies: - name = clean_name(node.name) - kfp_ops[node.name] = dsl.ContainerOp( - name=name, - image=_IMAGE, - command=["kedro"], - arguments=["run", "--node", node.name], - ).apply( - # Configure the container to use AWS credentials. - aws.use_aws_secret( - "aws-secrets", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - ) - ) - return kfp_ops - - -def clean_name(name: str) -> str: - """Reformat a name. - - Returns: - name: formatted name. - - """ - return re.sub(r"[\W_]+", "-", name).strip("-") - - -if __name__ == "__main__": - generate_kfp() -``` - -The script accepts one required argument: - -- `image`: image transferred to the container registry - -You can also specify two optional arguments: - -- `--pipeline`: pipeline name for which you want to build a workflow spec -- `--env`: Kedro configuration environment name, defaults to `local` - -For the purpose of this walk-through, we are going to use AWS S3 bucket for DataSets therefore `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables must be set to have an ability to communicate with S3. The `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` values should be stored in [Kubernetes Secrets](https://kubernetes.io/docs/concepts/configuration/secret/) (an example Kubernetes Secrets spec is given [below](#authenticate-kubeflow-pipelines)). - - -Finally, run the helper script from project's directory to build the workflow spec (the spec will be saved to `/.yaml` file). 
```console
$ cd <project_root>
$ python build_kubeflow_pipeline.py <image>
```

### Authenticate Kubeflow Pipelines

Before submitting the workflow spec, you need to deploy your AWS credentials to Kubernetes Secrets. You should also ensure that the credentials you are going to use have all the necessary [permissions to the relevant S3 resources](https://docs.aws.amazon.com/AmazonS3/latest/dev/using-with-s3-actions.html).

Here's an example Secrets spec:

```yaml
# secret.yml
apiVersion: v1
kind: Secret
metadata:
  name: aws-secrets
  namespace: kubeflow
data:
  AWS_ACCESS_KEY_ID: <base64_encoded_access_key_id>
  AWS_SECRET_ACCESS_KEY: <base64_encoded_secret_access_key>
type: Opaque
```

```eval_rst
.. note:: Kubeflow uses `kubeflow` as the default namespace.
```

You can use the following command to encode the AWS keys to base64:

```console
$ echo -n <original_key> | base64
```

Run the following commands to deploy the Kubernetes Secret to the `kubeflow` namespace and check that it was created:

```console
$ kubectl create -f secret.yml
$ kubectl get secrets aws-secrets -n kubeflow
```

You can find more information about AWS configuration in [the Kubeflow Pipelines documentation](https://www.kubeflow.org/docs/aws/pipeline/).

### Upload workflow spec and execute runs

Once the Kubernetes Secret is deployed, upload the workflow spec `<project_name>.yaml` to Kubeflow. Below is an example of how to upload and execute Kubeflow Pipelines through the UI (see [how to open the pipelines dashboard](https://www.kubeflow.org/docs/components/pipelines/pipelines-quickstart/#deploy-kubeflow-and-open-the-kubeflow-pipelines-ui)).

First, go to "Pipelines" on the left panel and click "Upload pipeline"; you will see the following page where you can upload your workflow spec.
![context input graphic](../meta/images/kubeflow_pipelines_upload_pipeline.png)

Once you have uploaded the workflow spec, go to "Experiments" and select the pipeline you want to run in the following page.
![context input graphic](../meta/images/kubeflow_pipelines_experiment_run.png)

Once the run execution is complete, you can see the status of your pipeline graph.
![context input graphic](../meta/images/kubeflow_pipelines_dag.png)

To clean up your Kubeflow pipelines, go to "Pipelines" on the left panel, select the pipeline you want to delete, and delete it.

The Kubernetes Secret can be removed with the following command:
```console
kubectl delete secrets aws-secrets -n kubeflow
```

You can find more information about Kubeflow Pipelines in [their documentation](https://www.kubeflow.org/docs/pipelines/).
diff --git a/docs/source/10_deployment/08_databricks.md b/docs/source/10_deployment/08_databricks.md
deleted file mode 100644
index 42c26ae52d..0000000000
--- a/docs/source/10_deployment/08_databricks.md
+++ /dev/null
@@ -1,377 +0,0 @@
# Deployment to a Databricks cluster


```eval_rst
.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an issue or pull request.
```

This tutorial uses the [PySpark Iris Kedro Starter](https://github.com/quantumblacklabs/kedro-starters/tree/master/pyspark-iris) to illustrate how to bootstrap a Kedro project using Spark and deploy it to a [Databricks cluster on AWS](https://databricks.com/aws).
It is split into 2 sections: - -* [Databricks Connect workflow](#run-the-kedro-project-with-databricks-connect) (recommended) -* [Databricks Notebook workflow](#run-kedro-project-from-a-databricks-notebook) (the setup of this is more involved) - -## Prerequisites - -Both section have the following prerequisites: - -* New or existing [AWS account](https://aws.amazon.com/premiumsupport/knowledge-center/create-and-activate-aws-account/) with administrative privileges -* Active [Databricks deployment](https://docs.databricks.com/getting-started/account-setup.html) on AWS (Databricks Community Edition won't suffice as it doesn't allow you to provision personal tokens) -* [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) installed on your local machine - -## Run the Kedro project with Databricks Connect - -In this section, we show how to create a sample Iris project with PySpark, connect it to the Databricks cluster using [Databricks Connect](https://docs.databricks.com/dev-tools/databricks-connect.html), and trigger a run from the local machine. - -```eval_rst -.. note:: The additional requirement in this section is to have `Java 8 installed `_ on your local machine (as Databricks Connect does not support Java 11). -``` - -### 1. Project setup - -First, let's create a new virtual environment and, within it, a new Kedro project: - -```bash -# create fresh virtual env -# NOTE: minor Python version of the environment -# must match the version on the Databricks cluster -conda create --name iris_databricks python=3.7 -y -conda activate iris_databricks - -# install Kedro and create a new project -pip install "kedro~=0.17.4" -# name your project Iris Databricks when prompted for it -kedro new --starter pyspark-iris -``` - -### 2. Install dependencies and run locally - -Now, as the project has been successfully created, we should move into the project root directory, install project dependencies, and then start a local test run using [Spark local execution mode](https://stackoverflow.com/a/54064507/3364156), which means that all Spark jobs will be executed in a single JVM locally, rather than in a cluster. `pyspark-iris` Kedro starter used to generate the project already has all necessary configuration for it to work, you just need to have `pyspark` Python package installed, which is done for you by `kedro install` command below. - -```bash -# change the directory to the project root -cd iris-databricks/ -# compile and install the project dependencies, this may take a few minutes -kedro install -# start a local run -kedro run -``` - -You should get a similar output: -```console -... -2020-09-09 18:57:36,762 - iris_databricks.pipelines.data_science.nodes - INFO - Model accuracy: 100.00% -2020-09-09 18:57:36,762 - kedro.runner.sequential_runner - INFO - Completed 5 out of 5 tasks -2020-09-09 18:57:36,762 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. -``` - -### 3. Create a Databricks cluster - -If you already have an active cluster with runtime version `7.1`, you can skip this step. Here is [how to find clusters](https://docs.databricks.com/clusters/clusters-manage.html) in your Databricks workspace. - -Follow the [Databricks official guide](https://docs.databricks.com/clusters/create.html) to create a new cluster. 
For the purpose of this tutorial (and to minimise costs) we recommend the following settings: -* Runtime: `7.1 (Scala 2.12, Spark 3.0.0)` -* Enable autoscaling: `off` -* Terminate after 120 minutes of inactivity: `on` -* Worker type: `m4.large` -* Driver Type: `Same as worker` -* Workers: `2` -* Advanced options -> Instances -> # Volumes: `1` - -While your cluster is being provisioned, you can continue to the next step. - -### 4. Install Databricks Connect - -[Databricks Connect](https://docs.databricks.com/dev-tools/databricks-connect.html) is a Python library that you must install within your local environment: - -```bash -# first, we need to uninstall pyspark package -# as Databricks Connect comes with its own implementation of it -pip uninstall -y pyspark - -# install the version equal to the cluster environment -pip install "databricks-connect==7.1" -``` - -### 5. Configure Databricks Connect - -You can [create the personal access token](https://docs.databricks.com/dev-tools/api/latest/authentication.html#generate-a-personal-access-token) needed by Databricks Connect by following the official documentation. - -```eval_rst -.. note:: Databricks Community Edition does not allow you to provision personal tokens, therefore it won't work for this. -``` - -You also need to retrieve the [Databricks workspace URL](https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) (the domain you log into when accessing your deployment), and the [Cluster ID](https://docs.databricks.com/workspace/workspace-details.html#cluster-url-and-id), which you connect to. - -Now, as you have all the necessary credentials, [configure `databricks-connect`](https://docs.databricks.com/dev-tools/databricks-connect.html#step-2-configure-connection-properties). To do so, run the CLI command and follow the prompts: - -```bash -databricks-connect configure -``` - -Alternatively, you can configure Databricks Connect by setting the environment variables as follows: - -```bash -export DATABRICKS_ADDRESS=https://dbc-XXXXXXXX-XXXX.cloud.databricks.com \ - DATABRICKS_API_TOKEN=XXX \ - DATABRICKS_CLUSTER_ID=XXXX-XXXXXX-XXXXXX \ - DATABRICKS_ORG_ID=0 \ - DATABRICKS_PORT=15001 -``` - -Let's test the configuration by running from the CLI: - -```bash -databricks-connect test -``` - -### 6. Copy local data into DBFS - -Our Spark jobs will now run on Databricks, so we need to give them access to the relevant input data. Copy your local `data/` directory into the [Databricks File System (DBFS)](https://docs.databricks.com/data/databricks-file-system.html). - -Run `python` from the CLI to start the interactive session, and then execute the following script: - -```python -from pyspark.dbutils import DBUtils -from pyspark.sql import SparkSession - -from pathlib import Path - -spark = SparkSession.builder.getOrCreate() -dbutils = DBUtils(spark.sparkContext) - -data_dir = Path.cwd() / "data" -dbutils.fs.cp( - f"file://{data_dir.as_posix()}", "dbfs:/iris-databricks/data", recurse=True -) - -# make sure DBFS ls returns a similar result -dbutils.fs.ls("dbfs:/iris-databricks/data/01_raw/") -# [FileInfo(path='dbfs:/iris-databricks/data/01_raw/.gitkeep', name='.gitkeep', size=0), -# FileInfo(path='dbfs:/iris-databricks/data/01_raw/iris.csv', name='iris.csv', size=3858)] -``` - -Then type `exit()` to terminate the Python session. - -Finally, modify the project catalog so that the `example_iris_data` dataset points to a new DBFS location instead of local. 
You can use Kedro [configuration environments](../04_kedro_project_setup/02_configuration.md#additional-configuration-environments) for this. - -Copy the `catalog.yml` from `base` into `dbfs` environment by running the CLI command: - -```bash -mkdir conf/dbfs -cp conf/base/catalog.yml conf/dbfs/catalog.yml -``` - -Then open `conf/dbfs/catalog.yml` in any text editor and modify the `filepath` for `example_iris_data` as follows: - -```yaml -example_iris_data: - type: spark.SparkDataSet - filepath: dbfs:/iris-databricks/data/01_raw/iris.csv # <<< change the filepath to this - file_format: csv - load_args: - header: true - inferSchema: true - save_args: - sep: "," - header: true -``` - -### 7. Run the project - -Configuration is now complete, and you are ready to run your Kedro project on Databricks! - -Trigger the run from the CLI locally using the `dbfs` configuration environment: - -```bash -kedro run --env dbfs -``` - -You will notice that the logs of the run execution differ slightly. You should see similar output: -```console -... -2020-09-09 20:28:16,482 - kedro.io.data_catalog - INFO - Loading data from `example_predictions` (MemoryDataSet)... -2020-09-09 20:28:16,483 - kedro.pipeline.node - INFO - Running node: report_accuracy([example_predictions]) -> None -View job details at https://dbc-XXXXXX-XXXX.cloud.databricks.com/?o=0#/setting/clusters/XXXX-XXXXXX-XXXXXX/sparkUi -2020-09-09 20:28:19,531 - iris_databricks.pipelines.data_science.nodes - INFO - Model accuracy: 97.06% -2020-09-09 20:28:19,533 - kedro.runner.sequential_runner - INFO - Completed 5 out of 5 tasks -2020-09-09 20:28:19,533 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. -``` - -Open the `Spark UI` tab in your Databricks cluster UI, and you should see a similar list of completed jobs: - -![](../meta/images/spark_jobs_databricks.png) - - -## Run Kedro project from a Databricks notebook - -As noted in [this post describing CI/CD automation on Databricks](https://databricks.com/blog/2020/06/05/automate-continuous-integration-and-continuous-delivery-on-databricks-using-databricks-labs-ci-cd-templates.html#toc-2), _"Users may find themselves struggling to keep up with the numerous notebooks containing the ETL, data science experimentation, dashboards etc."_ - -Therefore, we do not recommend that you rely on the notebooks for running and/or deploying your Kedro pipelines unless it is unavoidable. The workflow described in this section may be useful for experimentation and initial data analysis stages, but it is _not_ designed for productionisation. - -### Extra requirements - -In addition to the [common prerequisites](#prerequisites), to run through this section you would need: - -* An account on [GitHub](https://github.com/) (free tier or above) -* [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) installed on your local machine - -### 1. Create Kedro project - -Firstly, we will need to reproduce the first three steps from the previous section: -1. [Project setup](#project-setup) -2. [Dependency installation](#install-dependencies-and-run-locally) -3. [Databricks cluster provisioning](#create-a-databricks-cluster) - -As a result you should get: -* A Kedro project, which runs with the local version of PySpark library -* A running Databricks cluster - -### 2. Create GitHub personal access token - -To synchronise the project between the local development environment and Databricks we will use a private GitHub repository that you will create in the next step. 
For authentication we will need a GitHub personal access token, so go ahead and [create such token](https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token) in your GitHub developer settings. - -```eval_rst -.. note:: Make sure that ``repo`` scopes are enabled for your token. -``` - -### 3. Create a GitHub repository - -Now you should [create a new repository in GitHub](https://docs.github.com/en/github/getting-started-with-github/create-a-repo) using the official guide. You can keep the repository private and you don't need to commit to it just yet. - -To connect to the newly created repository you can use one of 2 options: - -* **SSH:** If you choose to connect with SSH, you will also need to configure [the SSH connection to GitHub](https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh), unless you already have [an existing SSH key](https://docs.github.com/en/github/authenticating-to-github/checking-for-existing-ssh-keys) configured for GitHub -* **HTTPS:** If using HTTPS, you will be asked for your GitHub username and password when you push your first commit - please use your GitHub username and your [personal access token](#create-github-personal-access-token) generated in the previous step as a password and [_not_ your original GitHub password](https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password). - -### 4. Push Kedro project to the GitHub repository - -We will use a CLI to push the newly created Kedro project to GitHub. First, you need to initialise Git in your project root directory: - -```bash -# change the directory to the project root -cd iris-databricks/ -# initialise git -git init -``` - -Then, create the first commit: - -```bash -# add all files to git staging area -git add . -# create the first commit -git commit -m "first commit" -``` - -Finally, push the commit to GitHub: - -```bash -# configure a new remote -# for HTTPS run: -git remote add origin https://github.com//.git -# or for SSH run: -git remote add origin git@github.com:/.git - -# verify the new remote URL -git remote -v - -# push the first commit -git push --set-upstream origin master -``` - -### 5. Configure the Databricks cluster - -The project has now been pushed to your private GitHub repository, and in order to pull it from the Databricks, we need to configure personal access token you generated in [Step 2](#create-github-personal-access-token). - -[Log into your Databricks workspace](https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) and then: -1. Open `Clusters` tab -2. Click on your cluster name -3. Press `Edit` -4. Go to the `Advanced Options` and then `Spark` - -![](../meta/images/databricks_cluster_edit.png) - -Then in the `Environment Variables` section add your `GITHUB_USER` and `GITHUB_TOKEN` as shown on the picture: - -![](../meta/images/databricks_cluster_env_vars.png) - - -```eval_rst -.. note:: For security purposes, we strongly recommend against hard-coding any secrets into your notebooks. -``` - -Then press `Confirm` button. Your cluster will be restarted to apply the changes, this will take a few minutes. - -### 6. Run your Kedro project from the Databricks notebook - -Congratulations, you are now ready to run your Kedro project from the Databricks! 
- -[Create your Databricks notebook](https://docs.databricks.com/notebooks/notebooks-manage.html#create-a-notebook) and remember to [attach it to the cluster](https://docs.databricks.com/notebooks/notebooks-manage.html#attach) you have just configured. - -In your newly created notebook put each code snippet from below into a separate cell and then [run all cells](https://docs.databricks.com/notebooks/notebooks-use.html#run-all-cells): - -* Clone your project from GitHub - -```console -%sh rm -rf ~/projects/iris-databricks && git clone --single-branch --branch master https://${GITHUB_USER}:${GITHUB_TOKEN}@github.com/${GITHUB_USER}/.git ~/projects/iris-databricks -``` - -* Install the latest version of Kedro compatible with version `0.17.4` - -```console -%pip install "kedro[spark.SparkDataSet]~=0.17.4" -``` - -* Copy input data into DBFS - -```python -import logging -from pathlib import Path - -# suppress excessive logging from py4j -logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR) - -# copy project data into DBFS -project_root = Path.home() / "projects" / "iris-databricks" -data_dir = project_root / "data" -dbutils.fs.cp( - f"file://{data_dir.as_posix()}", f"dbfs://{data_dir.as_posix()}", recurse=True -) - -# make sure the data has been copied -dbutils.fs.ls((data_dir / "01_raw").as_posix()) -``` - -You should get a similar output: -```console -Out[11]: [FileInfo(path='dbfs:/root/projects/iris-databricks/data/01_raw/.gitkeep', name='.gitkeep', size=0), - FileInfo(path='dbfs:/root/projects/iris-databricks/data/01_raw/iris.csv', name='iris.csv', size=3858)] -``` - -* Run Kedro project - -```python -from kedro.framework.session import KedroSession -from kedro.framework.startup import bootstrap_project - -bootstrap_project(project_root) - -with KedroSession.create(project_path=project_root) as session: - session.run() -``` - -You should get a similar output: - -```console -... -2020-09-16 10:45:21,991 - kedro.io.data_catalog - INFO - Loading data from `example_predictions` (MemoryDataSet)... -2020-09-16 10:45:21,991 - kedro.pipeline.node - INFO - Running node: report_accuracy([example_predictions]) -> None -2020-09-16 10:45:23,128 - iris_databricks.pipelines.data_science.nodes - INFO - Model accuracy: 97.30% -2020-09-16 10:45:23,144 - kedro.runner.sequential_runner - INFO - Completed 5 out of 5 tasks -2020-09-16 10:45:23,145 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. -Out[12]: {} -``` - -Your complete notebook should look similar to this (the results are hidden): - -![](../meta/images/databricks_notebook_example.png) diff --git a/docs/source/10_deployment/09_aws_sagemaker.md b/docs/source/10_deployment/09_aws_sagemaker.md deleted file mode 100644 index 32979ec60c..0000000000 --- a/docs/source/10_deployment/09_aws_sagemaker.md +++ /dev/null @@ -1,354 +0,0 @@ -# How to integrate Amazon SageMaker into your Kedro pipeline - - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -This tutorial explains how to integrate a Kedro project with [Amazon SageMaker](https://aws.amazon.com/sagemaker/) in order to train a machine learning model. It shows how to build machine learning pipelines in Kedro and while taking advantage of the power of SageMaker for potentially compute-intensive machine learning tasks. 
- -The Kedro project will still run locally (or on one of many supported workflow engines like [Argo](./04_argo.md), [Prefect](./05_prefect.md), [Kubeflow](./06_kubeflow.md), [AWS Batch](./07_aws_batch.md) and others), but the model training step will be offloaded onto SageMaker. - -## Why would you use Amazon SageMaker? - -Machine learning development is a complex, expensive, and labour-intensive process with very specific requirements for the execution environment (ML tools and libraries, hardware requirements for CPU or GPU-optimised algorithms). Amazon SageMaker provides the components used for machine learning in a single toolset so models get to production faster with much less effort and at lower cost. Sagemaker supports both classical machine learning libraries like [`Scikit-Learn`](https://scikit-learn.org/) or [`XGBoost`](https://xgboost.readthedocs.io/), and Deep Learning frameworks such as [`TensorFlow`](https://www.tensorflow.org/) or [`PyTorch`](https://pytorch.org/). - -Amazon SageMaker is a fully-managed service and its features are covered by the [official service documentation](https://docs.aws.amazon.com/sagemaker/index.html). In this tutorial we will focus on training a simple machine learning model on SageMaker as part of Kedro pipeline execution. - -## Prerequisites - -To use Amazon SageMaker, make sure you have the following prerequisites in place: -- An [AWS account set up](https://aws.amazon.com/premiumsupport/knowledge-center/create-and-activate-aws-account/) -- [Configured AWS credentials](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html) on your local machine -- Generated Kedro project called **Kedro Tutorial** using [Kedro Spaceflights starter](https://github.com/quantumblacklabs/kedro-starters/tree/master/spaceflights/) -- Completed the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) - -## Prepare the environment - -### Install SageMaker package dependencies - -First, you should add extra package dependencies that are required to communicate with SageMaker via its [Python SDK](https://sagemaker.readthedocs.io/en/stable/). - -If you have run `kedro install` at least once for your project, you should already have the `src/requirements.in` file, so you need to modify that. Otherwise, if you have never run `kedro install` for your project, you should modify `src/requirements.txt`. Open the corresponding file with a text editor and add the following lines at the end of the file: - -```text -sagemaker>=2.13.0 -s3fs>=0.3.0, <0.4.1 # will be needed to work with AWS S3 -``` - -Since you have added two extra dependencies, you should compile and install the updated project dependencies by running the following from your terminal: - -```bash -cd -kedro install --build-reqs -``` - -```eval_rst -.. note:: All CLI commands in the following sections should be executed from the project root directory. -``` - -### Create SageMaker execution role - -1. Sign into the AWS Management Console and open the [IAM console](https://console.aws.amazon.com/iam/) -2. In the left navigation pane, choose `Roles` -3. Choose `Create role` -4. For role type, select `AWS Service`, find and choose `SageMaker`, and then pick the `SageMaker - Execution` use case, then click `Next: Permissions` -5. On the `Attach permissions policy` page, select `AmazonSageMakerFullAccess` managed policy, then click `Next: Review` -6. 
Give it a name (for example, `AmazonSageMaker-ExecutionRole`) and choose `Create role`

![IAM role creation wizard](../meta/images/aws_create_iam_role.png)

### Create S3 bucket

You should [create a new S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/user-guide/create-bucket.html) rather than use an existing one, because SageMaker jobs will save source script data to the bucket root. Having a dedicated bucket for this tutorial makes the cleanup easier.

Your bucket name should contain the word `sagemaker`; this way the role that we created earlier will automatically have all the necessary access permissions to it. Your new bucket does not require public access, so you can leave the `Block all public access` setting enabled and preserve the other defaults.

It's generally good practice to create AWS resources (like the S3 bucket above) for the tutorial within the same [region](https://aws.amazon.com/about-aws/global-infrastructure/regions_az/#Regions) that you have in your [local configuration](#prerequisites) where possible. This helps reduce network transmission latency and ensures that the different services can talk to each other.

## Update the Kedro project

### Create the configuration environment

Configuration in Kedro is logically separated into [configuration environments](../04_kedro_project_setup/02_configuration.md#additional-configuration-environments), which are loaded in a specific order when the project is run. To separate SageMaker-specific configuration from the default one, let's create a new configuration environment. Go ahead and create a `conf/sagemaker` folder, then create the following files in it.

```eval_rst
.. note:: ``${key}`` in the YAML snippets below is a special syntax which allows you to template the project configuration. You don't need to replace those values, just paste them as-is.
```

* `catalog.yml` - defines the datasets that need to be saved into S3 (rather than kept in memory):

```yaml
X_train@pickle:
  type: pickle.PickleDataSet
  filepath: ${s3.train_path}/X_train.pickle

X_train@path:
  type: MemoryDataSet
  data: ${s3.train_path}/X_train.pickle

y_train:
  type: pickle.PickleDataSet
  filepath: ${s3.train_path}/y_train.pickle
```

> *Note:* `@pickle` and `@path` in the dataset names above correspond to the [dataset transcoding](https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html#transcoding-datasets) feature of Kedro. This allows you to pass the S3 path of the `X_train` dataset, instead of the actual data itself, to the `train_model_sagemaker` node that you will create shortly.
- -* `parameters.yml` - contains the configuration for [SageMaker Scikit Learn Estimator](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html#scikit-learn-estimator): - -```yaml -sklearn_estimator_kwargs: - entry_point: src/kedro_tutorial/sagemaker_entry_point.py # you will create this file later - role: # put the name of the role you've created earlier - instance_type: ml.m4.xlarge - instance_count: 1 - framework_version: 0.23-1 - output_path: ${s3.output_path} -``` - -* `globals.yml` - contains the values that will be used to extrapolate the values in the config files above (you will need to replace the bucket name in the snippet below): - -```yaml -s3: - train_path: s3:///train - output_path: s3:///output -``` - -### Update the project hooks - -Now you need to tell Kedro to use the [`TemplatedConfigLoader`](https://kedro.readthedocs.io/en/stable/kedro.config.TemplatedConfigLoader.html) instead of the default `ConfigLoader` class to read the project configuration. It is very easy to do via `settings.py` file - open `src/kedro_tutorial/settings.py` file and set the `CONFIG_LOADER_CLASS` constant: - -```python -from kedro.config import TemplatedConfigLoader - - -CONFIG_LOADER_CLASS = TemplatedConfigLoader -``` - -### Update the data science pipeline - -Now modify the data science pipeline in the project to send the training job to Amazon SageMaker and to process the resulting model artifact afterwards. - -#### Create node functions - -Open `src/kedro_tutorial/pipelines/data_science/nodes.py` and add these new node functions: - -
-Click to expand - -```python -import pickle -import tarfile -from typing import Any, Dict - -import fsspec -from sagemaker.sklearn.estimator import SKLearn -from sklearn.linear_model import LinearRegression - -# - - -def train_model_sagemaker( - X_train_path: str, sklearn_estimator_kwargs: Dict[str, Any] -) -> str: - """Train the linear regression model on SageMaker. - - Args: - X_train_path: Full S3 path to `X_train` dataset. - sklearn_estimator_kwargs: Keyword arguments that will be used - to instantiate SKLearn estimator. - - Returns: - Full S3 path to `model.tar.gz` file containing the model artifact. - - """ - sklearn_estimator = SKLearn(**sklearn_estimator_kwargs) - - # we need a path to the directory containing both - # X_train (feature table) and y_train (target variable) - inputs_dir = X_train_path.rsplit("/", 1)[0] - inputs = {"train": inputs_dir} - - # wait=True ensures that the execution is blocked - # until the job finishes on SageMaker - sklearn_estimator.fit(inputs=inputs, wait=True) - - training_job = sklearn_estimator.latest_training_job - job_description = training_job.describe() - model_path = job_description["ModelArtifacts"]["S3ModelArtifacts"] - return model_path - - -def untar_model(model_path: str) -> LinearRegression: - """Unarchive the linear regression model artifact produced - by the training job on SageMaker. - - Args: - model_path: Full S3 path to `model.tar.gz` file containing - the model artifact. - - Returns: - Trained model. - - """ - with fsspec.open(model_path) as s3_file, tarfile.open( - fileobj=s3_file, mode="r:gz" - ) as tar: - # we expect to have only one file inside the `model.tar.gz` archive - filename = tar.getnames()[0] - model_obj = tar.extractfile(filename) - return pickle.load(model_obj) -``` -
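If you want to check `untar_model` before involving SageMaker at all, the hedged sketch below builds a local archive that imitates the `model.tar.gz` layout a training job produces (a single pickled estimator inside a gzipped tarball) and feeds it to the node. The `data/06_models` directory and the file names are assumptions made for illustration only; `fsspec` accepts plain local paths as well as `s3://` URLs, so no AWS access is needed.

```python
# Hypothetical local smoke test for `untar_model`; paths and file names are
# illustrative assumptions, not part of the tutorial's required setup.
import pickle
import tarfile
from pathlib import Path

from sklearn.linear_model import LinearRegression

from kedro_tutorial.pipelines.data_science.nodes import untar_model

artifact_dir = Path("data/06_models")
artifact_dir.mkdir(parents=True, exist_ok=True)

# pickle a dummy estimator and wrap it in a tarball, mimicking what
# SageMaker uploads to S3 after a training job completes
with open(artifact_dir / "regressor.pickle", "wb") as f:
    pickle.dump(LinearRegression(), f)
with tarfile.open(artifact_dir / "model.tar.gz", "w:gz") as tar:
    tar.add(artifact_dir / "regressor.pickle", arcname="regressor.pickle")

# `fsspec.open` inside `untar_model` handles local paths as well as S3 URLs
model = untar_model(str(artifact_dir / "model.tar.gz"))
print(type(model))  # LinearRegression
```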
- -#### Update the pipeline definition - -Open `src/kedro_tutorial/pipelines/data_science/pipeline.py` and replace its contents with the following: - -
-Click to expand - -```python -from kedro.pipeline import Pipeline, node - -from .nodes import ( - evaluate_model, - split_data, - train_model_sagemaker, - untar_model, -) - - -def create_pipeline(**kwargs): - return Pipeline( - [ - node( - func=split_data, - inputs=["model_input_table", "parameters"], - outputs=["X_train@pickle", "X_test", "y_train", "y_test"], - ), - node( - func=train_model_sagemaker, - inputs=["X_train@path", "params:sklearn_estimator_kwargs"], - outputs="model_path", - ), - node(untar_model, inputs="model_path", outputs="regressor"), - node( - func=evaluate_model, - inputs=["regressor", "X_test", "y_test"], - outputs=None, - ), - ] - ) -``` -
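As a quick, optional sanity check (a sketch, not a required step), you can instantiate the rewired pipeline in a Python session and inspect how Kedro resolved its inputs, for example that `model_input_table` and the SageMaker parameters appear as free inputs while the transcoded `X_train` dataset is handled internally:

```python
# Optional sanity check of the rewired data science pipeline; assumes it is
# run from the project root with the project package importable.
from kedro_tutorial.pipelines.data_science.pipeline import create_pipeline

pipeline = create_pipeline()

# human-readable summary of the nodes and their ordering
print(pipeline.describe())

# free inputs should include "model_input_table", "parameters"
# and "params:sklearn_estimator_kwargs"
print(pipeline.inputs())
```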
- -Great, you are almost ready to run your pipeline with the SageMaker integration. The last step before we can do that will be to create the entry point script. - -### Create the SageMaker entry point - -SageMaker job requires an entry point script that it will execute to perform the actual model training. This script will be automatically uploaded into S3 and run as part of the training job. SageMaker will also automatically [download the training data from S3](https://sagemaker.readthedocs.io/en/stable/overview.html#prepare-a-training-script) to the container before running the job and then upload the trained model artifact back into S3 after the job is complete. Therefore the entry point script does not need to worry about data transfer from and to S3, but it will need to serialise/deserialise such data using `pickle`. - -Create the file `src/kedro_tutorial/sagemaker_entry_point.py` and paste the following into it: - -
-Click to expand - -```python -import argparse -import pickle -from os import getenv -from pathlib import Path -from typing import Any - -from sklearn.linear_model import LinearRegression - - -def _pickle(path: Path, data: Any) -> None: - """Pickle the object and save it to disk""" - with path.open("wb") as f: - pickle.dump(data, f) - - -def _unpickle(path: Path) -> Any: - """Unpickle the object from a given file""" - with path.open("rb") as f: - return pickle.load(f) - - -def _get_arg_parser() -> argparse.ArgumentParser: - """Instantiate the command line argument parser""" - parser = argparse.ArgumentParser() - - parser.add_argument( - "--output-data-dir", type=str, default=getenv("SM_OUTPUT_DATA_DIR") - ) - parser.add_argument("--model-dir", type=str, default=getenv("SM_MODEL_DIR")) - parser.add_argument("--train", type=str, default=getenv("SM_CHANNEL_TRAIN")) - parser.add_argument("--test", type=str, default=getenv("SM_CHANNEL_TEST")) - - return parser - - -def main(): - """The main script entry point which will be called by SageMaker - when running the training job - """ - parser = _get_arg_parser() - args = parser.parse_args() - - data_path = Path(args.train) - X_train = _unpickle(data_path / "X_train.pickle") - y_train = _unpickle(data_path / "y_train.pickle") - - regressor = LinearRegression() - regressor.fit(X_train, y_train) - - model_dir = Path(args.model_dir) - _pickle(model_dir / "regressor.pickle", regressor) - - -if __name__ == "__main__": - # SageMaker will run this script as the main program - main() -``` -
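Before paying for a training job, you may want to check that the entry point runs at all. The sketch below is an assumption-laden local smoke test, not part of the official workflow: it fakes the SageMaker environment by pointing the `--train` and `--model-dir` arguments (which normally default to the `SM_CHANNEL_TRAIN` and `SM_MODEL_DIR` environment variables) at temporary directories containing tiny pickled inputs.

```python
# Hypothetical local smoke test for sagemaker_entry_point.py; run it from the
# project root. It only checks that the script trains and pickles a model.
import pickle
import subprocess
import tempfile
from pathlib import Path

import numpy as np

with tempfile.TemporaryDirectory() as train_dir, tempfile.TemporaryDirectory() as model_dir:
    # create tiny X_train / y_train pickles, as the SageMaker "train" channel would contain
    rng = np.random.default_rng(42)
    with open(Path(train_dir) / "X_train.pickle", "wb") as f:
        pickle.dump(rng.random((10, 3)), f)
    with open(Path(train_dir) / "y_train.pickle", "wb") as f:
        pickle.dump(rng.random(10), f)

    subprocess.run(
        [
            "python",
            "src/kedro_tutorial/sagemaker_entry_point.py",
            "--train",
            train_dir,
            "--model-dir",
            model_dir,
        ],
        check=True,
    )

    # the script should have written regressor.pickle into the model directory
    print(list(Path(model_dir).iterdir()))
```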
- -## Run the project - -You are now ready to run your project! To do that, execute the following CLI command: -```bash -kedro run --env sagemaker -``` - -The first 4 nodes of the pipeline will still run locally, but then you should see a similar output in the terminal: - -```console -2020-10-06 21:20:25,696 - kedro.runner.sequential_runner - INFO - Completed 4 out of 7 tasks -2020-10-06 21:20:25,696 - kedro.io.data_catalog - INFO - Loading data from `X_train@path` (MemoryDataSet)... -2020-10-06 21:20:25,696 - kedro.io.data_catalog - INFO - Loading data from `params:sklearn_estimator_kwargs` (MemoryDataSet)... -2020-10-06 21:20:25,697 - kedro.pipeline.node - INFO - Running node: train_model_sagemaker([X_train@path,params:sklearn_estimator_kwargs]) -> [model_path] -2020-10-06 21:20:25,713 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials -2020-10-06 21:20:25,793 - sagemaker.image_uris - INFO - Same images used for training and inference. Defaulting to image scope: inference. -2020-10-06 21:20:27,197 - sagemaker - INFO - Creating training-job with name: sagemaker-scikit-learn-2020-10-06-20-20-25-801 -2020-10-06 20:20:27 Starting - Starting the training job... -2020-10-06 20:20:33 Starting - Launching requested ML instances... -2020-10-06 20:21:31 Starting - Preparing the instances for training... -2020-10-06 20:22:27 Downloading - Downloading input data... -2020-10-06 20:23:02 Training - Downloading the training image... - -... [SageMaker Job Logs] ... - -2020-10-06 20:23:56 Uploading - Uploading generated training model -2020-10-06 20:23:56 Completed - Training job completed -2020-10-06 21:24:20,485 - kedro.io.data_catalog - INFO - Saving data to `model_path` (MemoryDataSet)... -2020-10-06 21:24:20,486 - kedro.runner.sequential_runner - INFO - Completed 5 out of 7 tasks -``` - -You should also find your training job if you open [SageMaker console](https://console.aws.amazon.com/sagemaker/home) and choose `Training jobs` tab from the left. - -Now you know how to run serverless machine learning jobs using SageMaker right from your Kedro pipeline! - -## Cleanup - -To cleanup the resources, simply delete the [S3 bucket](#create-s3-bucket) and, optionally, the [IAM role](#create-sagemaker-execution-role) you've created earlier (IAM resources are free). The job details of an already completed SageMaker training job cannot be deleted, but such jobs don't incur any costs. diff --git a/docs/source/11_tools_integration/02_ipython.md b/docs/source/11_tools_integration/02_ipython.md deleted file mode 100644 index 89c5616fc4..0000000000 --- a/docs/source/11_tools_integration/02_ipython.md +++ /dev/null @@ -1,403 +0,0 @@ -# Use Kedro with IPython and Jupyter Notebooks/Lab - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -This section follows the [Iris dataset example](../02_get_started/05_example_project.md) and demonstrates how to use Kedro with IPython and Jupyter Notebooks / Lab. We also recommend a video that explains the transition from the use of vanilla Jupyter Notebooks to using Kedro, from [Data Engineer One](https://www.youtube.com/watch?v=dRnCovp1GRQ&t=50s&ab_channel=DataEngineerOne). - - - - -## Why use a Notebook? 
-There are reasons why you may want to use a Notebook, although in general, the principles behind Kedro would discourage their use because they have some [drawbacks when they are used to create production or reproducible code](https://towardsdatascience.com/5-reasons-why-you-should-switch-from-jupyter-notebook-to-scripts-cb3535ba9c95). However, there are occasions when you'd want to put some code into a Notebook, for example: - -* To conduct exploratory data analysis -* For experimentation as you create new Python functions (nodes) -* As a tool for reporting and presentations - - -## Kedro and IPython - -You may want to use a Python kernel inside a Jupyter notebook (formerly known as IPython) to experiment with your Kedro code. - -To start a standalone IPython session, run the following command in the root directory of your Kedro project: - -```bash -kedro ipython -``` -This opens an iPython session in your shell, which you can terminate, when you have finished, by typing: - -```python -exit() -``` -### Load `DataCatalog` in IPython - -To test the IPython session, load the [Iris test example](https://www.kaggle.com/uciml/iris) data inside the IPython console as follows: - -```python -catalog.load("example_iris_data").head() -``` -You should see the following in your shell: - -```bash -kedro.io.data_catalog - INFO - Loading data from `example_iris_data` (CSVDataSet)... - - sepal_length sepal_width petal_length petal_width species -0 5.1 3.5 1.4 0.2 setosa -1 4.9 3.0 1.4 0.2 setosa -2 4.7 3.2 1.3 0.2 setosa -3 4.6 3.1 1.5 0.2 setosa -4 5.0 3.6 1.4 0.2 setosa -``` - - -#### Dataset versioning - -If you enable [versioning](../05_data/01_data_catalog.md#versioning-datasets-and-ml-models), you can load a particular version of a dataset. Given a catalog entry: - -```yaml -example_train_x: - type: pandas.CSVDataSet - filepath: data/02_intermediate/example_train_x.csv - versioned: true -``` - -and having run your pipeline at least once, you may specify which version to load: - -```python -catalog.load("example_train_x", version="2019-12-13T15.08.09.255Z") -``` - -## Kedro and Jupyter - -You may want to use Jupyter notebooks to experiment with your code as you develop new nodes for a pipeline, although you can write them as regular Python functions without a notebook. To use Kedro's Jupyter session: - -```bash -kedro jupyter notebook -``` - -This starts a Jupyter server and opens a window in your default browser. - -```eval_rst -.. note:: If you want Jupyter to listen to a different port number, then run ``kedro jupyter notebook --port ``. -``` - -Navigate to the `notebooks` folder of your Kedro project and create a new notebook. - -![](../meta/images/jupyter_create_new_notebook.png) - -```eval_rst -.. note:: The only kernel available by default has a name of the current project. If you need to access all available kernels, add ``--all-kernels`` to the command above. -``` - -Every time you start or restart a Jupyter or IPython session in the CLI using a `kedro` command, a startup script in `.ipython/profile_default/startup/00-kedro-init.py` is executed. It adds the following variables in scope: - -* `catalog` (`DataCatalog`) - Data catalog instance that contains all defined datasets; this is a shortcut for `context.catalog`, but it's only created at startup time, whereas `context.catalog` is rebuilt everytime. -* `context` (`KedroContext`) - Kedro project context that provides access to Kedro's library components. 
-* `session` (`KedroSession`) - Kedro session that orchestrates the run -* `startup_error` (`Exception`) - An error that was raised during the execution of the startup script or `None` if no errors occurred - -## How to use `context` - -The `context` variable allows you to interact with Kedro library components from within the Kedro Jupyter notebook. - -![context input graphic](../meta/images/jupyter_notebook_showing_context.png) - -With `context`, you can access the following variables and methods: - -- `context.project_path` (`Path`) - Root directory of the project -- `context.project_name` (`str`) - Project folder name -- `context.catalog` (`DataCatalog`) - An instance of [DataCatalog](/kedro.io.DataCatalog) -- `context.config_loader` (`ConfigLoader`) - An instance of [ConfigLoader](/kedro.config.ConfigLoader) -- `context.pipeline` (`Pipeline`) - The `__default__` pipeline - -### Run the pipeline - -If you wish to run the whole main pipeline within a notebook cell, you can do so by running: - -```python -session.run() -``` - -The command runs the nodes from your default project pipeline in a sequential manner. - -To parameterise your pipeline run, refer to [a later section on this page on run parameters](#additional-parameters-for-session-run) which lists all available options. - - -### Parameters - -The `context` object exposes the `params` property, which allows you to access all project parameters: - -```python -parameters = context.params # type: Dict -parameters["example_test_data_ratio"] -# returns the value of 'example_test_data_ratio' key from 'conf/base/parameters.yml' -``` - -```eval_rst -.. note:: You need to reload Kedro variables by calling `%reload_kedro` and re-run the code snippet above if you change the contents of ``parameters.yml``. -``` - -### Load/Save `DataCatalog` in Jupyter - -You can load a dataset defined in your `conf/base/catalog.yml`: - -```python -df = catalog.load("example_iris_data") -df.head() -``` - -![load the catalog and output head graphic](../meta/images/jupyter_notebook_workflow_loading_data.png) - -The save operation in the example below is analogous to the load. - -Put the following dataset entry in `conf/base/catalog.yml`: - -```yaml -my_dataset: - type: pandas.JSONDataSet - filepath: data/01_raw/my_dataset.json -``` - -Next, you need to reload Kedro variables by calling `%reload_kedro` line magic in your Jupyter notebook. - -Finally, you can save the data by executing the following command: - -```python -my_dict = {"key1": "some_value", "key2": None} -catalog.save("my_dataset", my_dict) -``` - -### Additional parameters for `session.run()` -You can also specify the following optional arguments for `session.run()`: - -```eval_rst -+---------------+----------------+-------------------------------------------------------------------------------+ -| Argument name | Accepted types | Description | -+===============+================+===============================================================================+ -| tags | Iterable[str] | Construct the pipeline using only nodes which have this tag attached. 
| -| | | A node is included in the resulting pipeline if it contains any of those tags | -+---------------+----------------+-------------------------------------------------------------------------------+ -| runner | AbstractRunner | An instance of Kedro [AbstractRunner](/kedro.runner.AbstractRunner); | -| | | can be an instance of a [ParallelRunner](/kedro.runner.ParallelRunner) | -+---------------+----------------+-------------------------------------------------------------------------------+ -| node_names | Iterable[str] | Run only nodes with specified names | -+---------------+----------------+-------------------------------------------------------------------------------+ -| from_nodes | Iterable[str] | A list of node names which should be used as a starting point | -+---------------+----------------+-------------------------------------------------------------------------------+ -| to_nodes | Iterable[str] | A list of node names which should be used as an end point | -+---------------+----------------+-------------------------------------------------------------------------------+ -| from_inputs | Iterable[str] | A list of dataset names which should be used as a starting point | -+---------------+----------------+-------------------------------------------------------------------------------+ -| to_outputs | Iterable[str] | A list of dataset names which should be used as an end point | -+---------------+----------------+-------------------------------------------------------------------------------+ -| to_outputs | Iterable[str] | A list of dataset names which should be used as an end point | -+---------------+----------------+-------------------------------------------------------------------------------+ -| load_versions | Dict[str, str] | A mapping of a dataset name to a specific dataset version (timestamp) | -| | | for loading - this applies to the versioned datasets only | -+---------------+----------------+-------------------------------------------------------------------------------+ -| pipeline_name | str | Name of the modular pipeline to run - must be one of those returned | -| | | by register_pipelines function from src//pipeline_registry.py | -+---------------+----------------+-------------------------------------------------------------------------------+ -``` - -This list of options is fully compatible with the list of CLI options for the `kedro run` command. In fact, `kedro run` is calling `session.run()` behind the scenes. - - -## Global variables - -Add customised global variables to `.ipython/profile_default/startup/00-kedro-init.py`. For example, if you want to add a global variable for `parameters` from `parameters.yml`, update `reload_kedro()` as follows: - -```python -@register_line_magic -def reload_kedro(project_path, line=None): - """Line magic which reloads all Kedro default variables.""" - # ... - global parameters - try: - # ... - session = KedroSession.create("", project_path) - _activate_session(session) - context = session.load_context() - parameters = context.params - # ... - logging.info( - "Defined global variable `context`, `session`, `catalog` and `parameters`" - ) - except: - pass -``` - - -## Convert functions from Jupyter Notebooks into Kedro nodes - -Built into the Kedro Jupyter workflow is the ability to convert multiple functions defined in the Jupyter notebook(s) into Kedro nodes. You need a single CLI command. 
- -Here is how it works: - -* Start a Jupyter notebook session: `kedro jupyter notebook` -* Create a new notebook and paste the following code into the first cell: - -```python -def some_action(): - print("This function came from `notebooks/my_notebook.ipynb`") -``` - -* Enable tags toolbar: `View` menu -> `Cell Toolbar` -> `Tags` -![Enable the tags toolbar graphic](../meta/images/jupyter_notebook_workflow_activating_tags.png) - -* Add the `node` tag to the cell containing your function -![Add the node tag graphic](../meta/images/jupyter_notebook_workflow_tagging_nodes.png) - -```eval_rst -.. tip:: The notebook can contain multiple functions tagged as ``node``, each of them will be exported into the resulting Python file -``` - -* Save your Jupyter notebook to `notebooks/my_notebook.ipynb` -* Run `kedro jupyter convert notebooks/my_notebook.ipynb` from the terminal to create a Python file `src//nodes/my_notebook.py` containing `some_action` function definition - - -```eval_rst -.. tip:: You can also convert all your notebooks at once by calling ``kedro jupyter convert --all``. -``` - -* The `some_action` function can now be used in your Kedro pipelines - -## IPython extension - -Kedro also has an IPython extension (`kedro.extras.extensions.ipython`) that allows you to start an `ipython` shell directly and then initialize `context`, `catalog`, and `session` variables. This can be used as a replacement for `.ipython/profile_default/startup/00-kedro-init.py`. - -When you start an `ipython` shell in a project root then you only need to load the extension to get the variables. - -```bash -cd -ipython - -In [1]: %load_ext kedro.extras.extensions.ipython -``` - -When you start an `ipython` shell outside a project root and load the extension the variables won't be loaded. -Run `%reload_kedro ` to get the variables, or `%init_kedro ` to set the project path for subsequent calls and then call simply `%reload_kedro` after that without having to specify the path. - -```ipython -In [1]: %load_ext kedro.extras.extensions.ipython -In [2]: %reload_kedro -``` - -or - -```ipython -In [1]: %load_ext kedro.extras.extensions.ipython -In [2]: %init_kedro -In [3]: %reload_kedro -``` - -```eval_rst - .. note:: Note that if you want to pass an argument to `reload_kedro` line magic function, you should call it like a normal Python function (e.g `reload_kedro(path, env=env, extra_params=extra_params)` rather than using `%reload_kedro` in a notebook cell (e.g. `%reload_kedro(path, extra_params=extra_params)` wouldn’t work). You might have to call `%automagic False` beforehand to make this work. -``` - -To configure the extension to be loaded automatically every time when you open an IPython shell, do the following: - -* Run `ipython profile create` to create the config file `~/.ipython/profile_default/ipython_config.py` if it doesn't exist -* Edit `~/.ipython/profile_default/ipython_config.py`: - - uncomment the extensions - - add Kedro extension to the list as follows: `c.InteractiveShellApp.extensions = ["kedro.extras.extensions.ipython"]` - -## IPython loader - -The script `tools/ipython/ipython_loader.py` helps to locate IPython startup directory and run all Python scripts in it when working with Jupyter notebooks and IPython sessions. It should work identically not just within a Kedro project, but also with any project that contains IPython startup scripts. 
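-
-As a rough illustration of the lookup such a loader performs, the sketch below walks up the directory tree until it finds a `.ipython/profile_default/startup` folder. This is a simplified, hypothetical example (the helper name `find_startup_dir` is made up, and it only lists the scripts it finds, whereas the real `ipython_loader.py` executes them):
-
-```python
-from pathlib import Path
-from typing import Optional
-
-
-def find_startup_dir(start: Optional[Path] = None) -> Optional[Path]:
-    """Walk up from `start` (defaults to the current working directory)
-    until a `.ipython/profile_default/startup` directory is found."""
-    current = (start or Path.cwd()).resolve()
-    for directory in (current, *current.parents):
-        candidate = directory / ".ipython" / "profile_default" / "startup"
-        if candidate.is_dir():
-            return candidate  # first match wins; directories higher up are ignored
-    return None
-
-
-startup_dir = find_startup_dir()
-if startup_dir is not None:
-    for script in sorted(startup_dir.glob("*.py")):
-        print(f"Startup script that would be run: {script}")
-```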
- -The script automatically locates the `.ipython/profile_default/startup` directory by starting from the current working directory and going up the directory tree. If the directory is found, all Python scripts in it are executed. - -```eval_rst -.. note:: This script will only run startup scripts from the first encountered ``.ipython/profile_default/startup`` directory. All consecutive ``.ipython`` directories higher up in the directory tree will be disregarded. -``` - - -### Installation - -To install this script simply download it into your default IPython config directory: - -```bash -mkdir -p ~/.ipython/profile_default/startup -wget -O ~/.ipython/profile_default/startup/ipython_loader.py https://raw.githubusercontent.com/quantumblacklabs/kedro/master/tools/ipython/ipython_loader.py -``` - -### Prerequisites - -For this script to work, the following conditions must be met: - -* Your project must contain the `.ipython/profile_default/startup` folder in its root directory. -* The Jupyter notebook should be saved inside the project root directory or within any nested subfolder of the project directory. -* An IPython interactive session should be started with the working directory pointing to the project root directory or any nested subdirectory. - -For example, given the following project structure: - -```console -new-kedro-project/ -├── .ipython -│   └── profile_default -│   └── startup -│   └── 00-kedro-init.py -├── conf/ -├── data/ -├── docs/ -├── logs/ -├── notebooks -│   └── subdir1 -│   └── subdir2 -└── src/ -``` - -If your `Notebook.ipynb` is placed anywhere in the following, `.ipython/profile_default/startup/00-kedro-init.py` will automatically be executed on every notebook startup: - -* `new-kedro-project/notebooks/` -* `new-kedro-project/notebooks/subdir1/` -* `new-kedro-project/notebooks/subdir1/subdir2/` -* or even `new-kedro-project/` (although this is strongly discouraged). - - -```eval_rst -.. note:: Given the example structure above, this script will not load your IPython startup scripts if the notebook is saved anywhere outside ``new-kedro-project`` directory. -``` - -### Troubleshooting and FAQs - -#### How can I stop my notebook terminating? - -If you close the notebook and its kernel is idle, it will be automatically terminated by the Jupyter server after 30 seconds of inactivity. However, if the notebook kernel is busy, it won't be automatically terminated by the server. - -You can change the timeout by passing `--idle-timeout=` option to `kedro jupyter notebook` or `kedro jupyter lab` call. If you set `--idle-timeout=0`, this will disable automatic termination of idle notebook kernels. - -#### Why can't I run `kedro jupyter notebook`? - -In certain cases, you may not be able to run `kedro jupyter notebook`, which means that you have to work in a standard Jupyter session. This may be because you don't have a CLI access to the machine where the Jupyter server is running or you've opened a Jupyter notebook by running `jupyter notebook` from the terminal. 
In that case, you can create a `context` variable yourself by running the following block of code at the top of your notebook: - -```python -from pathlib import Path -from kedro.framework.session import KedroSession -from kedro.framework.session.session import _activate_session - -current_dir = Path.cwd() # this points to 'notebooks/' folder -project_path = current_dir.parent # point back to the root of the project -session = KedroSession.create("", project_path) -_activate_session(session) -context = session.load_context() -``` - -#### How can I reload the `session`, `context`, `catalog` and `startup_error` variables? - -To reload these variables at any point (e.g., if you update `catalog.yml`), use the [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) `%reload_kedro`. This magic can also be used to see the error message if any of the variables above are undefined. - -![reload kedro line magic graphic](../meta/images/jupyter_notebook_loading_context.png) - -Note that if you want to pass an argument to `reload_kedro` line magic function, you should call it like a normal Python function (e.g `reload_kedro(extra_params=extra_params)` rather than using `%reload_kedro` in a notebook cell (e.g. `%reload_kedro(extra_params=extra_params)` wouldn't work). - -If the `KEDRO_ENV` environment variable is specified, the startup script loads that environment, otherwise it defaults to `local`. Instructions for setting the environment variable can be found in the [Kedro configuration documentation](../04_kedro_project_setup/02_configuration.md#additional-configuration-environments). - -### Kedro-Viz and Jupyter - -If you have [Kedro-Viz](https://github.com/quantumblacklabs/kedro-viz) installed then you can display an interactive visualisation of your pipeline directly in your notebook using the [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) `%run_viz`. You should see a visualisation like the following: - -![](../meta/images/jupyter_notebook_kedro_viz.png) diff --git a/docs/source/12_faq/01_faq.md b/docs/source/12_faq/01_faq.md deleted file mode 100644 index ed06429ed3..0000000000 --- a/docs/source/12_faq/01_faq.md +++ /dev/null @@ -1,189 +0,0 @@ -# Frequently asked questions - -The following lists a set of questions that we have been asked about Kedro in the past. If you have a different - question which isn't answered here, check out [GitHub Discussions](https://github.com/quantumblacklabs/kedro/discussions) or talk to the community on the [Discord Server](https://discord.gg/akJDeVaxnB). -## What is Kedro? - -Kedro is an open-source Python framework for creating reproducible, maintainable and modular data science code. It - borrows concepts from software engineering and applies them to machine-learning code; applied concepts include - modularity, separation of concerns and versioning. - -For the source code, take a look at the [Kedro repository on Github](https://github.com/quantumblacklabs/kedro). - -## Who maintains Kedro? - -Kedro was originally designed by [Aris Valtazanos](https://github.com/arisvqb) and [Nikolaos Tsaousis](https://github.com/tsanikgr) to solve challenges they faced in their project work. - -Their work was later turned into an internal product by [Peteris Erins](https://github.com/Pet3ris), [Ivan Danov](https://github.com/idanov), [Nikolaos Kaltsas](https://github.com/nikos-kal), [Meisam Emamjome](https://github.com/misamae) and [Nikolaos Tsaousis](https://github.com/tsanikgr). 
- -Currently, the core Kedro team consists of -[Yetunde Dada](https://github.com/yetudada), -[Ivan Danov](https://github.com/idanov), -[Richard Westenra](https://github.com/richardwestenra), -[Lorena Balan](https://github.com/lorenabalan), -[Lim Hoang](https://github.com/limdauto), -[Jo Stichbury](https://github.com/stichbury), -[Merel Theisen](https://github.com/MerelTheisenQB), -[Gabriel Comym](https://github.com/GabrielComymQB), -[Liam Brummitt](https://github.com/bru5), -[Susanna Wong](https://github.com/studioswong), -[Rashida Kanchwala](https://github.com/rashidakanchwala), -[Joel Schwarzmann](https://github.com/datajoely), -[Antony Milne](https://github.com/AntonyMilneQB), -[Jiri Klein](https://github.com/jiriklein), -[Ignacio Paricio](https://github.com/ignacioparicio) and -[Hamza Oza](https://github.com/hamzaoza). - -Former core team members with significant contributions include: -[Gordon Wrigley](https://github.com/tolomea), -[Nasef Khan](https://github.com/nakhan98), -[Anton Kirilenko](https://github.com/Flid), -[Zain Patel](https://github.com/mzjp2), -[Laís Carvalho](https://github.com/laisbsc), -[Kiyohito Kunii](https://github.com/921kiyo), -[Dmitrii Deriabin](https://github.com/dmder), -[Andrii Ivaniuk](https://github.com/andrii-ivaniuk). - -And last, but not least, all the open-source contributors whose work went into all Kedro [releases](https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md). - -## What are the primary advantages of Kedro? - -If you're a Data Scientist, then you should be interested in Kedro because it enables you to: - -- **Write cleaner code,** so that your Python code is easy to maintain and re-run in future; it does this by applying standardisation and software-engineering best practices -- **Make a seamless transition from development to production,** as you can write quick, throw-away exploratory code and - transition to maintainable, easy-to-share, code experiments quickly -- **Stay current in machine learning operations [(MLOps)](https://en.wikipedia.org/wiki/MLOps),** as Kedro takes care - of the principles you need to create data science code that lasts; you'll always be two steps in front of industry standards -- **Integrate with your data science workflow,** and use tools in the data science ecosystem, like Tensorflow, SciKit-Learn or Jupyter notebooks for experimentation. You can also take advantage of tools to produce for producing - quality code like Sphinx (documentation); `black`, `isort` and `flake8` (code linting and formatting); and,`pytest` (unit tests) - -If you're a Machine-Learning Engineer or Data Engineer, then you should be interested in Kedro because: - -- **Standardisation creates efficiency,** establishing proper analytics code foundations can save up to 80% of your hours down the road when putting models in production -- **You can focus on solving problems, not setting up projects,** Kedro provides the scaffolding to build more - complex data and machine-learning pipelines. 
There's a focus on spending less time on the tedious "plumbing" required to maintain analytics code; this means that you have more time to solve new problems -- **A data-driven framework makes pipelines easy,** by permitting data versioning, incremental computing and automatic pipeline running order resolution -- **It is platform-agnostic,** allowing you to choose what compute or platform to run your Kedro workflow; Databricks - and products like Kubeflow, Argo, Prefect and Airflow are deployment targets -- **It is easy to extend**, by using Hooks to add in tools like [MLFlow](https://mlflow.org/) (experiment tracking), [Great Expectations](https://greatexpectations.io/) (data validation and profiling) and [Grafana](https://grafana.com/) (pipeline monitoring) - -If you're a Project Lead, then you should be interested in Kedro because: - -- **It allows for effortless teamwork and an ability to scale analytics across an organisation.** Kedro standardises team workflows; the modular structure of Kedro facilitates a higher level of collaboration when teams solve problems together -- We stand for **no more fire drills.** You can remove long delays created because you have to refactor a data - science proof of concept into production -- **You don't need to start from scratch,** standardisation and separation of concerns makes it possible to reuse analytics code -- **See your project like never before,** Kedro’s pipeline visualization plugin lets you see a blueprint of your team's developing workflows and better collaborate with business stakeholders - -## How does Kedro compare to other projects? - -Some of our open-source users have called Kedro, the [React](https://medium.com/quantumblack/beyond-the-notebook-and-into-the-data-science-framework-revolution-a7fd364ab9c4) or Django for data science code and we think it's a - suitable framing for who we are. We exist to standardise how data science code is created. - -Everyone sees the pipeline abstraction in Kedro and gets excited, thinking that we're similar to orchestrators like - Airflow, Luigi, Prefect, Dagster, Flyte, Kubeflow and more. We focus on a different problem, which is the process of - _authoring_ pipelines, as opposed to _running, scheduling and monitoring_ them. - -The responsibility of _"What time will this pipeline run?"_, _"How do I manage my compute?"_ and _"How will I know if it - failed?"_ is left to the orchestrators. We also have deployment guidelines for using orchestrators as deployment - targets and are working in collaboration with the maintainers of some of those tools to make the deployment experience as enjoyable as possible. - -## What is data engineering convention? - -[Bruce Philp](https://github.com/bruceaphilp) and [Guilherme Braccialli](https://github.com/gbraccialli-qb) at -[QuantumBlack](https://github.com/quantumblacklabs) are the brains behind a layered data-engineering convention as a model of managing data. You can find an [in-depth walk through of their convention](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71) as a blog post on Medium. - -Refer to the following table below for a high level guide to each layer's purpose - -```eval_rst -.. note:: The data layers don’t have to exist locally in the ``data`` folder within your project, but we recommend that you structure your S3 buckets or other data stores in a similar way. 
-``` - -![](../meta/images/data_engineering_convention.png) - -```eval_rst -+----------------+---------------------------------------------------------------------------------------------------+ -| Folder in data | Description | -+================+===================================================================================================+ -| Raw | Initial start of the pipeline, containing the sourced data model(s) that should never be changed, | -| | it forms your single source of truth to work from. These data models are typically un-typed in | -| | most cases e.g. csv, but this will vary from case to case. | -+----------------+---------------------------------------------------------------------------------------------------+ -| Intermediate | Optional data model(s), which are introduced to type your :code:`raw` data model(s), e.g. | -| | converting string based values into their current typed representation. | -+----------------+---------------------------------------------------------------------------------------------------+ -| Primary | Domain specific data model(s) containing cleansed, transformed and wrangled data from either | -| | :code:`raw` or :code:`intermediate`, which forms your layer that you input into your feature | -| | engineering. | -+----------------+---------------------------------------------------------------------------------------------------+ -| Feature | Analytics specific data model(s) containing a set of features defined against the :code:`primary` | -| | data, which are grouped by feature area of analysis and stored against a common dimension. | -+----------------+---------------------------------------------------------------------------------------------------+ -| Model input | Analytics specific data model(s) containing all :code:`feature` data against a common dimension | -| | and in the case of live projects against an analytics run date to ensure that you track the | -| | historical changes of the features over time. | -+----------------+---------------------------------------------------------------------------------------------------+ -| Models | Stored, serialised pre-trained machine learning models. | -+----------------+---------------------------------------------------------------------------------------------------+ -| Model output | Analytics specific data model(s) containing the results generated by the model based on the | -| | :code:`model input` data. | -+----------------+---------------------------------------------------------------------------------------------------+ -| Reporting | Reporting data model(s) that are used to combine a set of :code:`primary`, :code:`feature`, | -| | :code:`model input` and :code:`model output` data used to drive the dashboard and the views | -| | constructed. It encapsulates and removes the need to define any blending or joining of data, | -| | improve performance and replacement of presentation layer without having to redefine the data | -| | models. | -+----------------+---------------------------------------------------------------------------------------------------+ -``` - -## How do I upgrade Kedro? - -We use [Semantic Versioning](https://semver.org/). The best way to safely upgrade is to check our [release notes](https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md) for any notable breaking changes. Follow the steps in the migration guide included for that specific release. 
- -Once Kedro is installed, you can check your version as follows: - -``` -kedro --version -``` - -To later upgrade Kedro to a different version, simply run: - -``` -pip install kedro -U -``` - -When migrating an existing project to a newer Kedro version, make sure you also update the `project_version` in your `pyproject.toml` file from the project root directory or, for projects generated with Kedro<0.17.0, in your `ProjectContext`, which is found in `src//run.py`. - -## How can I use a development version of Kedro? - -> *Important:* The development version of Kedro is not guaranteed to be bug-free and/or compatible with any of the [stable versions](https://pypi.org/project/kedro/#history). We do not recommend that you use a development version of Kedro in any production systems. Please install and use with caution. - -If you want to try out the latest, most novel functionality of Kedro which has not been released yet, you can run the following installation command: - -```console -pip install git+https://github.com/quantumblacklabs/kedro.git@develop -``` - -This will install Kedro from the `develop` branch of the GitHub repository, which is always the most up to date. This command will install Kedro from source, unlike `pip install kedro` which installs from PyPI. - -If you want to rollback to the stable version of Kedro, execute the following in your environment: - -```console -pip uninstall kedro -y -pip install kedro -``` - -## How can I find out more about Kedro? - -There are a host of articles, podcasts, talks and Kedro showcase projects in the [`kedro-community`](https://github.com/quantumblacklabs/kedro-community) repository. - -Our preferred Kedro-community channel for feedback is through [GitHub issues](https://github.com/quantumblacklabs/kedro/issues). We update the codebase regularly; you can find news about updates and features in the [RELEASE.md file on the Github repository](https://github.com/quantumblacklabs/kedro/blob/develop/RELEASE.md). - -## How can I cite Kedro? - -If you're an academic, Kedro can also help you, for example, as a tool to solve the problem of reproducible research. Find our citation reference on [Zenodo](https://zenodo.org/record/4336685). - -## How can I get my question answered? - -If your question isn't answered above, check out [GitHub Discussions](https://github.com/quantumblacklabs/kedro/discussions) or talk to the community on the [Discord Server](https://discord.gg/akJDeVaxnB). diff --git a/docs/source/12_faq/02_architecture_overview.md b/docs/source/12_faq/02_architecture_overview.md deleted file mode 100644 index 9d082192b8..0000000000 --- a/docs/source/12_faq/02_architecture_overview.md +++ /dev/null @@ -1,51 +0,0 @@ -# Kedro architecture overview - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.4``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -![Kedro architecture diagram](../meta/images/kedro_architecture.png) - -At a high level, Kedro consists of five main parts: - -### Kedro project - -As a data pipeline developer, you will interact with a Kedro project, which consists of: - -* The **`conf/`** directory, which contains configuration for the project, such as data catalog configuration, parameters, etc. -* The **`src`** directory, which contains the source code for the project, including: - * The **`pipelines`** directory, which contains the source code for your pipelines. 
- * **`settings.py`** file contains the settings for the project, such as library component registration, custom hooks registration, etc. - * **`hooks.py`**, which contains custom [Hooks implementations](../07_extend_kedro/02_hooks) in the project, including both registration hooks and extension hooks. - * **`pipeline_registry.py`** file defines the project pipelines, i.e. pipelines that can be run using `kedro run --pipeline`. - * **`__main__.py`** file serves as the main entry point of the project in [package mode](../03_tutorial/05_package_a_project.md#package-your-project). -* **`pyproject.toml`** identifies the project root by providing project metadata, including: - * `package_name`: A valid Python package name for your project package - * `project_name`: A human readable name for your project - * `project_version`: Kedro version with which the project was generated - -### Kedro starter - -You can use a [Kedro starter](../02_get_started/06_starters) to generate a Kedro project that contains boilerplate code. We maintain a set of [official starters](https://github.com/quantumblacklabs/kedro-starters/) but you can also use a custom starter of your choice. - -### Kedro library - -Kedro library consists of independent units, each responsible for one aspect of computation in a data pipeline: - -* **`Config Loader`** provides utility to parse and load configuration defined in a Kedro project. -* **`Pipeline`** provides a collection of abstractions to model data pipelines. -* **`Runner`** provides an abstraction for different execution strategy of a data pipeline. -* **`I/O`** provides a collection of abstractions to handle I/O in a project, including `DataCatalog` and many `DataSet` implementations. - -### Kedro framework - -Kedro framework serves as the interface between a Kedro project and Kedro library components. The major building blocks of the Kedro framework include: - -* **`Session`** is responsible for managing the lifecycle of a Kedro run. -* **`Context`** holds the configuration and Kedro's main functionality, and also serves as the main entry point for interactions with core library components. -* **`Hooks`** defines all hook specifications available to extend Kedro. -* **`CLI`** defines built-in Kedro CLI commands and utilities to load custom CLI commands from plugins. - -### Kedro extension - -You can also extend Kedro behaviour in your project using a Kedro extension, which can be a custom starter, a Python library with extra hooks implemenations, extra CLI commands such as [Kedro-Viz](https://github.com/quantumblacklabs/kedro-viz) or a custom library component implementation. diff --git a/docs/source/12_faq/03_kedro_principles.md b/docs/source/12_faq/03_kedro_principles.md deleted file mode 100644 index 2891aed5d7..0000000000 --- a/docs/source/12_faq/03_kedro_principles.md +++ /dev/null @@ -1,24 +0,0 @@ -# Kedro Principles - -After a long discussion, we wrote this set of principles to summarise our development philosophy and to guide us through future decisions about Kedro. - -### 1. Modularity at the core ️📦 -Modularity allows for easy construction, flexible arrangements and reusability of components, resulting in an extensible and customisable system. Kedro is built around the idea of enabling modular data engineering and data science code. To make this possible, we take this as our core tenet and make sure Kedro’s own components are modular and independent of each other as much as possible. Each component has clearly defined responsibilities and narrow interfaces. 
We aim to make most of our components highly decoupled from each other and ensure they can be used on their own. - -### 2. Grow beginners into experts 🌱 -Every user is on a learning journey and Kedro aims to be the perfect vehicle for such an adventure. We want Kedro to be loved by users across all different levels of experience. Kedro should be your companion as a beginner, taking your first steps into building data products, or as an expert user, well-seasoned in taking machine-learning models into production. - -### 3. User empathy without unfounded assumptions 🤝 -Kedro is designed with the user in mind, but makes no guesses about what the user has in mind. We strive to understand our users without claiming we are one with them. Our users trust us with their time to learn our API and we should make sure we spend their time wisely. All our assumptions should be grounded on extensive experience and hard data. - -### 4. Simplicity means bare necessities 🍞 -We believe that simplicity is “attained not when there is no longer anything to add, but when there is no longer anything to take away”, very much like Antoine de Saint-Exupéry’s definition of perfection. Simplicity is hard to achieve, but once achieved it is easy to understand and rely on. In our pursuit of simplicity at Kedro we start by defining it as something composed of small number of parts, with small number of features or functional branches and having very little optionality. Simple things are easy, robust, reliable and loved by everyone. They can be used in countless ways on their own or effortlessly become a part of a more complex system since they are modular by nature. - -### 5. There should be one obvious way of doing things 🎯 -Inspired by The Zen of Python, we recommend certain ways of accomplishing tasks. We do this because it allows users to focus on their original problem rather than deal with accidental complexity. That doesn’t mean that it will be impossible to do things using a different way; but, as one becomes more accustomed to Kedro, it will become apparent that there is a preferred way of doing things. Kedro is an opinionated framework, and this is built into its design. - -### 6. A sprinkle of magic is better than a spoonful of it ✨ -The declarative nature of Kedro introduces some magic by hiding the imperative implementation details. However, we recognise that this departure from Python’s preference for explicit solutions can be taken too far and quickly spiral into “dark magic”. Dark magic introduces confusion for the users and can make it easy for them to get lost in their own project. That’s why we have a strong preference for common sense over dark magic and making things obvious rather than clever. Nevertheless, magic is sometimes justified if it simplifies how things work. We promise to use it sparingly and only for good. - -### 7. Lean process and lean product 👟 -Kedro started as a small framework which tackles big problems in the delivery of data science projects from inception to production. We fully subscribe to the principles of lean software development and do our best to eliminate waste as much as possible. We favour small incremental changes over big bang deliveries of functionality and in general we strive to achieve more with less. 
diff --git a/docs/source/13_resources/01_logos.md b/docs/source/13_resources/01_logos.md deleted file mode 100644 index fb9024e472..0000000000 --- a/docs/source/13_resources/01_logos.md +++ /dev/null @@ -1,18 +0,0 @@ -# Images and icons - -## White background - -### Icon -![Icon on white background](../meta/images/kedro_icon_no-type_whitebg.svg) - -### Icon with text -![Icon with text on white background](../meta/images/kedro_icon_type_whitebg.svg) - -## Black background - -### Icon -![Icon on black background](../meta/images/kedro_icon_no-type_blackbg.svg) - - -### Icon with text -![Icon with text on black background](../meta/images/kedro_icon_type_blackbg.svg) diff --git a/docs/source/14_contribution/01_contribute_to_kedro.md b/docs/source/14_contribution/01_contribute_to_kedro.md deleted file mode 100644 index b96eb318a6..0000000000 --- a/docs/source/14_contribution/01_contribute_to_kedro.md +++ /dev/null @@ -1,11 +0,0 @@ -# Introduction - -We welcome any and all contributions to Kedro, at whatever level you can manage. For example, you could: - -- Join the community on [Discord](https://discord.gg/akJDeVaxnB) -- Troubleshoot other users' questions or get answers to your own queries on [GitHub discussions](https://github.com/quantumblacklabs/kedro/discussions) -- Make a pull request on the [kedro-community Github repo](https://github.com/quantumblacklabs/kedro-community) to update the curated list of Kedro community content. -- Report a bug or propose a new feature on [GitHub issues](https://github.com/quantumblacklabs/kedro/issues) -- [Review other contributors' PRs](https://github.com/quantumblacklabs/kedro/pulls) -- [Contribute code](./02_developer_contributor_guidelines.md), for example to fix a bug or add a feature -- [Contribute to the documentation](04_documentation_contributor_guidelines.md) diff --git a/docs/source/14_contribution/02_developer_contributor_guidelines.md b/docs/source/14_contribution/02_developer_contributor_guidelines.md deleted file mode 100644 index 09b0d07b70..0000000000 --- a/docs/source/14_contribution/02_developer_contributor_guidelines.md +++ /dev/null @@ -1,213 +0,0 @@ -# Guidelines for contributing developers - -This page explains the principles and development process that we ask contributing developers to follow. - -**Any contributions you make will be under the [Apache 2.0 Software License](https://github.com/quantumblacklabs/kedro/blob/master/LICENSE.md)** - -In short, when you submit code changes, your submissions are understood to be under the same the [Apache 2.0 License](https://github.com/quantumblacklabs/kedro/blob/master/LICENSE.md) that covers the Kedro project. You should have permission to share the submitted code. - -Each code file should have a legal header, i.e. the content of [`LICENSE.md`](https://github.com/quantumblacklabs/kedro/blob/master/LICENSE.md). -There is an automated check to verify that it exists. The check will highlight any issues and suggest a solution. - -```eval_rst -.. note:: You don't need to contribute code to help the Kedro project. See our list of other ways `you can contribute to Kedro `_. -``` - -## Introduction - -This guide is a practical description of: - -* How to set up your development environment to contribute to Kedro. -* How to prepare a pull request against the Kedro repository. - - -## Before you start: development set up - -To work on the Kedro codebase, you will need to be set up with Git, and Make. - -```eval_rst -.. 
note:: If your development environment is Windows, you can use the ``win_setup_conda`` and ``win_setup_env`` commands from `Circle CI configuration `_ to guide you in the correct way to do this. -``` - -You will also need to create and activate virtual environment. If this is unfamiliar to you, read through our [pre-requisites documentation](../02_get_started/01_prerequisites.md). - -Next, you'll need to fork the [Kedro source code from the Github repository](https://github.com/quantumblacklabs/kedro): - -* Fork the project by clicking **Fork** in the top-right corner of the [Kedro GitHub repository](https://github.com/quantumblacklabs/kedro) -* Choose your target account - -If you need further guidance, consult the [Github documentation about forking a repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo#forking-a-repository). - -You are almost ready to go. In your terminal, navigate to the folder into which you forked the Kedro code. - -Run these commands to install everything you need to work with Kedro: - -``` -make install-test-requirements -make install-pre-commit -``` - -Once the above commands have executed successfully, do a sanity check to ensure that `kedro` works in your environment: - -``` -make test -``` - -```eval_rst -.. note:: If the tests in ``tests/extras/datasets/spark`` are failing, and you are not planning to work on `Spark `_ related features, then you can run a reduced test suite that excludes them. Do this by executing ``make test-no-spark``. -``` - -## Get started: areas of contribution - -Once you are ready to contribute, a good place to start is to take a look at the `good first issues` and `help wanted issues` on [GitHub](https://github.com/quantumblacklabs/kedro/issues). - -We focus on three areas for contribution: `core`, `extras` and `plugin`: - -- `core` refers to the primary Kedro library. Read the [`core` contribution process](#core-contribution-process) for details. -- `extras` refers to features that could be added to `core` that do not introduce too many dependencies or require new Kedro CLI commands to be created e.g. [adding a new dataset](../07_extend_kedro/03_custom_datasets.md) to the `kedro.extras.dataset` data management module. All the datasets are placed under `kedro.extras.datasets` to separate heavy dependencies (e.g Pandas) from Kedro `core` components. Read the [`extras` contribution process](#extras-contribution-process) for more information. -- [`plugin`](../07_extend_kedro/04_plugins.md) refers to new functionality that requires a Kedro CLI command e.g. adding in Airflow functionality. The [`plugin` development documentation](../07_extend_kedro/04_plugins.md) contains guidance on how to design and develop a Kedro `plugin`. - - -### `core` contribution process - -Typically, we only accept small contributions to the `core` Kedro library but we accept new features as plugins or additions to the [`extras`](https://github.com/quantumblacklabs/kedro/tree/master/kedro/extras) module. - -To contribute: - -1. Create a feature branch on your forked repository and push all your local changes to that feature branch. -2. Is your change [non-breaking and backwards-compatible](./03_backwards_compatibility.md)? Your feature branch should branch off from: -
-   * `master` if you intend for it to be a non-breaking, backwards-compatible change.
-   * `develop` if you intend for it to be a breaking change.
-3. Before you submit a pull request (PR), please ensure that unit tests, end-to-end (E2E) tests and linting are passing for your changes by running `make test`, `make e2e-tests` and `make lint` locally; see the section [Running checks locally](#ci--cd-and-running-checks-locally) below.
-4. Open a PR:
-   * For backwards compatible changes, open a PR against the `quantumblacklabs:master` branch from your feature branch.
-   * For changes that are NOT backwards compatible, open a PR against the `quantumblacklabs:develop` branch from your feature branch.
-
-5. Await reviewer comments.
-6. Update the PR according to the reviewer's comments.
-7. Your PR will be merged by the Kedro team once all the comments are addressed.
-
-```eval_rst
-.. note:: We will work with you to complete your contribution but we reserve the right to take over abandoned PRs.
-```
-
-### `extras` contribution process
-
-You can add new work to `extras` if you do not need to create a new Kedro CLI command:
-
-1. Create an [issue](https://github.com/quantumblacklabs/kedro/issues) describing your contribution.
-2. Work in [`extras`](https://github.com/quantumblacklabs/kedro/tree/master/kedro/extras): create a feature branch on your forked repository and push all your local changes to that feature branch.
-3. Before you submit a pull request, please ensure that unit tests, E2E tests and linting are passing for your changes by running `make test`, `make e2e-tests` and `make lint` locally; see the section [Running checks locally](#ci--cd-and-running-checks-locally) below.
-4. Include a `README.md` with instructions on how to use your contribution.
-5. Is your change [non-breaking and backwards-compatible](./03_backwards_compatibility.md)?
-   * For backwards compatible changes, open a PR against the `quantumblacklabs:master` branch from your feature branch.
-   * For changes that are NOT backwards compatible, open a PR against the `quantumblacklabs:develop` branch from your feature branch.
- -6. Reference your issue in the PR description (e.g., `Resolves #`). -7. Await review comments, then update the PR according to the reviewer's comments. -8. Your PR will be merged by the Kedro team once all the comments are addressed. - -```eval_rst -.. note:: We will work with you to complete your contribution but we reserve the right to take over abandoned PRs. -``` - -## Create a pull request - -Create your pull request with a descriptive title. Before you submit it, consider the following: - -* You should aim for cross-platform compatibility on Windows, macOS and Linux -* We use [SemVer](https://semver.org/) for versioning -* We have designed our code to be compatible with Python 3.6 onwards and our style guidelines are (in cascading order): - * [PEP 8 conventions](https://www.python.org/dev/peps/pep-0008/) for all Python code - * [Google docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) for code comments - * [PEP 484 type hints](https://www.python.org/dev/peps/pep-0484/) for all user-facing functions / class methods e.g. - -``` -def count_truthy(elements: List[Any]) -> int: - return sum(1 for elem in elements if element) -``` - -Ensure that your PR builds cleanly before you submit it, by running the CI/CD checks locally, as follows: - -To run E2E tests you need to install the test requirements which includes `behave`. -We also use [pre-commit](https://pre-commit.com) hooks for the repository to run the checks automatically. - -```eval_rst -.. note:: If Spark/PySpark/Hive tests for datasets are failing it might be due to the lack of Java>8 support from Spark. You can try using ``export JAVA_HOME=$(/usr/libexec/java_home -v 1.8)`` which `works under macOS or other workarounds `_. -``` - -#### PEP-8 Standards (`pylint` and `flake8`) - -```bash -make lint -``` - -#### Unit tests, 100% coverage (`pytest`, `pytest-cov`) - -You need the dependencies from `test_requirements.txt` installed. - -```bash -make test -``` - -```eval_rst -.. note:: We place `conftest.py `_ files in some test directories to make fixtures reusable by any tests in that directory. If you need to see which test fixtures are available and where they come from, you can issue the following command ``pytest --fixtures path/to/the/test/location.py``. -``` - -#### E2E tests (`behave`) - -```bash -behave -``` - -#### Others - -Our CI / CD also checks that `kedro` installs cleanly on a fresh Python virtual environment, a task which depends on successfully building the documentation: - -```bash -make build-docs -``` - -```eval_rst -.. note:: This command will only work on Unix-like systems and requires ``pandoc`` to be installed. -``` - -### Hints on pre-commit usage - -The checks will automatically run on all the changed files on each commit. -Even more extensive set of checks (including the heavy set of `pylint` checks) -will run before the push. - -The pre-commit/pre-push checks can be omitted by running with `--no-verify` flag, as per below: - -```bash -git commit --no-verify <...> -git push --no-verify <...> -``` -(`-n` alias works for `git commit`, but not for `git push`) - -All checks will run during CI build, so skipping checks on push will -not allow you to merge your code with failing checks. - -You can uninstall the pre-commit hooks by running: - -```bash -make uninstall-pre-commit -``` -`pre-commit` will still be used by `make lint`, but will not install the git hooks. - - -## Need help? - -Working on your first pull request? 
You can learn how from these resources:
-
-* [First timers only](https://www.firsttimersonly.com/)
-* [How to contribute to an open source project on GitHub](https://egghead.io/courses/how-to-contribute-to-an-open-source-project-on-github)
-
-Please check the Q&A on [GitHub discussions](https://github.com/quantumblacklabs/kedro/discussions) and ask any new questions about the development process there too!
diff --git a/docs/source/14_contribution/04_documentation_contributor_guidelines.md b/docs/source/14_contribution/04_documentation_contributor_guidelines.md
deleted file mode 100644
index c087407220..0000000000
--- a/docs/source/14_contribution/04_documentation_contributor_guidelines.md
+++ /dev/null
@@ -1,154 +0,0 @@
-# Contribute to the Kedro documentation
-
-You are welcome to contribute to the Kedro documentation if you find something incorrect or missing, or have other improvement suggestions.
-
-You can tell us what we should change, or make a PR to change it yourself.
-
-Before you contribute any documentation changes, please read this page so you are familiar with the [Kedro documentation style guidelines](#kedro-documentation-style-guide).
-
-## How do I rebuild the documentation after I make changes to it?
-
-Our documentation is written in Markdown and built by Sphinx, coordinated by a [build script](https://github.com/quantumblacklabs/kedro/blob/master/docs/build-docs.sh).
-
-If you make changes to the markdown for the Kedro documentation, you can rebuild it within a Unix-like environment (with `pandoc` installed).
-
-If you are a Windows user, you can still contribute to the documentation, but you cannot rebuild it. This is fine! As long as you have made an effort to verify that your Markdown is rendering correctly, and you have followed our basic guidelines, we will be happy to take your final draft as a pull request and rebuild it for you.
-
-The following instructions are specifically for people working with documentation who may not already have a development setup. If you are comfortable with virtual environments, cloning and branching from a git repo and using `make`, you don't need them and can probably jump to the section called [Build the documentation](#build-the-documentation).
-
-### Set up to build Kedro documentation
-
-Follow the setup instructions in the [developer contributor guide](./02_developer_contributor_guidelines.md#before-you-start-development-set-up)
-to fork the Kedro repo, create and activate a Python virtual environment and install the dependencies necessary to build the documentation.
-
-
-### Build the documentation
-
-**macOS users** can use `make` commands to build the documentation:
-
-```bash
-make build-docs
-```
-
-The build will take a few minutes to finish, and a successful result is a set of HTML documentation in `docs/build/html`, which you can review by navigating to the following file and opening it: `docs/build/html/index.html`.
-
-
-## Extend Kedro documentation
-
-### Add new pages
-
-All Kedro documentation is collated and built from a single index file, [`index.rst`](https://github.com/quantumblacklabs/kedro/blob/master/docs/source/index.rst) found in the `docs/source` folder.
-
-If you add extra pages of documentation, always include them in the `index.rst` table of contents so that Sphinx knows to build them alongside the rest of the documentation, as in the sketch below.
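-
-For illustration, a new page saved as `docs/source/07_extend_kedro/05_new_page.md` could be listed in a `toctree` entry along the following lines (the caption and neighbouring entries here are hypothetical, not the exact contents of `index.rst`):
-
-```text
-.. toctree::
-   :maxdepth: 2
-   :caption: Extend Kedro
-
-   07_extend_kedro/04_plugins
-   07_extend_kedro/05_new_page
-```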
- -### Move or remove pages - -To move or remove a page of documentation, first locate it in the repo, and also locate where it is specified in the `index.rst` or `.rst` for the relevant section within the table of contents. - -### Create a pull request - -You need to submit any changes to the documentation via a branch. - -[Find out more about the process of submitting a PR to the Kedro project](./02_developer_contributor_guidelines.md). - -### Help! - -There is no shame in breaking the documentation build. Sphinx is incredibly fussy and even a single space in the wrong place will sometimes cause problems. A range of other issues can crop up and block you, whether you're technically experienced or less familiar with working with git, conda and Sphinx. - -Ask for help over on [GitHub discussions](https://github.com/quantumblacklabs/kedro/discussions). - -## Kedro documentation style guide - -This is the style guide we have used to create [documentation about Kedro](https://kedro.readthedocs.io/en/stable/). - -When you are writing documentation for your own project, you may find it useful to follow these rules. We also ask anyone kind enough to contribute to the Kedro documentation to follow our preferred style to maintain consistency and simplicity. - -We prefer to think of the following list as guidelines rather than rules because have made them lightweight to encourage you to contribute. - -Where it's not obvious what the style should be, it's worth consulting the [Microsoft style guide](https://docs.microsoft.com/en-gb/style-guide/welcome/). We also use the [INCITS Inclusive Terminology Guidelines](https://standards.incits.org/apps/group_public/download.php/131246/eb-2021-00288-001-INCITS-Inclusive-Terminology-Guidelines.pdf). - -```eval_rst -.. note:: If you are unsure of our preferred style, just do what you can in your documentation contribution, and note any queries. We can always iterate the submission with you when you create a pull request. -``` - -### Language -* Use UK English - -### Formatting -* Use Markdown formatting -* Mark code blocks with the appropriate language to enable syntax highlighting -* We use a `bash` lexer for all codeblocks that represent the terminal, and we don't include the prompt - -### Links -* Make hyperlink descriptions as descriptive as you can. This is a good description: - -```text -Learn how to [update the project pipeline](https://kedro.readthedocs.io/en/stable/03_tutorial/04_create_pipelines.html#update-the-project-pipeline) -``` - -This is less helpful: - -```text -Learn how to update the [project pipeline](https://kedro.readthedocs.io/en/stable/03_tutorial/04_create_pipelines.html#update-the-project-pipeline) -``` - -Don't write this: - -```text -To learn how to update the project pipeline, see [here](https://kedro.readthedocs.io/en/stable/03_tutorial/04_create_pipelines.html#update-the-project-pipeline) -``` - -### Capitalisation -* Only capitalise proper nouns e.g. names of technology products, other tools and services. See the [Kedro lexicon section](#kedro-lexicon) below for additional guidance. -* Don't capitalise cloud, internet, machine learning, advanced analytics etc. as per the [Microsoft style guide](https://docs.microsoft.com/en-us/style-guide/a-z-word-list-term-collections/term-collections/accessibility-terms). -* Follow sentence case, which capitalises only the first word of a title/subtitle. 
We prefer this: - -```text -## An introduction to pipelines -``` - -Don't write this: - -```text -## An Introduction to Pipelines -``` - -### Bullets -* Capitalise the first word. -* Don't put a period at the end unless it's a full sentence. Aim for consistency within a block of bullets if you have some bullets with full sentences and others without, you'll need to put a period at the end of each of them. Like in this set. -* Don't use numbered bullets except for a sequence of activities or where you have to refer back to one of them in the text (or a diagram). - -### Notes -We use callout sections formatted in `.rst` to bring attention to key points. For example: - -```eval_rst -.. note:: Do not pass "Go", do not collect £200. -``` - -* You will need to use restructured text formatting within the box. Aim to keep the formatting of the callout text plain, although you can include bold, italic, code and links. -* Keep the amount of text (and the number of callouts used) to a minimum. -* Prefer to use `note`, `warning` and `important` only, rather than a number of different colours/types of callout. - * Use `note` for notable information - * Use `warning` to indicate a potential `gotcha` - * Use `important` when highlighting a key point that cannot be ignored - -### Kedro lexicon -* Name of our product: Kedro and Kedro-Viz (note capitalisation). -* We are QuantumBlack Labs. -* Use journal and pipeline as these aren't proper nouns. Tend to lower case except if there is a precedent (see next bullet). -* Use Hooks (not hooks, except where it's a necessary part of your code example). We are taking our lead from React here, so capitalising despite it not seeming consistent with other rules. -* Use dataset (not data set, or data-set) for a generic dataset. - * Use capitalised DataSet when talking about a specific Kedro dataset class e.g. CSVDataSet. -* Use data catalog for a generic data catalog. - * Use Data Catalog to talk about the [Kedro Data Catalog](../05_data/01_data_catalog.md). - -### Style -* Keep your sentences short and easy to read. -* Do not plagiarise other authors. Link to their text and credit them. -* Avoid colloquialisms that may not translate to other regions/languages. -* Avoid technical terminology, particularly acronyms, that do not pass the "Google test", which means it is not possible to find their meaning from a simple Google search. -* Use imperatives to make instructions, or second person. - * For example "Complete the configuration steps" or "You should complete the configuration steps". Don't use the passive "The configuration steps should be completed" (see next bullet). -* Avoid passive tense. What is passive tense? If you can add "by zombies" to the end of any sentence, it is passive. - * For example: "The configuration steps should be completed." can also be read as: "The configuration should be completed BY ZOMBIES". - * Instead, you'd write this: "You should complete the configuration steps" or better still, "Complete the configuration steps". diff --git a/docs/source/15_api_docs/kedro.extras.logging.color_logger.ColorHandler.rst b/docs/source/15_api_docs/kedro.extras.logging.color_logger.ColorHandler.rst deleted file mode 100644 index 8a762bb2c7..0000000000 --- a/docs/source/15_api_docs/kedro.extras.logging.color_logger.ColorHandler.rst +++ /dev/null @@ -1,6 +0,0 @@ -kedro.extras.logging.color\_logger.ColorHandler -=============================================== - -.. currentmodule:: kedro.extras.logging.color_logger - -.. 
autoclass:: ColorHandler diff --git a/docs/source/15_api_docs/kedro.framework.cli.jupyter.SingleKernelSpecManager.rst b/docs/source/15_api_docs/kedro.framework.cli.jupyter.SingleKernelSpecManager.rst deleted file mode 100644 index dc5f6c74dc..0000000000 --- a/docs/source/15_api_docs/kedro.framework.cli.jupyter.SingleKernelSpecManager.rst +++ /dev/null @@ -1,6 +0,0 @@ -kedro.framework.cli.jupyter.SingleKernelSpecManager -=================================================== - -.. currentmodule:: kedro.framework.cli.jupyter - -.. autoclass:: SingleKernelSpecManager diff --git a/docs/source/15_api_docs/kedro.framework.session.store.ShelveStore.rst b/docs/source/15_api_docs/kedro.framework.session.store.ShelveStore.rst deleted file mode 100644 index e1149edf0f..0000000000 --- a/docs/source/15_api_docs/kedro.framework.session.store.ShelveStore.rst +++ /dev/null @@ -1,6 +0,0 @@ -kedro.framework.session.store.ShelveStore -========================================= - -.. currentmodule:: kedro.framework.session.store - -.. autoclass:: ShelveStore diff --git a/docs/source/15_api_docs/kedro.versioning.journal.JournalFileHandler.rst b/docs/source/15_api_docs/kedro.versioning.journal.JournalFileHandler.rst deleted file mode 100644 index 386ce5313a..0000000000 --- a/docs/source/15_api_docs/kedro.versioning.journal.JournalFileHandler.rst +++ /dev/null @@ -1,6 +0,0 @@ -kedro.versioning.journal.JournalFileHandler -=========================================== - -.. currentmodule:: kedro.versioning.journal - -.. autoclass:: JournalFileHandler diff --git a/docs/source/css/qb1-sphinx-rtd.css b/docs/source/_static/css/qb1-sphinx-rtd.css similarity index 92% rename from docs/source/css/qb1-sphinx-rtd.css rename to docs/source/_static/css/qb1-sphinx-rtd.css index ca72aabd8c..3f11d0ceee 100644 --- a/docs/source/css/qb1-sphinx-rtd.css +++ b/docs/source/_static/css/qb1-sphinx-rtd.css @@ -1,11 +1,11 @@ -@import url("https://fonts.googleapis.com/css?family=Titillium+Web:300,400,600"); +@import url("https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600&display=swap"); html, body.wy-body-for-nav { margin: 0; padding: 0; -webkit-font-smoothing: antialiased; -moz-osx-font-smoothing: grayscale; - font-family: 'Titillium Web', sans-serif; + font-family: 'Inter', sans-serif !important; font-weight: 400; line-height: 2rem; } @@ -19,7 +19,7 @@ html { } body.wy-body-for-nav { - font-size: 1.6rem; + font-size: 1.4289rem; background: rgb(250, 250, 250) !important; color: #1e1e21; position: relative; @@ -36,8 +36,8 @@ body.wy-body-for-nav { padding: 6px 12px; color: #666; background-color: #fff; - font-family: inherit; - font-size: 1.6rem; + font-family: 'Inter', sans-serif !important; + font-size: 1.4289rem; border: 1px #ccc solid; border-radius: 2px; transition: all ease 0.15s; @@ -116,7 +116,7 @@ body.wy-body-for-nav { .wy-body-for-nav .wy-menu-vertical p.caption { margin: 1.5em 0 0.2em; padding: 0; - font-size: 2rem; + font-size: 1.4289rem; color: #161616; font-weight: normal; text-transform: none; @@ -143,13 +143,17 @@ body.wy-body-for-nav { .wy-body-for-nav .wy-menu-vertical li a { display: block; margin: 0; - padding: 1.1rem 0 !important; - font-size: 1.6rem; + padding: 0.9rem 0 !important; + font-size: 1.4289rem; line-height: 1.2; color: #222; background: none !important; } +.wy-menu-vertical li.toctree-l1>a { + font-size: 1.4289rem; +} + .rst-content.style-external-links a.reference.external:after { color: inherit; opacity: 0.8; @@ -259,6 +263,7 @@ body.wy-body-for-nav { .wy-body-for-nav .wy-menu-vertical li.current 
a:hover { background: none; + font-weight: 600; } .wy-menu-vertical a span.toctree-expand { @@ -304,7 +309,7 @@ article ul li { } h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend { - font-family: 'Titillium Web', sans-serif; + font-family: 'Inter', sans-serif !important; margin: 1.2em 0 1em 0; line-height: 1.2em; color: #272c2e; @@ -368,7 +373,7 @@ footer span.commit .rst-content tt, } .wy-body-for-nav .rst-content a, .wy-body-for-nav footer a { - font-family: inherit; + font-family: 'Inter', sans-serif !important; font-size: inherit; color: #006ea7; text-decoration: none; @@ -419,6 +424,16 @@ footer span.commit .rst-content tt, color: black !important; } +.wy-side-scroll p.caption[role="heading"] span.caption-text { + font-weight: 700; +} + +.wy-nav-content h1, +.wy-nav-content p.caption[role="heading"] .caption-text { + font-size: 1.7857rem; + font-weight: 700; +} + @media screen and (max-width: 768px) { .wy-body-for-nav .wy-nav-side { @@ -475,6 +490,4 @@ footer span.commit .rst-content tt, display: block; content: ' '; width: 1px; - height: 60px; - margin-top: -80px; } diff --git a/docs/source/_static/css/theme-overrides.css b/docs/source/_static/css/theme-overrides.css new file mode 100644 index 0000000000..8ae384dd9a --- /dev/null +++ b/docs/source/_static/css/theme-overrides.css @@ -0,0 +1,28 @@ +/* override table width restrictions */ +@media screen and (min-width: 767px) { + .wy-table-responsive table td { + white-space: normal; + } + + .wy-table-responsive { + overflow: visible; + } +} + +/* override the table font-size and line-height */ +html.writer-html5 .rst-content .wy-table-responsive table.docutils th > p, +html.writer-html5 .rst-content .wy-table-responsive table.docutils td > p { + font-size: 1em; + line-height: 1em; +} + +img[alt^="mermaid-"] { + max-width: 600px; +} + +.rst-content .important { + background: #ffedcc; +} +.rst-content .important .admonition-title { + background-color: #f0b37e; +} diff --git a/docs/_templates/autosummary/base.rst b/docs/source/_templates/autosummary/base.rst similarity index 100% rename from docs/_templates/autosummary/base.rst rename to docs/source/_templates/autosummary/base.rst diff --git a/docs/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst similarity index 100% rename from docs/_templates/autosummary/class.rst rename to docs/source/_templates/autosummary/class.rst diff --git a/docs/_templates/autosummary/module.rst b/docs/source/_templates/autosummary/module.rst similarity index 100% rename from docs/_templates/autosummary/module.rst rename to docs/source/_templates/autosummary/module.rst diff --git a/docs/_templates/breadcrumbs.html b/docs/source/_templates/breadcrumbs.html similarity index 100% rename from docs/_templates/breadcrumbs.html rename to docs/source/_templates/breadcrumbs.html diff --git a/docs/_templates/layout.html b/docs/source/_templates/layout.html similarity index 100% rename from docs/_templates/layout.html rename to docs/source/_templates/layout.html diff --git a/docs/conf.py b/docs/source/conf.py similarity index 78% rename from docs/conf.py rename to docs/source/conf.py index 7d1513f810..b57e0daab9 100644 --- a/docs/conf.py +++ b/docs/source/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # Kedro documentation build configuration file, created by # sphinx-quickstart on Mon Dec 18 11:31:24 2017. @@ -12,27 +11,23 @@ # # All configuration values have a default; values that are commented out # serve to show the default. 
+from __future__ import annotations import importlib import os import re -import shutil import sys -from distutils.dir_util import copy_tree from inspect import getmembers, isclass, isfunction from pathlib import Path -from typing import List, Tuple from click import secho, style -from recommonmark.transform import AutoStructify from kedro import __version__ as release # -- Project information ----------------------------------------------------- -project = "Kedro" -copyright = "2021, QuantumBlack Visual Analytics Limited" -author = "QuantumBlack" +project = "kedro" +author = "kedro" # The short X.Y version. version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) @@ -52,17 +47,15 @@ "sphinx.ext.napoleon", "sphinx_autodoc_typehints", "sphinx.ext.doctest", - "sphinx.ext.todo", - "sphinx.ext.coverage", - "sphinx.ext.mathjax", "sphinx.ext.ifconfig", "sphinx.ext.viewcode", - "nbsphinx", - "recommonmark", "sphinx_copybutton", + "sphinxcontrib.mermaid", + "myst_parser", + "notfound.extension", ] -# enable autosummary plugin (table of contents for modules/classes/class +# enable autosummary plugin (table of contents for modules/classes/class # methods) autosummary_generate = True autosummary_generate_overwrite = False @@ -70,6 +63,7 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] +html_static_path = ["_static"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: @@ -84,7 +78,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -97,6 +91,7 @@ "kedro_docs_style_guide.md", ] + type_targets = { "py:class": ( "object", @@ -107,6 +102,9 @@ "tuple", "Any", "Dict", + "dict", + "list", + "set", "typing.Dict", "typing.Iterable", "typing.List", @@ -119,17 +117,18 @@ "kedro.io.core.DataSetError", "kedro.io.core.Version", "kedro.io.data_catalog.DataCatalog", - "kedro.io.transformers.AbstractTransformer", - "kedro.io.data_catalog_with_default.DataCatalogWithDefault", - "kedro.io.partitioned_data_set.PartitionedDataSet", + "kedro.io.memory_dataset.MemoryDataSet", + "kedro.io.partitioned_dataset.PartitionedDataSet", "kedro.pipeline.pipeline.Pipeline", "kedro.runner.runner.AbstractRunner", "kedro.runner.parallel_runner._SharedMemoryDataSet", - "kedro.versioning.journal.Journal", + "kedro.runner.parallel_runner._SharedMemoryDataset", "kedro.framework.context.context.KedroContext", "kedro.framework.startup.ProjectMetadata", "abc.ABC", + "Path", "pathlib.Path", + "PurePosixPath", "pathlib.PurePosixPath", "requests.auth.AuthBase", "google.oauth2.credentials.Credentials", @@ -138,6 +137,22 @@ "integer -- return number of occurrences of value", "integer -- return first index of value.", "kedro.extras.datasets.pandas.json_dataset.JSONDataSet", + "kedro_datasets.pandas.json_dataset.JSONDataSet", + "pluggy._manager.PluginManager", + "PluginManager", + "_DI", + "_DO", + # The statements below were added after subclassing UserDict in AbstractConfigLoader. + "None. Remove all items from D.", + "a shallow copy of D", + "a set-like object providing a view on D's items", + "a set-like object providing a view on D's keys", + "v, remove specified key and return the corresponding value.", + "None. 
Update D from dict/iterable E and F.", + "an object providing a view on D's values", + "(k, v), remove and return some (key, value) pair", + "D.get(k,d), also set D[k]=d if k not in D", + "None. Update D from mapping/iterable E and F.", ), "py:data": ( "typing.Any", @@ -159,9 +174,9 @@ "CircularDependencyError", "OutputNotUniqueError", "ConfirmNotUniqueError", + "ParserError", ), } - # https://stackoverflow.com/questions/61770698/sphinx-nit-picky-mode-but-only-for-links-i-explicitly-wrote nitpick_ignore = [(key, value) for key in type_targets for value in type_targets[key]] @@ -177,32 +192,40 @@ here = Path(__file__).parent.absolute() html_logo = str(here / "kedro_logo.svg") -# Theme options are theme-specific and customize the look and feel of a theme +# Theme options are theme-specific and customise the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = {"collapse_navigation": False, "style_external_links": True} +# html_extra_path used to define a path to robots.txt which is used by webcrawlers +# to ignore or allow certain links. +html_extra_path = [str(here / "robots.txt")] + +# Removes, from all docs, the copyright footer. +html_show_copyright = False + # some of these complain that the sections don't exist (which is not true), # too many requests, or forbidden URL linkcheck_ignore = [ - "https://www.datacamp.com/community/tutorials/docstrings-python", # "forbidden" url - "https://setuptools.readthedocs.io/en/latest/setuptools.html#dynamic-discovery-of-services-and-plugins", + "http://127.0.0.1:8787/status", # Dask's diagnostics dashboard + "https://datacamp.com/community/tutorials/docstrings-python", # "forbidden" url "https://github.com/argoproj/argo/blob/master/README.md#quickstart", "https://console.aws.amazon.com/batch/home#/jobs", - "https://github.com/EbookFoundation/free-programming-books/blob/master/books/free-programming-books.md#python", + "https://github.com/EbookFoundation/free-programming-books/blob/master/books/free-programming-books-langs.md#python", "https://github.com/jazzband/pip-tools#example-usage-for-pip-compile", "https://www.astronomer.io/docs/cloud/stable/get-started/quickstart#", - "https://github.com/quantumblacklabs/private-kedro/blob/master/kedro/templates/project/*", - "https://zenodo.org/record/4336685", - "https://zenodo.org/badge/latestdoi/182067506", "https://eternallybored.org/misc/wget/", "https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas", - "https://github.com/quantumblacklabs/kedro-starters/tree/master/standalone-datacatalog", # temporary until 0.18 "https://www.oracle.com/java/technologies/javase-downloads.html", # "forbidden" url - "https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71", - "https://medium.com/quantumblack/beyond-the-notebook-and-into-the-data-science-framework-revolution-a7fd364ab9c4", "https://www.java.com/en/download/help/download_options.html", # "403 Client Error: Forbidden for url" + # "anchor not found" but it's a valid selector for code examples + "https://docs.delta.io/latest/delta-update.html#language-python", + "https://github.com/kedro-org/kedro/blob/main/kedro/framework/project/default_logging.yml", + "https://github.com/kedro-org/kedro/blob/main/README.md#the-humans-behind-kedro", # "anchor not found" but is valid + "https://opensource.org/license/apache2-0-php/", + 
"https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password", + "https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/api/snowflake.snowpark.DataFrameWriter.saveAsTable.html" ] # retry before render a link broken (fix for "too many requests") @@ -211,7 +234,7 @@ html_context = { "display_github": True, - "github_url": "https://github.com/quantumblacklabs/kedro/tree/master/docs/source", + "github_url": "https://github.com/kedro-org/kedro/tree/main/docs/source", } # Add any paths that contain custom static files (such as style sheets) here, @@ -256,9 +279,7 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, "Kedro.tex", "Kedro Documentation", "QuantumBlack", "manual") -] +latex_documents = [(master_doc, "Kedro.tex", "Kedro Documentation", "Kedro", "manual")] # -- Options for manual page output ------------------------------------------ @@ -278,7 +299,7 @@ "Kedro Documentation", author, "Kedro", - "Kedro is a Data Science framework for QuantumBlack-led projects.", + "Kedro is a Python framework for creating reproducible, maintainable and modular data science code.", "Data-Science", ) ] @@ -288,23 +309,6 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False -# -- Extension configuration ------------------------------------------------- - -# nbsphinx_prolog = """ -# see here for prolog/epilog details: -# https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html -# """ - -nbsphinx_epilog = """ -.. note:: - - Found a bug, or didn't find what you were looking for? 🙏 `Please file a - ticket `_ -""" - -# -- NBconvert kedro config ------------------------------------------------- -nbsphinx_kedro_name = "kedro" - # -- Kedro specific configuration ----------------------------------------- KEDRO_MODULES = [ "kedro.io", @@ -312,9 +316,7 @@ "kedro.runner", "kedro.config", "kedro.extras.datasets", - "kedro.extras.logging", - "kedro.extras.decorators", - "kedro.extras.transformers", + "kedro_datasets", ] @@ -335,7 +337,7 @@ def remove_arrows_in_examples(lines): lines[i] = line.replace(">>>", "") -def autolink_replacements(what: str) -> List[Tuple[str, str, str]]: +def autolink_replacements(what: str) -> list[tuple[str, str, str]]: """ Create a list containing replacement tuples of the form: (``regex``, ``replacement``, ``obj``) for all classes and methods which are @@ -377,7 +379,7 @@ def autolink_replacements(what: str) -> List[Tuple[str, str, str]]: # first do plural only for classes replacements += [ ( - r"``{}``s".format(obj), + rf"``{obj}``s", f":{what}:`~{module}.{obj}`\\\\s", obj, ) @@ -386,8 +388,7 @@ def autolink_replacements(what: str) -> List[Tuple[str, str, str]]: # singular replacements += [ - (r"``{}``".format(obj), f":{what}:`~{module}.{obj}`", obj) - for obj in objects + (rf"``{obj}``", f":{what}:`~{module}.{obj}`", obj) for obj in objects ] # Look for recognised class names/function names which are NOT @@ -396,20 +397,19 @@ def autolink_replacements(what: str) -> List[Tuple[str, str, str]]: if what == "class": # first do plural only for classes suggestions += [ - (r"(?/settings.py`](../kedro_project_setup/settings.md): + +```python +from kedro.config import TemplatedConfigLoader # new import + +CONFIG_LOADER_CLASS = TemplatedConfigLoader +``` + +### Provide template values through globals +When using the `TemplatedConfigLoader` you 
can provide values in the configuration template through a `globals` file or dictionary. + +Let's assume the project contains a `conf/base/globals.yml` file with the following contents: + +```yaml +bucket_name: "my_s3_bucket" +key_prefix: "my/key/prefix/" + +datasets: + csv: "pandas.CSVDataSet" + spark: "spark.SparkDataSet" + +folders: + raw: "01_raw" + int: "02_intermediate" + pri: "03_primary" + fea: "04_feature" +``` + +To point your `TemplatedConfigLoader` to the globals file, add it to the the `CONFIG_LOADER_ARGS` variable in [`src//settings.py`](../kedro_project_setup/settings.md): + +```python +CONFIG_LOADER_ARGS = {"globals_pattern": "*globals.yml"} +``` + +Now the templating can be applied to the configuration. Here is an example of a templated `conf/base/catalog.yml` file: + +```yaml +raw_boat_data: + type: "${datasets.spark}" # nested paths into global dict are allowed + filepath: "s3a://${bucket_name}/${key_prefix}/${folders.raw}/boats.csv" + file_format: parquet + +raw_car_data: + type: "${datasets.csv}" + filepath: "s3://${bucket_name}/data/${key_prefix}/${folders.raw}/${filename|cars.csv}" # default to 'cars.csv' if the 'filename' key is not found in the global dict +``` + +Under the hood, `TemplatedConfigLoader` uses [`JMESPath` syntax](https://github.com/jmespath/jmespath.py) to extract elements from the globals dictionary. + + +Alternatively, you can declare which values to fill in the template through a dictionary. This dictionary could look like the following: + +```python +{ + "bucket_name": "another_bucket_name", + "non_string_key": 10, + "key_prefix": "my/key/prefix", + "datasets": {"csv": "pandas.CSVDataSet", "spark": "spark.SparkDataSet"}, + "folders": { + "raw": "01_raw", + "int": "02_intermediate", + "pri": "03_primary", + "fea": "04_feature", + }, +} +``` + +To point your `TemplatedConfigLoader` to the globals dictionary, add it to the `CONFIG_LOADER_ARGS` variable in [`src//settings.py`](../kedro_project_setup/settings.md): + +```python +CONFIG_LOADER_ARGS = { + "globals_dict": { + "bucket_name": "another_bucket_name", + "non_string_key": 10, + "key_prefix": "my/key/prefix", + "datasets": {"csv": "pandas.CSVDataSet", "spark": "spark.SparkDataSet"}, + "folders": { + "raw": "01_raw", + "int": "02_intermediate", + "pri": "03_primary", + "fea": "04_feature", + }, + } +} +``` + +If you specify both `globals_pattern` and `globals_dict` in `CONFIG_LOADER_ARGS`, the contents of the dictionary resulting from `globals_pattern` are merged with the `globals_dict` dictionary. In case of conflicts, the keys from the `globals_dict` dictionary take precedence. + + +## OmegaConfigLoader + +[OmegaConf](https://omegaconf.readthedocs.io/) is a Python library designed for configuration. It is a YAML-based hierarchical configuration system with support for merging configurations from multiple sources. + +From Kedro 0.18.5 you can use the [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader) which uses `OmegaConf` under the hood to load data. + +```{note} +`OmegaConfigLoader` is under active development. It was first available from Kedro 0.18.5 with additional features due in later releases. Let us know if you have any feedback about the `OmegaConfigLoader`. +``` + +`OmegaConfigLoader` can load `YAML` and `JSON` files. Acceptable file extensions are `.yml`, `.yaml`, and `.json`. By default, any configuration files used by the config loaders in Kedro are `.yml` files. 
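+
+As a quick illustration (a sketch rather than a required step, assuming the default `conf` source folder), you can also instantiate the loader directly and inspect what it resolves:
+
+```python
+from kedro.config import OmegaConfigLoader
+
+# Point the loader at the project's configuration source folder.
+conf_loader = OmegaConfigLoader(conf_source="conf")
+
+# Merges every matching parameters file in conf/base and conf/local,
+# whether it is a .yml, .yaml or .json file.
+parameters = conf_loader["parameters"]
+```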
+ +To use `OmegaConfigLoader` in your project, set the `CONFIG_LOADER_CLASS` constant in your [`src//settings.py`](../kedro_project_setup/settings.md): + +```python +from kedro.config import OmegaConfigLoader # new import + +CONFIG_LOADER_CLASS = OmegaConfigLoader +``` + +## Advanced Kedro configuration + +This section contains a set of guidance for advanced configuration requirements of standard Kedro projects: + +- [Advanced configuration](#advanced-configuration) + - [TemplatedConfigLoader](#templatedconfigloader) + - [Provide template values through globals](#provide-template-values-through-globals) + - [OmegaConfigLoader](#omegaconfigloader) + - [Advanced Kedro configuration](#advanced-kedro-configuration) + - [How to change which configuration files are loaded](#how-to-change-which-configuration-files-are-loaded) + - [How to ensure non default configuration files get loaded](#how-to-ensure-non-default-configuration-files-get-loaded) + - [How to bypass the configuration loading rules](#how-to-bypass-the-configuration-loading-rules) + - [How to use Jinja2 syntax in configuration](#how-to-use-jinja2-syntax-in-configuration) + - [How to do templating with the `OmegaConfigLoader`](#how-to-do-templating-with-the-omegaconfigloader) + - [How to use custom resolvers in the `OmegaConfigLoader`](#how-to-use-custom-resolvers-in-the-omegaconfigloader) + - [How to load credentials through environment variables](#how-to-load-credentials-through-environment-variables) + +### How to change which configuration files are loaded +If you want to change the patterns that the configuration loader uses to find the files to load you need to set the `CONFIG_LOADER_ARGS` variable in [`src//settings.py`](../kedro_project_setup/settings.md). +For example, if your `parameters` files are using a `params` naming convention instead of `parameters` (e.g. `params.yml`) you need to update `CONFIG_LOADER_ARGS` as follows: + +```python +CONFIG_LOADER_ARGS = { + "config_patterns": { + "parameters": ["params*", "params*/**", "**/params*"], + } +} +``` + +By changing this setting, the default behaviour for loading parameters will be replaced, while the other configuration patterns will remain in their default state. + +### How to ensure non default configuration files get loaded +You can add configuration patterns to match files other than `parameters`, `credentials`, and `catalog` by setting the `CONFIG_LOADER_ARGS` variable in [`src//settings.py`](../kedro_project_setup/settings.md). +For example, if you want to load Spark configuration files you need to update `CONFIG_LOADER_ARGS` as follows: + +```python +CONFIG_LOADER_ARGS = { + "config_patterns": { + "spark": ["spark*/"], + } +} +``` + +### How to bypass the configuration loading rules +You can bypass the configuration patterns and set configuration directly on the instance of a config loader class. You can bypass the default configuration (catalog, parameters and credentials) as well as additional configuration. + +```{code-block} python +:lineno-start: 10 +:emphasize-lines: 8 + +from kedro.config import ConfigLoader +from kedro.framework.project import settings + +conf_path = str(project_path / settings.CONF_SOURCE) +conf_loader = ConfigLoader(conf_source=conf_path) + +# Bypass configuration patterns by setting the key and values directly on the config loader instance. 
+conf_loader["catalog"] = {"catalog_config": "something_new"} +``` + +### How to use Jinja2 syntax in configuration +From version 0.17.0, `TemplatedConfigLoader` also supports the [Jinja2](https://palletsprojects.com/p/jinja/) template engine alongside the original template syntax. Below is an example of a `catalog.yml` file that uses both features: + +``` +{% for speed in ['fast', 'slow'] %} +{{ speed }}-trains: + type: MemoryDataSet + +{{ speed }}-cars: + type: pandas.CSVDataSet + filepath: s3://${bucket_name}/{{ speed }}-cars.csv + save_args: + index: true + +{% endfor %} +``` + +When parsing this configuration file, `TemplatedConfigLoader` will: + +1. Read the `catalog.yml` and compile it using Jinja2 +2. Use a YAML parser to parse the compiled config into a Python dictionary +3. Expand `${bucket_name}` in `filepath` using the `globals_pattern` and `globals_dict` arguments for the `TemplatedConfigLoader` instance, as in the previous examples + +The output Python dictionary will look as follows: + +```python +{ + "fast-trains": {"type": "MemoryDataSet"}, + "fast-cars": { + "type": "pandas.CSVDataSet", + "filepath": "s3://my_s3_bucket/fast-cars.csv", + "save_args": {"index": True}, + }, + "slow-trains": {"type": "MemoryDataSet"}, + "slow-cars": { + "type": "pandas.CSVDataSet", + "filepath": "s3://my_s3_bucket/slow-cars.csv", + "save_args": {"index": True}, + }, +} +``` + +```{warning} +Although Jinja2 is a very powerful and extremely flexible template engine, which comes with a wide range of features, we do not recommend using it to template your configuration unless absolutely necessary. The flexibility of dynamic configuration comes at a cost of significantly reduced readability and much higher maintenance overhead. We believe that, for the majority of analytics projects, dynamically compiled configuration does more harm than good. +``` + + +### How to do templating with the `OmegaConfigLoader` +#### Parameters +Templating or [variable interpolation](https://omegaconf.readthedocs.io/en/2.3_branch/usage.html#variable-interpolation), as it's called in `OmegaConf`, for parameters works out of the box if the template values are within the parameter files or the name of the file that contains the template values follows the same config pattern specified for parameters. +By default, the config pattern for parameters is: `["parameters*", "parameters*/**", "**/parameters*"]`. +Suppose you have one parameters file called `parameters.yml` containing parameters with `omegaconf` placeholders like this: + +```yaml +model_options: + test_size: ${data.size} + random_state: 3 +``` + +and a file containing the template values called `parameters_globals.yml`: +```yaml +data: + size: 0.2 +``` + +Since both of the file names (`parameters.yml` and `parameters_globals.yml`) match the config pattern for parameters, the `OmegaConfigLoader` will load the files and resolve the placeholders correctly. + +#### Catalog +From Kedro `0.18.10` templating also works for catalog files. To enable templating in the catalog you need to ensure that the template values are within the catalog files or the name of the file that contains the template values follows the same config pattern specified for catalogs. +By default, the config pattern for catalogs is: `["catalog*", "catalog*/**", "**/catalog*"]`. + +Additionally, any template values in the catalog need to start with an underscore `_`. This is because of how catalog entries are validated. 
Templated values will neither trigger a key duplication error nor appear in the resulting configuration dictionary. + +Suppose you have one catalog file called `catalog.yml` containing entries with `omegaconf` placeholders like this: + +```yaml +companies: + type: ${_pandas.type} + filepath: data/01_raw/companies.csv +``` + +and a file containing the template values called `catalog_globals.yml`: +```yaml +_pandas: + type: pandas.CSVDataSet +``` + +Since both of the file names (`catalog.yml` and `catalog_globals.yml`) match the config pattern for catalogs, the `OmegaConfigLoader` will load the files and resolve the placeholders correctly. + +#### Other configuration files +It's also possible to use variable interpolation in configuration files other than parameters and catalog, such as custom spark or mlflow configuration. This works in the same way as variable interpolation in parameter files. You can still use the underscore for the templated values if you want, but it's not mandatory like it is for catalog files. + +### How to use custom resolvers in the `OmegaConfigLoader` +`Omegaconf` provides functionality to [register custom resolvers](https://omegaconf.readthedocs.io/en/2.3_branch/usage.html#resolvers) for templated values. You can use these custom resolves within Kedro by extending the [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader) class. +The example below illustrates this: + +```python +from kedro.config import OmegaConfigLoader +from omegaconf import OmegaConf +from typing import Any, Dict + + +class CustomOmegaConfigLoader(OmegaConfigLoader): + def __init__( + self, + conf_source: str, + env: str = None, + runtime_params: Dict[str, Any] = None, + ): + super().__init__( + conf_source=conf_source, env=env, runtime_params=runtime_params + ) + + # Register a customer resolver that adds up numbers. + self.register_custom_resolver("add", lambda *numbers: sum(numbers)) + + @staticmethod + def register_custom_resolver(name, function): + """ + Helper method that checks if the resolver has already been registered and registers the + resolver if it's new. The check is needed, because omegaconf will throw an error + if a resolver with the same name is registered twice. + Alternatively, you can call `register_new_resolver()` with `replace=True`. + """ + if not OmegaConf.has_resolver(name): + OmegaConf.register_new_resolver(name, function) +``` + +In order to use this custom configuration loader, you will need to set it as the project configuration loader in `src//settings.py`: + +```python +from package_name.custom_configloader import CustomOmegaConfigLoader + +CONFIG_LOADER_CLASS = CustomOmegaConfigLoader +``` + +You can then use the custom "add" resolver in your `parameters.yml` as follows: + +```yaml +model_options: + test_size: ${add:1,2,3} + random_state: 3 +``` + +### How to load credentials through environment variables +The [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader) enables you to load credentials from environment variables. To achieve this you have to use the `OmegaConfigLoader` and the `omegaconf` [`oc.env` resolver](https://omegaconf.readthedocs.io/en/2.3_branch/custom_resolvers.html#oc-env). 
+To use the `OmegaConfigLoader` in your project, set the `CONFIG_LOADER_CLASS` constant in your [`src//settings.py`](../kedro_project_setup/settings.md): + +```python +from kedro.config import OmegaConfigLoader # new import + +CONFIG_LOADER_CLASS = OmegaConfigLoader +``` + +Now you can use the `oc.env` resolver to access credentials from environment variables in your `credentials.yml`, as demonstrated in the following example: + +```yaml +dev_s3: + client_kwargs: + aws_access_key_id: ${oc.env:AWS_ACCESS_KEY_ID} + aws_secret_access_key: ${oc.env:AWS_SECRET_ACCESS_KEY} +``` + +```{note} +Note that you can only use the resolver in `credentials.yml` and not in catalog or parameter files. This is because we do not encourage the usage of environment variables for anything other than credentials. +``` diff --git a/docs/source/configuration/configuration_basics.md b/docs/source/configuration/configuration_basics.md new file mode 100644 index 0000000000..56af25cf29 --- /dev/null +++ b/docs/source/configuration/configuration_basics.md @@ -0,0 +1,199 @@ +# Configuration + +This section contains detailed information about Kedro project configuration, which you can use to store settings for your project such as [parameters](./parameters.md), [credentials](./credentials.md), the [data catalog](../data/data_catalog.md), and [logging information](../logging/index.md). + +Kedro makes use of a configuration loader to load any project configuration files, and the available configuration loader classes are: + +* [`ConfigLoader`](/kedro.config.ConfigLoader) +* [`TemplatedConfigLoader`](/kedro.config.TemplatedConfigLoader) +* [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader). + +By default, Kedro uses the `ConfigLoader` and, in the following sections and examples, you can assume the default `ConfigLoader` is used, unless otherwise specified. The [advanced configuration documentation](./advanced_configuration.md) covers use of the [`TemplatedConfigLoader`](/kedro.config.TemplatedConfigLoader) and [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader) in more detail. + +## Configuration source +The configuration source folder is [`conf`](../get_started/kedro_concepts.md#conf) by default. We recommend that you keep all configuration files in the default `conf` folder of a Kedro project. + +## Configuration environments +A configuration environment is a way of organising your configuration settings for different stages of your data pipeline. For example, you might have different settings for development, testing, and production environments. + +By default, Kedro has a `base` and a `local` environment. + +### Base +In Kedro, the base configuration environment refers to the default configuration settings that are used as the foundation for all other configuration environments. + +The `base` folder contains the default settings that are used across your pipelines, unless they are overridden by a specific environment. + +```{warning} +Do not put private access credentials in the base configuration folder or any other configuration environment folder that is stored in version control. +``` + +### Local +The `local` configuration environment folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys). + +```{warning} +Do not add any local configuration to version control. 
+``` + +## Configuration loading +Kedro-specific configuration (e.g., `DataCatalog` configuration for I/O) is loaded using a configuration loader class, by default, this is [`ConfigLoader`](/kedro.config.ConfigLoader). +When you interact with Kedro through the command line, e.g. by running `kedro run`, Kedro loads all project configuration in the configuration source through this configuration loader. + +The loader recursively scans for configuration files inside the `conf` folder, firstly in `conf/base` (`base` being the default environment) and then in `conf/local` (`local` being the designated overriding environment). + +Kedro merges configuration information and returns a configuration dictionary according to the following rules: + +* If any two configuration files located inside the **same** environment path (such as `conf/base/`) contain the same top-level key, the configuration loader raises a `ValueError` indicating that duplicates are not allowed. +* If two configuration files contain the same top-level key but are in **different** environment paths (for example, one in `conf/base/`, another in `conf/local/`) then the last loaded path (`conf/local/`) takes precedence as the key value. `ConfigLoader.get` does not raise any errors but a `DEBUG` level log message is emitted with information on the overridden keys. + +When using any of the configuration loaders, any top-level keys that start with `_` are considered hidden (or reserved) and are ignored. Those keys will neither trigger a key duplication error nor appear in the resulting configuration dictionary. However, you can still use such keys, for example, as [YAML anchors and aliases](https://www.educative.io/blog/advanced-yaml-syntax-cheatsheet#anchors) +or [to enable templating in the catalog when using the `OmegaConfigLoader`](advanced_configuration.md#how-to-do-templating-with-the-omegaconfigloader). + +### Configuration file names +Configuration files will be matched according to file name and type rules. Suppose the config loader needs to fetch the catalog configuration, it will search according to the following rules: + +* *Either* of the following is true: + * filename starts with `catalog` + * file is located in a subfolder whose name is prefixed with `catalog` +* *And* file extension is one of the following: + * `yaml`, `yml`, `json`, `ini`, `pickle`, `xml` or `properties` for the `ConfigLoader` and `TemplatedConfigLoader` + * `yaml`, `yml`, or `json` for the `OmegaConfigLoader` + +### Configuration patterns +Under the hood, the Kedro configuration loader loads files based on regex patterns that specify the naming convention for configuration files. These patterns are specified by `config_patterns` in the configuration loader classes. + +By default those patterns are set as follows for the configuration of catalog, parameters and credentials: + +```python +config_patterns = { + "catalog": ["catalog*", "catalog*/**", "**/catalog*"], + "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + "credentials": ["credentials*", "credentials*/**", "**/credentials*"], +} +``` + +If you want to change change the way configuration is loaded, you can either [customise the config patterns](advanced_configuration.md#how-to-change-which-configuration-files-are-loaded) or [bypass the configuration loading](advanced_configuration.md#how-to-bypass-the-configuration-loading-rules) as described in the advanced configuration chapter. 
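+
+For illustration, these patterns can also be passed directly to a loader's constructor when you instantiate one in code. The sketch below is an assumption-laden example (the `"spark"` pattern and the `conf` path are not defaults):
+
+```python
+from kedro.config import ConfigLoader
+
+# Add a "spark" pattern on top of the default catalog/parameters/credentials patterns.
+conf_loader = ConfigLoader(
+    conf_source="conf",
+    config_patterns={"spark": ["spark*", "spark*/**", "**/spark*"]},
+)
+```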
+ +## How to use Kedro configuration + +This section contains a set of guidance for the most common configuration requirements of standard Kedro projects: + +- [Configuration](#configuration) + - [Configuration source](#configuration-source) + - [Configuration environments](#configuration-environments) + - [Base](#base) + - [Local](#local) + - [Configuration loading](#configuration-loading) + - [Configuration file names](#configuration-file-names) + - [Configuration patterns](#configuration-patterns) + - [How to use Kedro configuration](#how-to-use-kedro-configuration) + - [How to change the setting for a configuration source folder](#how-to-change-the-setting-for-a-configuration-source-folder) + - [How to change the configuration source folder at runtime](#how-to-change-the-configuration-source-folder-at-runtime) + - [How to read configuration from a compressed file](#how-to-read-configuration-from-a-compressed-file) + - [How to access configuration in code](#how-to-access-configuration-in-code) + - [How to specify additional configuration environments](#how-to-specify-additional-configuration-environments) + - [How to change the default overriding environment](#how-to-change-the-default-overriding-environment) + - [How to use only one configuration environment](#how-to-use-only-one-configuration-environment) + +### How to change the setting for a configuration source folder +To store the Kedro project configuration in a different folder to `conf`, change the configuration source by setting the `CONF_SOURCE` variable in [`src//settings.py`](../kedro_project_setup/settings.md) as follows: + +```python +CONF_SOURCE = "new_conf" +``` + +### How to change the configuration source folder at runtime +Specify a source folder for the configuration files at runtime using the [`kedro run` CLI command](../development/commands_reference.md#modifying-a-kedro-run) with the `--conf-source` flag as follows: + +```bash +kedro run --conf-source= +``` + +### How to read configuration from a compressed file +You can read configuration from a compressed file in `tar.gz` or `zip` format by using the [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader). + +How to reference a `tar.gz` file: + + ```bash +kedro run --conf-source=.tar.gz +``` + +How to reference a `zip` file: + +```bash +kedro run --conf-source=.zip +``` + +To compress your configuration you can use Kedro's `kedro package` command which builds the package into the `dist/` folder of your project, and creates a `.whl` file, as well as a `tar.gz` file containing the project configuration. The compressed version of the config files excludes any files inside your `local` folder. + +Alternatively you can run the command below to create a `tar.gz` file: + +```bash +tar --exclude=local/*.yml -czf .tar.gz --directory= +``` + +Or the following command to create a `zip` file: + +```bash +zip -x /local/** -r .zip +``` + +Note that for both the `tar.gz` and `zip` file the following structure is expected: + +```text + +├── base <-- the files inside may be different, but this is an example of a standard Kedro structure. +│ └── parameters.yml +│ └── catalog.yml +└── local <-- the top level local folder is required, but no files should be inside when distributed. +└── README.md <-- optional but included with the default Kedro conf structure. 
+``` + +### How to access configuration in code +To directly access configuration in code, for example to debug, you can do so as follows: + +```python +from kedro.config import ConfigLoader +from kedro.framework.project import settings + +# Instantiate a ConfigLoader with the location of your project configuration. +conf_path = str(project_path / settings.CONF_SOURCE) +conf_loader = ConfigLoader(conf_source=conf_path) + +# This line shows how to access the catalog configuration. You can access other configuration in the same way. +conf_catalog = conf_loader["catalog"] +``` + +### How to specify additional configuration environments +In addition to the two built-in `local` and `base` configuration environments, you can create your own. Your project loads `conf/base/` as the bottom-level configuration environment but allows you to overwrite it with any other environments that you create, such as `conf/server/` or `conf/test/`. To use additional configuration environments, run the following command: + +```bash +kedro run --env= +``` + +If no `env` option is specified, this will default to using the `local` environment to overwrite `conf/base`. + +If you set the `KEDRO_ENV` environment variable to the name of your environment, Kedro will load that environment for your `kedro run`, `kedro ipython`, `kedro jupyter notebook` and `kedro jupyter lab` sessions: + +```bash +export KEDRO_ENV= +``` + +```{note} +If you both specify the `KEDRO_ENV` environment variable and provide the `--env` argument to a CLI command, the CLI argument takes precedence. +``` + +### How to change the default overriding environment +By default, `local` is the overriding environment for `base`. To change the folder, customise the configuration loader argument settings in `src//settings.py` and set the `CONFIG_LOADER_ARGS` key to have a new `default_run_env` value. + +For example, if you want to override `base` with configuration in a custom environment called `prod`, you change the configuration loader arguments in `settings.py` as follows: + +```python +CONFIG_LOADER_ARGS = {"default_run_env": "prod"} +``` + +### How to use only one configuration environment +If, for some reason, your project does not have any other environments apart from `base`, i.e. no `local` environment to default to, you must customise the configuration loader argument settings in `src//settings.py` and set the `CONFIG_LOADER_ARGS` key to `"default_run_env": "base"` + +```python +CONFIG_LOADER_ARGS = {"default_run_env": "base"} +``` diff --git a/docs/source/configuration/credentials.md b/docs/source/configuration/credentials.md new file mode 100644 index 0000000000..620fb569ac --- /dev/null +++ b/docs/source/configuration/credentials.md @@ -0,0 +1,46 @@ +# Credentials + +For security reasons, we strongly recommend that you *do not* commit any credentials or other secrets to version control. +Kedro is set up so that, by default, if a file inside the `conf` folder (and its subfolders) contains `credentials` in its name, it will be ignored by git. + +Credentials configuration can be used on its own directly in code or [fed into the `DataCatalog`](../data/data_catalog.md#feeding-in-credentials). +If you would rather store your credentials in environment variables instead of a file, you can use the `OmegaConfigLoader` [to load credentials from environment variables](advanced_configuration.md#how-to-load-credentials-through-environment-variables) as described in the advanced configuration chapter. 
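+
+As a brief, illustrative sketch of the "fed into the `DataCatalog`" route (the dataset name, bucket and placeholder keys below are made up, not real values):
+
+```python
+from kedro.io import DataCatalog
+
+catalog_config = {
+    "cars": {
+        "type": "pandas.CSVDataSet",
+        "filepath": "s3://my-bucket/cars.csv",
+        "credentials": "dev_s3",  # refers to a key in the credentials dictionary below
+    }
+}
+
+credentials = {"dev_s3": {"key": "<aws-access-key-id>", "secret": "<aws-secret-access-key>"}}
+
+# The credentials are looked up by name and attached to the dataset when the catalog is built.
+catalog = DataCatalog.from_config(catalog_config, credentials=credentials)
+```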
+ +## How to load credentials in code +Credentials configuration can be loaded the same way as any other project configuration using any of the configuration loader classes: `ConfigLoader`, `TemplatedConfigLoader`, and `OmegaConfigLoader`. + +The following examples all use the default `ConfigLoader` class. + +```python +from kedro.config import ConfigLoader +from kedro.framework.project import settings + +conf_path = str(project_path / settings.CONF_SOURCE) +conf_loader = ConfigLoader(conf_source=conf_path) +credentials = conf_loader["credentials"] +``` + +This loads configuration files from `conf/base` and `conf/local` whose filenames start with `credentials`, or that are located inside a folder with a name that starts with `credentials`. + +Calling `conf_loader[key]` in the example above throws a `MissingConfigException` error if no configuration files match the given key. But if this is a valid workflow for your application, you can handle it as follows: + +```python +from kedro.config import ConfigLoader, MissingConfigException +from kedro.framework.project import settings + +conf_path = str(project_path / settings.CONF_SOURCE) +conf_loader = ConfigLoader(conf_source=conf_path) + +try: + credentials = conf_loader["credentials"] +except MissingConfigException: + credentials = {} +``` + +```{note} +The `kedro.framework.context.KedroContext` class uses the approach above to load project credentials. +``` + +## How to work with AWS credentials + +When you work with AWS credentials on datasets, you are not required to store AWS credentials in the project configuration files. Instead, you can specify them using environment variables `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and, optionally, `AWS_SESSION_TOKEN`. Please refer to the [official documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html) for more details. diff --git a/docs/source/configuration/index.md b/docs/source/configuration/index.md new file mode 100644 index 0000000000..3f554e1e91 --- /dev/null +++ b/docs/source/configuration/index.md @@ -0,0 +1,10 @@ +# Configuration + +```{toctree} +:maxdepth: 1 + +configuration_basics +credentials +parameters +advanced_configuration +``` diff --git a/docs/source/configuration/parameters.md b/docs/source/configuration/parameters.md new file mode 100644 index 0000000000..60de2d4da4 --- /dev/null +++ b/docs/source/configuration/parameters.md @@ -0,0 +1,143 @@ +# Parameters +Project parameters in Kedro are defined inside the `conf` folder in a file that has a filename starting with `parameters`, or are located inside a folder with name starting with `parameters`. +By default, in a new Kedro project, parameters are defined in the `parameters.yml` file, which is located in the project's `conf/base` directory. This file contains a dictionary of key-value pairs, where each key is a parameter name and each value is the corresponding parameter value. +These parameters can serve as input to nodes and are used when running the pipeline. By using parameters, you can make your Kedro pipelines more flexible and easier to configure, since you can change the behaviour of your nodes by modifying the `parameters.yml` file. + +## How to use parameters +If you have a group of parameters that determine the hyperparameters of your model, you can define them in a single location such as `conf/base/parameters.yml`. This way, you can keep all your modifications in a centralised location and avoid making changes across multiple parts of your code. 
+ +```yaml +step_size: 1 +learning_rate: 0.01 +``` + +You can now use the `params:` prefix to reference these parameters in the `node` definition: + +```python +def increase_volume(volume, step): + return volume + step + + +# in pipeline definition +node( + func=increase_volume, + inputs=["input_volume", "params:step_size"], + outputs="output_volume", +) +``` + +You can also group your parameters into nested structures and, using the same method above, load them by top-level key: + +```yaml +step_size: 1 +model_params: + learning_rate: 0.01 + test_data_ratio: 0.2 + number_of_train_iterations: 10000 +``` + +```python +def train_model(data, model): + lr = model["learning_rate"] + test_data_ratio = model["test_data_ratio"] + iterations = model["number_of_train_iterations"] + ... + + +# in pipeline definition +node( + func=train_model, + inputs=["input_data", "params:model_params"], + outputs="output_data", +) +``` + +Alternatively, you can also pass `parameters` to the node inputs and get access to the entire collection of values inside the node function. + +```python +def increase_volume(volume, params): + step = params["step_size"] + return volume + step + + +# in pipeline definition +node( + func=increase_volume, inputs=["input_volume", "parameters"], outputs="output_volume" +) +``` + +In both cases, under the hood parameters are added to the Data Catalog through the method `add_feed_dict()` in [`DataCatalog`](/kedro.io.DataCatalog), where they live as `MemoryDataSet`s. This method is also what the `KedroContext` class uses when instantiating the catalog. + +```{note} +You can use `add_feed_dict()` to inject any other entries into your `DataCatalog` as per your use case. +``` + +## How to load parameters in code + +Parameters project configuration can be loaded by any of the configuration loader classes: `ConfigLoader`, `TemplatedConfigLoader`, and `OmegaConfigLoader`. + +The following examples all make use of the default `ConfigLoader` class. + +```python +from kedro.config import ConfigLoader +from kedro.framework.project import settings + +conf_path = str(project_path / settings.CONF_SOURCE) +conf_loader = ConfigLoader(conf_source=conf_path) +parameters = conf_loader["parameters"] +``` + +This loads configuration files from any subdirectories in `conf` that have a filename starting with `parameters`, or are located inside a folder with name starting with `parameters`. + +Calling `conf_loader[key]` in the example above will throw a `MissingConfigException` error if no configuration files match the given key. But if this is a valid workflow for your application, you can handle it as follows: + +```python +from kedro.config import ConfigLoader, MissingConfigException +from kedro.framework.project import settings + +conf_path = str(project_path / settings.CONF_SOURCE) +conf_loader = ConfigLoader(conf_source=conf_path) + +try: + parameters = conf_loader["parameters"] +except MissingConfigException: + parameters = {} +``` + +```{note} +The `kedro.framework.context.KedroContext` class uses the approach above to load project parameters. +``` + +[Parameters can then be used on their own or fed in as function inputs](#how-to-use-parameters). + +## How to specify parameters at runtime + +Kedro also allows you to specify runtime parameters for the `kedro run` CLI command. Use the `--params` command line option and specify a comma-separated list of key-value pairs that will be added to [KedroContext](/kedro.framework.context.KedroContext) parameters and made available to pipeline nodes. 
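+
+As a hedged sketch of what this means in code (the parameter name is illustrative and the session is created from the project root), the extra values end up in `context.params`, merged on top of the file-based parameters:
+
+```python
+from pathlib import Path
+
+from kedro.framework.session import KedroSession
+from kedro.framework.startup import bootstrap_project
+
+# Roughly what happens for `kedro run --params=learning_rate:0.05`.
+bootstrap_project(Path.cwd())
+with KedroSession.create(extra_params={"learning_rate": 0.05}) as session:
+    context = session.load_context()
+    print(context.params["learning_rate"])  # the CLI value overrides parameters.yml
+```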
+ +Each key-value pair is split on the first colon or equals sign. The following examples are both valid commands: + +```bash +kedro run --params=param_key1:value1,param_key2:2.0 # this will add {"param_key1": "value1", "param_key2": 2} to parameters dictionary +``` +```bash +kedro run --params=param_key1=value1,param_key2=2.0 +``` +Values provided in the CLI take precedence and overwrite parameters specified in configuration files. + +* Parameter keys are _always_ treated as strings. +* Parameter values are converted to a float or an integer number if the corresponding conversion succeeds; otherwise, they are also treated as string. + +If any extra parameter key and/or value contains spaces, wrap the whole option contents in quotes: + +```bash +kedro run --params="key1=value with spaces,key2=value" +``` + +Since key-value pairs are split on the first colon or equals sign, values can contain colons/equals signs, but keys cannot. These are valid CLI commands: + +```bash +kedro run --params=endpoint_url:https://endpoint.example.com +``` +```bash +kedro run --params=endpoint_url=https://endpoint.example.com +``` diff --git a/docs/source/14_contribution/03_backwards_compatibility.md b/docs/source/contribution/backwards_compatibility.md similarity index 66% rename from docs/source/14_contribution/03_backwards_compatibility.md rename to docs/source/contribution/backwards_compatibility.md index 11f3ae897f..142cf929d2 100644 --- a/docs/source/14_contribution/03_backwards_compatibility.md +++ b/docs/source/contribution/backwards_compatibility.md @@ -8,18 +8,18 @@ A breaking change is any change that modifies Kedro's public APIs. Examples incl Your change is **not** considered a breaking change, and so is backwards compatible, **if a user can upgrade their Kedro version and include your change without anything breaking in their project**. -### When should I make a breaking change? +## When should I make a breaking change? We aim to minimise the number of breaking changes to keep Kedro software stable and reduce the overhead for users as they migrate their projects. However, there are cases where a breaking change brings considerable value or increases the maintainability of the codebase. In these cases, breaking backwards compatibility can make sense. -Before you contribute a breaking change, you should create a [Github Issue](https://github.com/quantumblacklabs/kedro/issues) that describes the change and justifies the value gained by breaking backwards compatibility. +Before you contribute a breaking change, you should create a [GitHub Issue](https://github.com/kedro-org/kedro/issues) that describes the change and justifies the value gained by breaking backwards compatibility. ## The Kedro release model -All non-breaking changes go into `master`, from which a minor release can be deployed at any time. +All non-breaking changes go into `main`, from which a minor release can be deployed at any time. -All breaking changes go into `develop`, from which a major release can be deployed at any time. The `develop` branch contains all commits from the `master` branch, but the `master` branch does not contain all the commits from `develop` until the next major release. +All breaking changes go into `develop`, from which a major release can be deployed at any time. The `develop` branch contains all commits from the `main` branch, but the `main` branch does not contain all the commits from `develop` until the next major release. 
![Kedro Gitflow Diagram](../meta/images/kedro_gitflow.svg) -Please check the Q&A on [GitHub discussions](https://github.com/quantumblacklabs/kedro/discussions) and ask any new questions about the development process there too! +Got a question about the development process? Ask the community on [Slack](https://slack.kedro.org) if you need to! diff --git a/docs/source/contribution/developer_contributor_guidelines.md b/docs/source/contribution/developer_contributor_guidelines.md new file mode 100644 index 0000000000..787a838d90 --- /dev/null +++ b/docs/source/contribution/developer_contributor_guidelines.md @@ -0,0 +1,197 @@ +# Guidelines for contributing developers + +This page explains the principles and development process that we ask contributing developers to follow. + +**Any contributions you make will be under the [Apache 2.0 Software License](https://github.com/kedro-org/kedro/blob/main/LICENSE.md).** + +In short, when you submit code changes, your submissions are understood to be under the same the [Apache 2.0 License](https://github.com/kedro-org/kedro/blob/main/LICENSE.md) that covers the Kedro project. You should have permission to share the submitted code. + +```{note} +You don't need to contribute code to help the Kedro project. See our list of other ways [you can contribute to Kedro](https://github.com/kedro-org/kedro/blob/main/CONTRIBUTING.md). +``` + +## Introduction + +This guide is a practical description of: + +* How to set up your development environment to contribute to Kedro. +* How to prepare a pull request against the Kedro repository. + + +## Before you start: development set up + +To work on the Kedro codebase, you will need to be set up with Git, and Make. + +```{note} +If your development environment is Windows, you can use the `win_setup_conda` and `win_setup_env` commands from [Circle CI configuration](https://github.com/kedro-org/kedro/blob/main/.circleci/config.yml) to guide you in the correct way to do this. +``` + +You will also need to create and activate virtual environment. If this is unfamiliar to you, read through our [pre-requisites documentation](../get_started/install.md#installation-prerequisites). + +Next, you'll need to fork the [Kedro source code from the GitHub repository](https://github.com/kedro-org/kedro): + +* Fork the project by clicking **Fork** in the top-right corner of the [Kedro GitHub repository](https://github.com/kedro-org/kedro) +* Choose your target account + +If you need further guidance, consult the [GitHub documentation about forking a repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo#forking-a-repository). + +You are almost ready to go. In your terminal, navigate to the folder into which you forked the Kedro code. + +Run these commands to install everything you need to work with Kedro: + +``` +make install-test-requirements +make install-pre-commit +``` + +Once the above commands have executed successfully, do a sanity check to ensure that `kedro` works in your environment: + +``` +make test +``` + +```{note} +If the tests in `tests/extras/datasets/spark` are failing, and you are not planning to work on [Spark](https://spark.apache.org) related features, then you can run a reduced test suite that excludes them. Do this by executing `make test-no-spark`. +``` + +## Get started: areas of contribution + +Once you are ready to contribute, a good place to start is to take a look at the `good first issues` and `help wanted issues` on [GitHub](https://github.com/kedro-org/kedro/issues). 
+ +We focus on three areas for contribution: `core`, `extras` and `plugin`: + +- `core` refers to the primary Kedro library. Read the [`core` contribution process](#core-contribution-process) for details. +- `extras` refers to features that could be added to `core` that do not introduce too many dependencies or require new Kedro CLI commands to be created. Read the [`extras` contribution process](#extras-contribution-process) for more information. +- [`plugin`](../extend_kedro/plugins.md) refers to new functionality that requires a Kedro CLI command e.g. adding in Airflow functionality and [adding a new dataset](../extend_kedro/custom_datasets.md) to the `kedro-datasets` package. The [`plugin` development documentation](../extend_kedro/plugins.md) contains guidance on how to design and develop a Kedro `plugin`. + + +### `core` contribution process + +Typically, we only accept small contributions to the `core` Kedro library, but we accept new features as plugins or additions to the [`extras`](https://github.com/kedro-org/kedro/tree/main/kedro/extras) module. + +To contribute: + +1. Create a feature branch on your forked repository and push all your local changes to that feature branch. +2. Is your change [non-breaking and backwards-compatible](./backwards_compatibility.md)? Your feature branch should branch off from: +
+
+   1. `main` if you intend for it to be a non-breaking, backwards-compatible change.
+   2. `develop` if you intend for it to be a breaking change.
+
+3. Before you submit a pull request (PR), please ensure that unit tests, end-to-end (E2E) tests and linters are passing for your changes by running `make test`, `make e2e-tests` and `make lint` locally; see the [development set up](#before-you-start-development-set-up) section above. +4. Open a PR: +
+
+   1. For backwards compatible changes, open a PR against the `kedro-org:main` branch from your feature branch.
+   2. For changes that are NOT backwards compatible, open a PR against the `kedro-org:develop` branch from your feature branch.
+
+ +5. Await reviewer comments. +6. Update the PR according to the reviewer's comments. +7. Your PR will be merged by the Kedro team once all the comments are addressed. + +```{note} +We will work with you to complete your contribution, but we reserve the right to take over abandoned PRs. +``` + +### `extras` contribution process + +You can add new work to `extras` if you do not need to create a new Kedro CLI command: + +1. Create an [issue](https://github.com/kedro-org/kedro/issues) describing your contribution. +2. Work in [`extras`](https://github.com/kedro-org/kedro/tree/main/kedro/extras) and create a feature branch on your forked repository and push all your local changes to that feature branch. +3. Before you submit a pull request, please ensure that unit tests, end-to-end (E2E) tests and linters are passing for your changes by running `make test`,`make e2e-tests` and `make lint` locally, have a look at the section [development set up](#before-you-start-development-set-up) section above. +4. Include a `README.md` with instructions on how to use your contribution. +5. Is your change [non-breaking and backwards-compatible](./backwards_compatibility.md)? +
+
+   1. For backwards compatible changes, open a PR against the `kedro-org:main` branch from your feature branch.
+   2. For changes that are NOT backwards compatible, open a PR against the `kedro-org:develop` branch from your feature branch.
+
+ +6. Reference your issue in the PR description (e.g., `Resolves #`). +7. Await review comments, then update the PR according to the reviewer's comments. +8. Your PR will be merged by the Kedro team once all the comments are addressed. + +```{note} +We will work with you to complete your contribution, but we reserve the right to take over abandoned PRs. +``` + +## Create a pull request + +[Give your pull request a descriptive title](#pull-request-title-conventions). Before you submit it, consider the following: + +* You should aim for cross-platform compatibility on Windows, macOS and Linux +* We use [Semantic Versioning](https://semver.org/) for versioning +* We have designed our code to be compatible with Python 3.7 onwards and our style guidelines are (in cascading order): + * [PEP 8 conventions](https://www.python.org/dev/peps/pep-0008/) for all Python code + * [Google docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) for code comments + * [PEP 484 type hints](https://www.python.org/dev/peps/pep-0484/) for all user-facing functions/class methods; e.g. + + ```python + def count_truthy(elements: List[Any]) -> int: + return sum(1 for elem in elements if element) + ``` + +Ensure that your PR builds cleanly before you submit it, by running the CI/CD checks locally, as follows: +* `make lint`: PEP-8 Standards (`ruff`, `black`) +* `make test`: unit tests, 100% coverage (`pytest`, `pytest-cov`) +* `make e2e-tests`: end-to-end tests (`behave`) + +```{note} +If Spark/PySpark/Hive tests for datasets are failing it might be due to the lack of Java>8 support from Spark. You can try using `export JAVA_HOME=$(/usr/libexec/java_home -v 1.8)` which [works under macOS or other workarounds](https://stackoverflow.com/questions/53583199/spark-error-unsupported-class-file-major-version). +``` + +```{note} +We place [conftest.py](https://docs.pytest.org/en/latest/reference/fixtures.html) files in some test directories to make fixtures reusable by any tests in that directory. If you need to see which test fixtures are available and where they come from, you can issue the following command `pytest --fixtures path/to/the/test/location.py`. +``` + +### Pull request title conventions + +The Kedro repository requires that you [squash and merge your pull request commits](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits), and, in most cases, the [merge message for a squash merge](https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/about-pull-request-merges#merge-message-for-a-squash-merge) then defaults to the pull request title. + +For clarity, your pull request title should be descriptive, and we ask you to follow some guidelines suggested by [Chris Beams](https://github.com/cbeams) in his post [How to Write a Git Commit Message](https://chris.beams.io/posts/git-commit/#seven-rules). 
In particular, for your pull request title, we suggest that you: + +* [Limit the length to 50 characters](https://chris.beams.io/posts/git-commit/#limit-50) +* [Capitalise the first letter of the first word](https://chris.beams.io/posts/git-commit/#capitalize) +* [Omit the period at the end](https://chris.beams.io/posts/git-commit/#end) +* [Use the imperative tense](https://chris.beams.io/posts/git-commit/#imperative) + +### Hints on `pre-commit` usage +[`pre-commit`](https://pre-commit.com) hooks run checks automatically on all the changed files on each commit but can be skipped with the `--no-verify` or `-n` flag: + +```bash +git commit --no-verify <...> +``` + +All checks will run during CI build, so skipping checks on commit will not allow you to merge your code with failing checks. You can uninstall the `pre-commit` hooks by running: + +```bash +make uninstall-pre-commit +``` +`pre-commit` will still be used by `make lint`, but will not install the git hooks. + +### Developer Certificate of Origin +We require that all contributions comply with the [Developer Certificate of Origin (DCO)](https://developercertificate.org/). This certifies that the contributor wrote or otherwise has the right to submit their contribution. + +All commits must be signed off by including a `Signed-off-by` line in the commit message: +``` +This is my commit message + +Signed-off-by: Random J Developer +``` + +The sign-off can be added automatically to your commit message using the `-s` option: +```bash +git commit -s -m "This is my commit message" +``` + +To avoid needing to remember the `-s` flag on every commit, you might like to set up a [git alias](https://git-scm.com/book/en/v2/Git-Basics-Git-Aliases) for `git commit -s`. Alternatively, run `make sign-off` to setup a [`commit-msg` Git hook](https://git-scm.com/docs/githooks#_commit_msg) that automatically signs off all commits (including merge commits) you make while working on the Kedro repository. + +If your PR is blocked due to unsigned commits, then you must follow the instructions under "Rebase the branch" on the GitHub Checks page for your PR. This will retroactively add the sign-off to all unsigned commits and allow the DCO check to pass. + +## Need help? + +Working on your first pull request? You can learn how from these resources: + +* [First timers only](https://www.firsttimersonly.com/) +* [How to contribute to an open source project on GitHub](https://egghead.io/courses/how-to-contribute-to-an-open-source-project-on-github) + +Previous Q&A on [GitHub discussions](https://github.com/kedro-org/kedro/discussions) and the [searchable archive of Slack discussions](https://www.linen.dev/s/kedro). You can ask new questions about the development process on [Slack](https://slack.kedro.org) too! diff --git a/docs/source/contribution/development_for_databricks.md b/docs/source/contribution/development_for_databricks.md new file mode 100644 index 0000000000..571b2e65ad --- /dev/null +++ b/docs/source/contribution/development_for_databricks.md @@ -0,0 +1,95 @@ +# Contribute changes to Kedro that are tested on Databricks + +Many Kedro users deploy their projects to [Databricks](https://www.databricks.com/), a cloud-based platform for data engineering and data science. We encourage contributions to extend and improve the experience for Kedro users on Databricks; this guide explains how to efficiently test your locally modified version of Kedro on Databricks as part of a build-and-test development cycle. 
+ +## How to deploy a development version of Kedro to Databricks + +```{note} +This page is for **contributors** developing changes to Kedro that need to test them on Databricks. If you are a Kedro user working on an individual or team project and need more information about workflows, consult the [documentation pages for developing a Kedro project on Databricks](../deployment/databricks/index.md). +``` + +## Prerequisites + +You will need the following to follow this guide: + +* Python **version >=3.8**. +* An activated Python virtual environment into which you have installed the [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html) with [authentication for your workspace](https://docs.databricks.com/dev-tools/cli/index.html#set-up-the-cli). +* Access to a Databricks workspace with an [existing cluster](https://docs.databricks.com/clusters/create-cluster.html). +* [GNU `make`](https://www.gnu.org/software/make/). +* [`git`](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). +* A local clone of the [Kedro git repository](https://github.com/kedro-org/kedro). + +## How to install a build of Kedro onto Databricks + +The development workflow for Kedro on Databricks is similar to the one for Kedro in general when you develop and test your changes locally. The main difference comes when manually testing your changes on Databricks, since you will need to build and deploy the wheel file to Databricks to test it on a cluster. + +To make developing Kedro for Databricks easier, Kedro comes with a `Makefile` target named `databricks-build` that automates the process of building a wheel file and installing it on your Databricks cluster to save development time. + +### How to set up the Databricks CLI to test a Kedro build + +Before you use `make databricks-build`, you must [set up the Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html#set-up-the-cli). + +Next, create an environment variable with the ID of the cluster you are using to test your Kedro build. You can find the ID by executing the Databricks CLI command `databricks clusters list` and looking for the Cluster ID to the left of the name of your chosen cluster, for instance: + +```bash +$ databricks clusters list +1234-567890-abcd1234 General Cluster TERMINATED +0987-654321-9876xywz Kedro Test Cluster TERMINATED +``` + +In this case, the cluster ID of `Kedro Test Cluster` is `0987-654321-9876xywz`. + +Once you have determined the cluster ID, you must export it to an environment variable named `DATABRICKS_CLUSTER_ID`: + +```bash +# Linux or macOS +export DATABRICKS_CLUSTER_ID= + +# Windows (PowerShell) +$Env:DATABRICKS_CLUSTER_ID = '' +``` + +### How to use `make databricks-build` to test your Kedro build + +With the setup complete, you can use `make databricks-build`. In your terminal, navigate to the parent directory of your Kedro development repository and run: + +```bash +make databricks-build +``` + +You should see a stream of messages being written to your terminal. Behind the scenes, `databricks-build` does the following: + +1. Builds a wheel file of your modified version of Kedro. +2. Uninstalls any library on your Databricks cluster with the same wheel file name. +3. Uploads your updated wheel file to DBFS (Databricks File System). +4. Queues your updated wheel file for installation. +5. Restarts your cluster to apply the changes. + +Note that your cluster will be unavailable while it restarts. 
You can poll the status of the cluster using the Databricks CLI: + +```bash +# Linux or macOS +databricks clusters get --cluster-id $DATABRICKS_CLUSTER_ID | grep state + +# Windows (PowerShell) +databricks clusters get --cluster-id $Env:DATABRICKS_CLUSTER_ID | Select-String state +``` + +Once the cluster has restarted, you should verify that your modified version of Kedro has been correctly installed. Run `databricks libraries list --cluster-id `. If installation was successful, you should see the following output: + +```bash +{ + "cluster_id": "", + "library_statuses": [ + { + "library": { + "whl": "dbfs:/tmp/kedro-builds/kedro--py3-none-any.whl" + }, + "status": "INSTALLED", + "is_library_for_all_clusters": false + } + ] +} +``` + +Any runs of a Kedro project on this cluster will now reflect your latest local changes to Kedro. You can now test your changes to Kedro by using your cluster to run a Kedro project. diff --git a/docs/source/contribution/documentation_contributor_guidelines.md b/docs/source/contribution/documentation_contributor_guidelines.md new file mode 100644 index 0000000000..85a4746a27 --- /dev/null +++ b/docs/source/contribution/documentation_contributor_guidelines.md @@ -0,0 +1,62 @@ +# Contribute to the Kedro documentation + +You are welcome to contribute to the Kedro documentation if you find something incorrect or missing, or have other improvement suggestions. + +You can tell us what we should change or make a PR to change it yourself. + +Before you contribute any documentation changes, please read the [Kedro documentation style guidelines](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide) on the GitHub wiki. + +## How do I rebuild the documentation after I make changes to it? + +Our documentation is written in Markdown and built by Sphinx, coordinated by a [build script](https://github.com/kedro-org/kedro/blob/main/docs/build-docs.sh). + +If you make changes to the Markdown for the Kedro documentation, you can rebuild it within a Unix-like environment (with `pandoc` installed). + +If you are a Windows user, you can still contribute to the documentation, but you cannot rebuild it. This is fine! As long as you have made an effort to verify that your Markdown is rendering correctly, and you have followed our basic guidelines, we will be happy to take your final draft as a pull request and rebuild it for you. + +The following instructions are specifically for people working with documentation who may not already have a development setup. If you are comfortable with virtual environments, cloning and branching from a git repo and using `make`, you don't need them and can probably jump to the section called [Build the documentation](#build-the-documentation). + +### Set up to build Kedro documentation + +Follow the setup instructions in the [developer contributor guide](./developer_contributor_guidelines.md#before-you-start-development-set-up) +to fork the Kedro repo, create and activate a Python virtual environment and install the dependencies necessary to build the documentation. + + +### Build the documentation + +**macOS users** can use `make` commands to build the documentation: + +```bash +make build-docs +``` + +The build will take a few minutes to finish, and a successful result is a set of HTML documentation in `docs/build/html`, which you can review by navigating to the following file and opening it: `docs/build/html/index.html`. 
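If you prefer to open the built documentation straight from the command line, here is a minimal, optional sketch (not part of the Kedro build tooling) that launches it in your default browser. It assumes you run it from the root of your Kedro clone and that the build used the default `docs/build/html` output location mentioned above.

```python
# Optional convenience: open the locally built Kedro docs in the default browser.
# Run from the root of your Kedro clone after `make build-docs`.
import webbrowser
from pathlib import Path

index_page = Path("docs/build/html/index.html").resolve()
webbrowser.open(index_page.as_uri())
```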
+ + +## Extend Kedro documentation + +### Add new pages + +All Kedro documentation is collated and built from a single index file, [`index.rst`](https://github.com/kedro-org/kedro/blob/main/docs/source/index.rst) found in the `docs/source` folder. + +If you add extra pages of documentation, you should always include them within `index.rst` file to include them in the table of contents and let Sphinx know to build them alongside the rest of the documentation. + +### Move or remove pages + +To move or remove a page of documentation, first locate it in the repo, and also locate where it is specified in the `index.rst` or `.rst` for the relevant section within the table of contents. + +### Create a pull request + +You need to submit any changes to the documentation via a branch. + +[Find out more about the process of submitting a PR to the Kedro project](./developer_contributor_guidelines.md). + +### Help! + +There is no shame in breaking the documentation build. Sphinx is incredibly fussy and even a single space in the wrong place will sometimes cause problems. A range of other issues can crop up and block you, whether you're technically experienced or less familiar with working with git, conda and Sphinx. + +Ask for help over on [GitHub discussions](https://github.com/kedro-org/kedro/discussions). + +## Kedro documentation style guide + +There is a lightweight [documentation style guide](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide) on Kedro's GitHub wiki. diff --git a/docs/source/contribution/index.md b/docs/source/contribution/index.md new file mode 100644 index 0000000000..40e660bcbd --- /dev/null +++ b/docs/source/contribution/index.md @@ -0,0 +1,23 @@ +# Contribute to Kedro + +We welcome any and all contributions to Kedro, at whatever level you can manage. For example, you could: + +- Join the community on [Slack](https://slack.kedro.org) +- Review Kedro's [GitHub issues](https://github.com/kedro-org/kedro/issues) or raise your own issue to report a bug or feature request +- Start a conversation about the Kedro project on [GitHub discussions](https://github.com/kedro-org/kedro/discussions) +- Make a pull request on the [`awesome-kedro` GitHub repo](https://github.com/kedro-org/awesome-kedro) to update the curated list of Kedro community content +- Report a bug or propose a new feature on [GitHub issues](https://github.com/kedro-org/kedro/issues) +- [Review other contributors' PRs](https://github.com/kedro-org/kedro/pulls) +- [Contribute code](./developer_contributor_guidelines.md), for example to fix a bug or add a feature +- [Contribute to the documentation](documentation_contributor_guidelines.md) + + +```{toctree} +:hidden: + +developer_contributor_guidelines +backwards_compatibility +documentation_contributor_guidelines +technical_steering_committee +development_for_databricks +``` diff --git a/docs/source/contribution/technical_steering_committee.md b/docs/source/contribution/technical_steering_committee.md new file mode 100644 index 0000000000..1760cfd6f6 --- /dev/null +++ b/docs/source/contribution/technical_steering_committee.md @@ -0,0 +1,127 @@ +# Join the Technical Steering Committee + +In the Kedro project's latest iteration it is an incubating project within [LF AI & Data](https://lfaidata.foundation/). + +The term "Technical Steering Committee" (TSC) describes the group of Kedro maintainers. We list [Kedro's current and past maintainers](#kedro-maintainers) on this page. 
+ +The TSC is responsible for the project's future development; you can read about our duties in our [Technical Charter](https://github.com/kedro-org/kedro/blob/main/kedro_technical_charter.pdf). We are happy to accept new members into the TSC to fuel Kedro's continued development. + +On this page we describe: + +- [Responsibilities of a maintainer](#responsibilities-of-a-maintainer) +- [Requirements to become a maintainer](#requirements-to-become-a-maintainer) +- [Kedro maintainers](#kedro-maintainers) +- [Application process](#application-process) +- [Voting process](#voting-process) + +## Responsibilities of a maintainer + +### Product development + + - Be available for at least one full day per week to help with product development + - Attend community meetings to discuss the project plans and roadmap + - Be proactive about project maintenance including security, updates, CI/CD, builds and infrastructure + - Prioritise the work following the product roadmap to move the project forward + +### Community management + +- Ensure that ongoing pull requests are moving forward at the right pace or closing them +- Guide the community to use our various communication channels: + + - [GitHub issues](https://github.com/kedro-org/kedro/issues) for feature requests and bug reports + - [GitHub discussions](https://github.com/kedro-org/kedro/discussions) to discuss the future of the Kedro project + - [Slack](https://slack.kedro.org) for questions and to support other users + +## Requirements to become a maintainer + +Just contributing does not make you a maintainer; you need to demonstrate commitment to Kedro's long-term success by +working with existing maintainers for a period of time. + +We look for commitment markers who can do the following: + +- Write high-quality code and collaborate with the team and community +- Understand the project's code base and internals +- Make pull requests from our backlog or roadmap; maintainers need to work towards a common goal +- Learn how the team works, including processes for testing, quality standards and code review +- Show evidence of already having started pull requests and code reviews under the guidance of maintainers; including asking + for help where needed +- Show excitement about the future of Kedro +- Build a collaborative relationship with the existing team + +## Kedro maintainers + + + +Kedro was originally designed by [Aris Valtazanos](https://github.com/arisvqb) and [Nikolaos Tsaousis](https://github.com/tsanikgr) at QuantumBlack to solve challenges they faced in their project work. Their work was later turned into an internal product by [Peteris Erins](https://github.com/Pet3ris), [Ivan Danov](https://github.com/idanov), [Nikolaos Kaltsas](https://github.com/nikos-kal), [Meisam Emamjome](https://github.com/misamae) and [Nikolaos Tsaousis](https://github.com/tsanikgr). 
+ + +Currently, the core Kedro team consists of: + +[Ahdra Merali](https://github.com/AhdraMeraliQB), +[Andrew Mackay](https://github.com/Mackay031), +[Ankita Katiyar](https://github.com/ankatiyar), +[Antony Milne](https://github.com/antonymilne), +[Deepyaman Datta](https://github.com/deepyaman), +[Dmitry Sorokin](https://github.com/DimedS), +[Huong Nguyen](https://github.com/Huongg), +[Ivan Danov](https://github.com/idanov), +[Jitendra Gundaniya](https://github.com/jitu5), +[Jo Stichbury](https://github.com/stichbury), +[Joel Schwarzmann](https://github.com/datajoely), +[Juan Luis Cano](https://github.com/astrojuanlu), +[Laura Couto](https://github.com/lrcouto), +[Marcin Zabłocki](https://github.com/marrrcin), +[Merel Theisen](https://github.com/merelcht), +[Nero Okwa](https://github.com/NeroOkwa), +[Nok Lam Chan](https://github.com/noklam), +[Rashida Kanchwala](https://github.com/rashidakanchwala), +[Ravi Kumar Pilla](https://github.com/ravi-kumar-pilla), +[Sajid Alam](https://github.com/SajidAlamQB), +[Stephanie Kaiser](https://github.com/stephkaiser), +[Tynan DeBold](https://github.com/tynandebold), +[Vladimir Nikolic](https://github.com/vladimir-mck), and +[Yetunde Dada](https://github.com/yetudada). + +Former core team members with significant contributions include: + +[Andrii Ivaniuk](https://github.com/andrii-ivaniuk), +[Anton Kirilenko](https://github.com/Flid), +[Cvetanka Nechevska](https://github.com/cvetankanechevska), +[Dmitrii Deriabin](https://github.com/dmder), +[Gabriel Comym](https://github.com/comym), +[Gordon Wrigley](https://github.com/tolomea), +[Hamza Oza](https://github.com/hamzaoza), +[Ignacio Paricio](https://github.com/ignacioparicio), +[Jannic Holzer](https://github.com/jmholzer), +[Jiri Klein](https://github.com/jiriklein), +[Kiyohito Kunii](https://github.com/921kiyo), +[Laís Carvalho](https://github.com/laisbsc), +[Liam Brummitt](https://github.com/bru5), +[Lim Hoang](https://github.com/limdauto), +[Lorena Bălan](https://github.com/lorenabalan), +[Nasef Khan](https://github.com/nakhan98), +[Richard Westenra](https://github.com/richardwestenra), +[Susanna Wong](https://github.com/studioswong) and +[Zain Patel](https://github.com/mzjp2). + + +## Application process + +Every quarter year, existing maintainers will collect a list of contributors that have shown regular activity on the project over the prior months and want to become maintainers. From this list, maintainer candidates are selected and proposed for a vote. + +Following a successful vote, candidates are added to the `kedro-developers` team on the Kedro GitHub organisation +and the `kedro-team` channel on the Kedro Slack organisation, and listed as [Kedro maintainers](#kedro-maintainers). + +## Voting process + +Voting can change project maintainers and decide on the future of Kedro. The TSC leads the process as voting maintainers of Kedro. The voting period is one week and via a GitHub discussion or through a pull request. + +### Other issues or proposals + +[Kedro's GitHub discussions](https://github.com/kedro-org/kedro/discussions) section is used to host votes on issues, proposals and changes affecting the future of Kedro, including amendments to our ways of working described on this page. These votes require **a 1/2 majority**. + +### Adding or removing maintainers + +The decision to add or remove a maintainer is made based on TSC members votes in that pull request. Additions and removals of maintainers require **a 2/3 majority**. 
+ +The act of adding or removing maintainers onto the list requires a pull request against the [Kedro maintainers section of this page](#kedro-maintainers). diff --git a/docs/source/css/theme-overrides.css b/docs/source/css/theme-overrides.css deleted file mode 100644 index c928bd0256..0000000000 --- a/docs/source/css/theme-overrides.css +++ /dev/null @@ -1,11 +0,0 @@ -/* override table width restrictions */ -@media screen and (min-width: 767px) { - - .wy-table-responsive table td { - white-space: normal; - } - - .wy-table-responsive { - overflow: visible; - } -} diff --git a/docs/source/data/data_catalog.md b/docs/source/data/data_catalog.md new file mode 100644 index 0000000000..ccc3b24960 --- /dev/null +++ b/docs/source/data/data_catalog.md @@ -0,0 +1,845 @@ +# The Data Catalog + +This section introduces `catalog.yml`, the project-shareable Data Catalog. The file is located in `conf/base` and is a registry of all data sources available for use by a project; it manages loading and saving of data. + +All supported data connectors are available in [`kedro-datasets`](/kedro_datasets). + +## Use the Data Catalog within Kedro configuration + +Kedro uses configuration to make your code reproducible when it has to reference datasets in different locations and/or in different environments. + +You can copy this file and reference additional locations for the same datasets. For instance, you can use the `catalog.yml` file in `conf/base/` to register the locations of datasets that would run in production, while copying and updating a second version of `catalog.yml` in `conf/local/` to register the locations of sample datasets that you are using for prototyping your data pipeline(s). + +Built-in functionality for `conf/local/` to overwrite `conf/base/` is [described in the documentation about configuration](../configuration/configuration_basics.md). This means that a dataset called `cars` could exist in the `catalog.yml` files in `conf/base/` and `conf/local/`. In code, in `src`, you would only call a dataset named `cars` and Kedro would detect which definition of `cars` dataset to use to run your pipeline - `cars` definition from `conf/local/catalog.yml` would take precedence in this case. + +The Data Catalog also works with the `credentials.yml` file in `conf/local/`, allowing you to specify usernames and passwords required to load certain datasets. + +You can define a Data Catalog in two ways - through YAML configuration, or programmatically using an API. Both methods allow you to specify: + + - Dataset name + - Dataset type + - Location of the dataset using `fsspec`, detailed in the next section + - Credentials needed to access the dataset + - Load and saving arguments + - Whether you want a [dataset or ML model to be versioned](kedro_io.md#versioning) when you run your data pipeline + +## Specify the location of the dataset + +Kedro relies on [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to read and save data from a variety of data stores including local file systems, network file systems, cloud object stores, and Hadoop. When specifying a storage location in `filepath:`, you should provide a URL using the general form `protocol://path/to/data`. If no protocol is provided, the local file system is assumed (same as ``file://``). + +The following prepends are available: + +- **Local or Network File System**: `file://` - the local file system is default in the absence of any protocol, it also permits relative paths. 
+- **Hadoop File System (HDFS)**: `hdfs://user@server:port/path/to/data` - Hadoop Distributed File System, for resilient, replicated files within a cluster. +- **Amazon S3**: `s3://my-bucket-name/path/to/data` - Amazon S3 remote binary store, often used with Amazon EC2, + using the library s3fs. +- **S3 Compatible Storage**: `s3://my-bucket-name/path/_to/data` - e.g. Minio, using the s3fs library. +- **Google Cloud Storage**: `gcs://` - Google Cloud Storage, typically used with Google Compute + resource using gcsfs (in development). +- **Azure Blob Storage / Azure Data Lake Storage Gen2**: `abfs://` - Azure Blob Storage, typically used when working on an Azure environment. +- **HTTP(s)**: ``http://`` or ``https://`` for reading data directly from HTTP web servers. + +`fsspec` also provides other file systems, such as SSH, FTP and WebHDFS. [See the fsspec documentation for more information](https://filesystem-spec.readthedocs.io/en/latest/api.html#implementations). + +## Data Catalog `*_args` parameters + +Data Catalog accepts two different groups of `*_args` parameters that serve different purposes: +- `fs_args` +- `load_args` and `save_args` + +The `fs_args` is used to configure the interaction with a filesystem. +All the top-level parameters of `fs_args` (except `open_args_load` and `open_args_save`) will be passed in an underlying filesystem class. + +### Example 1: Provide the `project` value to the underlying filesystem class (`GCSFileSystem`) to interact with Google Cloud Storage (GCS) + +```yaml +test_dataset: + type: ... + fs_args: + project: test_project +``` + +The `open_args_load` and `open_args_save` parameters are passed to the filesystem's `open` method to configure how a dataset file (on a specific filesystem) is opened during a load or save operation, respectively. + +### Example 2: Load data from a local binary file using `utf-8` encoding + +```yaml +test_dataset: + type: ... + fs_args: + open_args_load: + mode: "rb" + encoding: "utf-8" +``` + +`load_args` and `save_args` configure how a third-party library (e.g. `pandas` for `CSVDataSet`) loads/saves data from/to a file. + +### Example 3: Save data to a CSV file without row names (index) using `utf-8` encoding + +```yaml +test_dataset: + type: pandas.CSVDataSet + ... + save_args: + index: False + encoding: "utf-8" +``` + +## Use the Data Catalog with the YAML API + +The YAML API allows you to configure your datasets in a YAML configuration file, `conf/base/catalog.yml` or `conf/local/catalog.yml`. + +Here are some examples of data configuration in a `catalog.yml`: + +### Example 1: Loads / saves a CSV file from / to a local file system + +```yaml +bikes: + type: pandas.CSVDataSet + filepath: data/01_raw/bikes.csv +``` + +### Example 2: Loads and saves a CSV on a local file system, using specified load and save arguments + +```yaml +cars: + type: pandas.CSVDataSet + filepath: data/01_raw/company/cars.csv + load_args: + sep: ',' + save_args: + index: False + date_format: '%Y-%m-%d %H:%M' + decimal: . 
+ +``` + +### Example 3: Loads and saves a compressed CSV on a local file system + +```yaml +boats: + type: pandas.CSVDataSet + filepath: data/01_raw/company/boats.csv.gz + load_args: + sep: ',' + compression: 'gzip' + fs_args: + open_args_load: + mode: 'rb' +``` + +### Example 4: Loads a CSV file from a specific S3 bucket, using credentials and load arguments + +```yaml +motorbikes: + type: pandas.CSVDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.csv + credentials: dev_s3 + load_args: + sep: ',' + skiprows: 5 + skipfooter: 1 + na_values: ['#NA', NA] +``` + +### Example 5: Loads / saves a pickle file from / to a local file system + +```yaml +airplanes: + type: pickle.PickleDataSet + filepath: data/06_models/airplanes.pkl + backend: pickle +``` + +### Example 6: Loads an Excel file from Google Cloud Storage + +```yaml +rockets: + type: pandas.ExcelDataSet + filepath: gcs://your_bucket/data/02_intermediate/company/motorbikes.xlsx + fs_args: + project: my-project + credentials: my_gcp_credentials + save_args: + sheet_name: Sheet1 +``` + +### Example 7: Loads a multi-sheet Excel file from a local file system + +```yaml +trains: + type: pandas.ExcelDataSet + filepath: data/02_intermediate/company/trains.xlsx + load_args: + sheet_name: [Sheet1, Sheet2, Sheet3] +``` + +### Example 8: Saves an image created with Matplotlib on Google Cloud Storage + +```yaml +results_plot: + type: matplotlib.MatplotlibWriter + filepath: gcs://your_bucket/data/08_results/plots/output_1.jpeg + fs_args: + project: my-project + credentials: my_gcp_credentials +``` + + +### Example 9: Loads / saves an HDF file on local file system storage, using specified load and save arguments + +```yaml +skateboards: + type: pandas.HDFDataSet + filepath: data/02_intermediate/skateboards.hdf + key: name + load_args: + columns: [brand, length] + save_args: + mode: w # Overwrite even when the file already exists + dropna: True +``` + +### Example 10: Loads / saves a parquet file on local file system storage, using specified load and save arguments + +```yaml +trucks: + type: pandas.ParquetDataSet + filepath: data/02_intermediate/trucks.parquet + load_args: + columns: [name, gear, disp, wt] + categories: list + index: name + save_args: + compression: GZIP + file_scheme: hive + has_nulls: False + partition_on: [name] +``` + + +### Example 11: Loads / saves a Spark table on S3, using specified load and save arguments + +```yaml +weather: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/weather* + credentials: dev_s3 + file_format: csv + load_args: + header: True + inferSchema: True + save_args: + sep: '|' + header: True +``` + + +### Example 12: Loads / saves a SQL table using credentials, a database connection, using specified load and save arguments + +```yaml +scooters: + type: pandas.SQLTableDataSet + credentials: scooters_credentials + table_name: scooters + load_args: + index_col: [name] + columns: [name, gear] + save_args: + if_exists: replace +``` + +### Example 13: Loads an SQL table with credentials, a database connection, and applies a SQL query to the table + + +```yaml +scooters_query: + type: pandas.SQLQueryDataSet + credentials: scooters_credentials + sql: select * from cars where gear=4 + load_args: + index_col: [name] +``` + +When you use [`pandas.SQLTableDataSet`](/kedro_datasets.pandas.SQLTableDataSet) or [`pandas.SQLQueryDataSet`](/kedro_datasets.pandas.SQLQueryDataSet), you must provide a database connection string. 
In the above example, we pass it using the `scooters_credentials` key from the credentials (see the details in the [Feeding in credentials](#feeding-in-credentials) section below). `scooters_credentials` must have a top-level key `con` containing a [SQLAlchemy compatible](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) connection string. As an alternative to credentials, you could explicitly put `con` into `load_args` and `save_args` (`pandas.SQLTableDataSet` only). + + +### Example 14: Loads data from an API endpoint, example US corn yield data from USDA + +```yaml +us_corn_yield_data: + type: api.APIDataSet + url: https://quickstats.nass.usda.gov + credentials: usda_credentials + load_args: + params: + key: SOME_TOKEN + format: JSON + commodity_desc: CORN + statisticcat_des: YIELD + agg_level_desc: STATE + year: 2000 +``` + +Note that `usda_credientials` will be passed as the `auth` argument in the `requests` library. Specify the username and password as a list in your `credentials.yml` file as follows: + +```yaml +usda_credentials: + - username + - password +``` + + +### Example 15: Loads data from Minio (S3 API Compatible Storage) + + +```yaml +test: + type: pandas.CSVDataSet + filepath: s3://your_bucket/test.csv # assume `test.csv` is uploaded to the Minio server. + credentials: dev_minio +``` +In `credentials.yml`, define the `key`, `secret` and the `endpoint_url` as follows: + +```yaml +dev_minio: + key: token + secret: key + client_kwargs: + endpoint_url : 'http://localhost:9000' +``` + +```{note} +The easiest way to setup MinIO is to run a Docker image. After the following command, you can access the Minio server with `http://localhost:9000` and create a bucket and add files as if it is on S3. +``` + +`docker run -p 9000:9000 -e "MINIO_ACCESS_KEY=token" -e "MINIO_SECRET_KEY=key" minio/minio server /data` + + +### Example 16: Loads a model saved as a pickle from Azure Blob Storage + +```yaml +ml_model: + type: pickle.PickleDataSet + filepath: "abfs://models/ml_models.pickle" + versioned: True + credentials: dev_abs +``` +In the `credentials.yml` file, define the `account_name` and `account_key`: + +```yaml +dev_abs: + account_name: accountname + account_key: key +``` + + +### Example 17: Loads a CSV file stored in a remote location through SSH + +```{note} +This example requires [Paramiko](https://www.paramiko.org) to be installed (`pip install paramiko`). +``` +```yaml +cool_dataset: + type: pandas.CSVDataSet + filepath: "sftp:///path/to/remote_cluster/cool_data.csv" + credentials: cluster_credentials +``` +All parameters required to establish the SFTP connection can be defined through `fs_args` or in the `credentials.yml` file as follows: + +```yaml +cluster_credentials: + username: my_username + host: host_address + port: 22 + password: password +``` +The list of all available parameters is given in the [Paramiko documentation](https://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect). + +## Create a Data Catalog YAML configuration file via CLI + +You can use the [`kedro catalog create` command to create a Data Catalog YAML configuration](../development/commands_reference.md#create-a-data-catalog-yaml-configuration-file). + +This creates a `//catalog/.yml` configuration file with `MemoryDataSet` datasets for each dataset in a registered pipeline if it is missing from the `DataCatalog`. 
+ +```yaml +# //catalog/.yml +rockets: + type: MemoryDataSet +scooters: + type: MemoryDataSet +``` + +## Adding parameters + +You can [configure parameters](../configuration/parameters.md) for your project and [reference them](../configuration/parameters.md#how-to-use-parameters) in your nodes. To do this, use the `add_feed_dict()` method ([API documentation](/kedro.io.DataCatalog)). You can use this method to add any other entry or metadata you wish on the `DataCatalog`. + + +## Feeding in credentials + +Before instantiating the `DataCatalog`, Kedro will first attempt to read [the credentials from the project configuration](../configuration/credentials.md). The resulting dictionary is then passed into `DataCatalog.from_config()` as the `credentials` argument. + +Let's assume that the project contains the file `conf/local/credentials.yml` with the following contents: + +```yaml +dev_s3: + client_kwargs: + aws_access_key_id: key + aws_secret_access_key: secret + +scooters_credentials: + con: sqlite:///kedro.db + +my_gcp_credentials: + id_token: key +``` + +In the example above, the `catalog.yml` file contains references to credentials keys `dev_s3` and `scooters_credentials`. This means that when it instantiates the `motorbikes` dataset, for example, the `DataCatalog` will attempt to read top-level key `dev_s3` from the received `credentials` dictionary, and then will pass its values into the dataset `__init__` as a `credentials` argument. This is essentially equivalent to calling this: + +```python +CSVDataSet( + filepath="s3://test_bucket/data/02_intermediate/company/motorbikes.csv", + load_args=dict(sep=",", skiprows=5, skipfooter=1, na_values=["#NA", "NA"]), + credentials=dict(key="token", secret="key"), +) +``` + + +## Load multiple datasets with similar configuration using YAML anchors + +Different datasets might use the same file format, load and save arguments, and be stored in the same folder. [YAML has a built-in syntax](https://yaml.org/spec/1.2.1/#Syntax) for factorising parts of a YAML file, which means that you can decide what is generalisable across your datasets, so that you need not spend time copying and pasting dataset configurations in the `catalog.yml` file. + +You can see this in the following example: + +```yaml +_csv: &csv + type: spark.SparkDataSet + file_format: csv + load_args: + sep: ',' + na_values: ['#NA', NA] + header: True + inferSchema: False + +cars: + <<: *csv + filepath: s3a://data/01_raw/cars.csv + +trucks: + <<: *csv + filepath: s3a://data/01_raw/trucks.csv + +bikes: + <<: *csv + filepath: s3a://data/01_raw/bikes.csv + load_args: + header: False +``` + +The syntax `&csv` names the following block `csv` and the syntax `<<: *csv` inserts the contents of the block named `csv`. Locally declared keys entirely override inserted ones as seen in `bikes`. + +```{note} +It's important that the name of the template entry starts with a `_` so Kedro knows not to try and instantiate it as a dataset. +``` + +You can also nest reuseable YAML syntax: + +```yaml +_csv: &csv + type: spark.SparkDataSet + file_format: csv + load_args: &csv_load_args + header: True + inferSchema: False + +airplanes: + <<: *csv + filepath: s3a://data/01_raw/airplanes.csv + load_args: + <<: *csv_load_args + sep: ; +``` + +In this example, the default `csv` configuration is inserted into `airplanes` and then the `load_args` block is overridden. Normally, that would replace the whole dictionary. In order to extend `load_args`, the defaults for that block are then re-inserted. 
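If you want to double-check how the anchors resolve, a small sketch such as the one below loads the snippet with PyYAML (installed alongside Kedro) and prints the effective `load_args` for `airplanes`. This only illustrates standard YAML merge-key behaviour; it is not a step Kedro asks you to perform.

```python
# Illustration only: inspect how the YAML merge keys (`<<:`) above resolve.
import yaml

catalog_yaml = """
_csv: &csv
  type: spark.SparkDataSet
  file_format: csv
  load_args: &csv_load_args
    header: True
    inferSchema: False

airplanes:
  <<: *csv
  filepath: s3a://data/01_raw/airplanes.csv
  load_args:
    <<: *csv_load_args
    sep: ;
"""

resolved = yaml.safe_load(catalog_yaml)
print(resolved["airplanes"]["load_args"])
# {'header': True, 'inferSchema': False, 'sep': ';'}
```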
+ +## Load multiple datasets with similar configuration using dataset factories +For catalog entries that share configuration details, you can also use the dataset factories introduced in Kedro 0.18.12. This syntax allows you to generalise the configuration and +reduce the number of similar catalog entries by matching datasets used in your project's pipelines to dataset factory patterns. + +### Example 1: Generalise datasets with similar names and types into one dataset factory +Consider the following catalog entries: +```yaml +factory_data: + type: pandas.CSVDataSet + filepath: data/01_raw/factory_data.csv + + +process_data: + type: pandas.CSVDataSet + filepath: data/01_raw/process_data.csv +``` +The datasets in this catalog can be generalised to the following dataset factory: +```yaml +"{name}_data": + type: pandas.CSVDataSet + filepath: data/01_raw/{name}_data.csv +``` +When `factory_data` or `process_data` is used in your pipeline, it is matched to the factory pattern `{name}_data`. The factory pattern must always be enclosed in +quotes to avoid YAML parsing errors. + + +### Example 2: Generalise datasets of the same type into one dataset factory +You can also combine all the datasets with the same type and configuration details. For example, consider the following +catalog with three datasets named `boats`, `cars` and `planes` of the type `pandas.CSVDataSet`: +```yaml +boats: + type: pandas.CSVDataSet + filepath: data/01_raw/shuttles.csv + +cars: + type: pandas.CSVDataSet + filepath: data/01_raw/reviews.csv + +planes: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv +``` +These datasets can be combined into the following dataset factory: +```yaml +"{dataset_name}#csv": + type: pandas.CSVDataSet + filepath: data/01_raw/{dataset_name}.csv +``` +You will then have to update the pipelines in your project located at `src///pipeline.py` to refer to these datasets as `boats#csv`, +`cars#csv` and `planes#csv`. Adding a suffix or a prefix to the dataset names and the dataset factory patterns, like `#csv` here, ensures that the dataset +names are matched with the intended pattern. +```python +from .nodes import create_model_input_table, preprocess_companies, preprocess_shuttles + + +def create_pipeline(**kwargs) -> Pipeline: + return pipeline( + [ + node( + func=preprocess_boats, + inputs="boats#csv", + outputs="preprocessed_boats", + name="preprocess_boats_node", + ), + node( + func=preprocess_cars, + inputs="cars#csv", + outputs="preprocessed_cars", + name="preprocess_cars_node", + ), + node( + func=preprocess_planes, + inputs="planes#csv", + outputs="preprocessed_planes", + name="preprocess_planes_node", + ), + node( + func=create_model_input_table, + inputs=[ + "preprocessed_boats", + "preprocessed_planes", + "preprocessed_cars", + ], + outputs="model_input_table", + name="create_model_input_table_node", + ), + ] + ) +``` +### Example 3: Generalise datasets using namespaces into one dataset factory +You can also generalise the catalog entries for datasets belonging to namespaced modular pipelines. 
Consider the +following pipeline which takes in a `model_input_table` and outputs two regressors belonging to the +`active_modelling_pipeline` and the `candidate_modelling_pipeline` namespaces: +```python +from kedro.pipeline import Pipeline, node +from kedro.pipeline.modular_pipeline import pipeline + +from .nodes import evaluate_model, split_data, train_model + + +def create_pipeline(**kwargs) -> Pipeline: + pipeline_instance = pipeline( + [ + node( + func=split_data, + inputs=["model_input_table", "params:model_options"], + outputs=["X_train", "y_train"], + name="split_data_node", + ), + node( + func=train_model, + inputs=["X_train", "y_train"], + outputs="regressor", + name="train_model_node", + ), + ] + ) + ds_pipeline_1 = pipeline( + pipe=pipeline_instance, + inputs="model_input_table", + namespace="active_modelling_pipeline", + ) + ds_pipeline_2 = pipeline( + pipe=pipeline_instance, + inputs="model_input_table", + namespace="candidate_modelling_pipeline", + ) + + return ds_pipeline_1 + ds_pipeline_2 +``` +You can now have one dataset factory pattern in your catalog instead of two separate entries for `active_modelling_pipeline.regressor` +and `candidate_modelling_pipeline.regressor` as below: +```yaml +{namespace}.regressor: + type: pickle.PickleDataSet + filepath: data/06_models/regressor_{namespace}.pkl + versioned: true +``` +### Example 4: Generalise datasets of the same type in different layers into one dataset factory with multiple placeholders + +You can use multiple placeholders in the same pattern. For example, consider the following catalog where the dataset +entries share `type`, `file_format` and `save_args`: +```yaml +processing.factory_data: + type: spark.SparkDataSet + filepath: data/processing/factory_data.pq + file_format: parquet + save_args: + mode: overwrite + +processing.process_data: + type: spark.SparkDataSet + filepath: data/processing/process_data.pq + file_format: parquet + save_args: + mode: overwrite + +modelling.metrics: + type: spark.SparkDataSet + filepath: data/modelling/factory_data.pq + file_format: parquet + save_args: + mode: overwrite +``` +This could be generalised to the following pattern: +```yaml +"{layer}.{dataset_name}": + type: spark.SparkDataSet + filepath: data/{layer}/{dataset_name}.pq + file_format: parquet + save_args: + mode: overwrite +``` +All the placeholders used in the catalog entry body must exist in the factory pattern name. + +### Example 5: Generalise datasets using multiple dataset factories +You can have multiple dataset factories in your catalog. For example: +```yaml +"{namespace}.{dataset_name}@spark": + type: spark.SparkDataSet + filepath: data/{namespace}/{dataset_name}.pq + file_format: parquet + +"{dataset_name}@csv": + type: pandas.CSVDataSet + filepath: data/01_raw/{dataset_name}.csv +``` + +Having multiple dataset factories in your catalog can lead to a situation where a dataset name from your pipeline might +match multiple patterns. To overcome this, Kedro sorts all the potential matches for the dataset name in the pipeline and picks the best match. +The matches are ranked according to the following criteria : +1. Number of exact character matches between the dataset name and the factory pattern. For example, a dataset named `factory_data$csv` would match `{dataset}_data$csv` over `{dataset_name}$csv`. +2. Number of placeholders. For example, the dataset `preprocessing.shuttles+csv` would match `{namespace}.{dataset}+csv` over `{dataset}+csv`. +3. 
Alphabetical order + +### Example 6: Generalise all datasets with a catch-all dataset factory to overwrite the default `MemoryDataSet` +You can use dataset factories to define a catch-all pattern which will overwrite the default `MemoryDataSet` creation. +```yaml +"{default_dataset}": + type: pandas.CSVDataSet + filepath: data/{default_dataset}.csv + +``` +Kedro will now treat all the datasets mentioned in your project's pipelines that do not appear as specific patterns or explicit entries in your catalog +as `pandas.CSVDataSet`. + +## Transcode datasets + +You might come across a situation where you would like to read the same file using two different dataset implementations. Use transcoding when you want to load and save the same file, via its specified `filepath`, using different `DataSet` implementations. + +### A typical example of transcoding + +For instance, parquet files can not only be loaded via the `ParquetDataSet` using `pandas`, but also directly by `SparkDataSet`. This conversion is typical when coordinating a `Spark` to `pandas` workflow. + +To enable transcoding, define two `DataCatalog` entries for the same dataset in a common format (Parquet, JSON, CSV, etc.) in your `conf/base/catalog.yml`: + +```yaml +my_dataframe@spark: + type: spark.SparkDataSet + filepath: data/02_intermediate/data.parquet + file_format: parquet + +my_dataframe@pandas: + type: pandas.ParquetDataSet + filepath: data/02_intermediate/data.parquet +``` + +These entries are used in the pipeline like this: + +```python +pipeline( + [ + node(func=my_func1, inputs="spark_input", outputs="my_dataframe@spark"), + node(func=my_func2, inputs="my_dataframe@pandas", outputs="pipeline_output"), + ] +) +``` + +### How does transcoding work? + +In this example, Kedro understands that `my_dataframe` is the same dataset in its `spark.SparkDataSet` and `pandas.ParquetDataSet` formats and helps resolve the node execution order. + +In the pipeline, Kedro uses the `spark.SparkDataSet` implementation for saving and `pandas.ParquetDataSet` +for loading, so the first node should output a `pyspark.sql.DataFrame`, while the second node would receive a `pandas.Dataframe`. + + +## Version datasets and ML models + +Making a simple addition to your Data Catalog allows you to perform versioning of datasets and machine learning models. + +Consider the following versioned dataset defined in the `catalog.yml`: + +```yaml +cars: + type: pandas.CSVDataSet + filepath: data/01_raw/company/cars.csv + versioned: True +``` + +The `DataCatalog` will create a versioned `CSVDataSet` called `cars`. The actual csv file location will look like `data/01_raw/company/cars.csv//cars.csv`, where `` corresponds to a global save version string formatted as `YYYY-MM-DDThh.mm.ss.sssZ`. + +You can run the pipeline with a particular versioned data set with `--load-version` flag as follows: + +```bash +kedro run --load-version=cars:YYYY-MM-DDThh.mm.ss.sssZ +``` +where `--load-version` is dataset name and version timestamp separated by `:`. + +This section shows just the very basics of versioning, which is described further in [the documentation about Kedro IO](../data/kedro_io.md#versioning). + +## Use the Data Catalog with the Code API + +The code API allows you to: + +* configure data sources in code +* operate the IO module within notebooks + +### Configure a Data Catalog + +In a file like `catalog.py`, you can construct a `DataCatalog` object programmatically. 
In the following, we are using several pre-built data loaders documented in the [API reference documentation](/kedro_datasets). + +```python +from kedro.io import DataCatalog +from kedro_datasets.pandas import ( + CSVDataSet, + SQLTableDataSet, + SQLQueryDataSet, + ParquetDataSet, +) + +io = DataCatalog( + { + "bikes": CSVDataSet(filepath="../data/01_raw/bikes.csv"), + "cars": CSVDataSet(filepath="../data/01_raw/cars.csv", load_args=dict(sep=",")), + "cars_table": SQLTableDataSet( + table_name="cars", credentials=dict(con="sqlite:///kedro.db") + ), + "scooters_query": SQLQueryDataSet( + sql="select * from cars where gear=4", + credentials=dict(con="sqlite:///kedro.db"), + ), + "ranked": ParquetDataSet(filepath="ranked.parquet"), + } +) +``` + +When using `SQLTableDataSet` or `SQLQueryDataSet` you must provide a `con` key containing [SQLAlchemy compatible](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) database connection string. In the example above we pass it as part of `credentials` argument. Alternative to `credentials` is to put `con` into `load_args` and `save_args` (`SQLTableDataSet` only). + +### Load datasets + +You can access each dataset by its name. + +```python +cars = io.load("cars") # data is now loaded as a DataFrame in 'cars' +gear = cars["gear"].values +``` + +#### Behind the scenes + +The following steps happened behind the scenes when `load` was called: + +- The value `cars` was located in the Data Catalog +- The corresponding `AbstractDataSet` object was retrieved +- The `load` method of this dataset was called +- This `load` method delegated the loading to the underlying pandas `read_csv` function + +### View the available data sources + +If you forget what data was assigned, you can always review the `DataCatalog`. + +```python +io.list() +``` + +### Save data + +You can save data using an API similar to that used to load data. + +```{warning} +This use is not recommended unless you are prototyping in notebooks. +``` + +#### Save data to memory + +```python +from kedro.io import MemoryDataSet + +memory = MemoryDataSet(data=None) +io.add("cars_cache", memory) +io.save("cars_cache", "Memory can store anything.") +io.load("cars_cache") +``` + +#### Save data to a SQL database for querying + +We might now want to put the data in a SQLite database to run queries on it. Let's use that to rank scooters by their mpg. + +```python +import os + +# This cleans up the database in case it exists at this point +try: + os.remove("kedro.db") +except FileNotFoundError: + pass + +io.save("cars_table", cars) +ranked = io.load("scooters_query")[["brand", "mpg"]] +``` + +#### Save data in Parquet + +Finally, we can save the processed data in Parquet format. + +```python +io.save("ranked", ranked) +``` + +```{warning} +Saving `None` to a dataset is not allowed! +``` diff --git a/docs/source/data/index.md b/docs/source/data/index.md new file mode 100644 index 0000000000..00c05353fc --- /dev/null +++ b/docs/source/data/index.md @@ -0,0 +1,8 @@ +# Data Catalog + +```{toctree} +:maxdepth: 1 + +data_catalog +kedro_io +``` diff --git a/docs/source/data/kedro_io.md b/docs/source/data/kedro_io.md new file mode 100644 index 0000000000..6fdfefdd66 --- /dev/null +++ b/docs/source/data/kedro_io.md @@ -0,0 +1,615 @@ +# Kedro IO + + +In this tutorial, we cover advanced uses of [the Kedro IO module](/kedro.io) to understand the underlying implementation. 
The relevant API documentation is [kedro.io.AbstractDataSet](/kedro.io.AbstractDataSet) and [kedro.io.DataSetError](/kedro.io.DataSetError). + +## Error handling + +We have custom exceptions for the main classes of errors that you can handle to deal with failures. + +```python +from kedro.io import * +``` + +```python +io = DataCatalog(data_sets=dict()) # empty catalog + +try: + cars_df = io.load("cars") +except DataSetError: + print("Error raised.") +``` + + +## AbstractDataSet + +To understand what is going on behind the scenes, you should study the [AbstractDataSet interface](/kedro.io.AbstractDataSet). `AbstractDataSet` is the underlying interface that all datasets extend. It requires subclasses to override the `_load` and `_save` and provides `load` and `save` methods that enrich the corresponding private methods with uniform error handling. It also requires subclasses to override `_describe`, which is used in logging the internal information about the instances of your custom `AbstractDataSet` implementation. + +If you have a dataset called `parts`, you can make direct calls to it like so: + +```python +parts_df = parts.load() +``` + +We recommend using a `DataCatalog` instead (for more details, see [the `DataCatalog` documentation](../data/data_catalog.md)) as it has been designed to make all datasets available to project members. + +For contributors, if you would like to submit a new dataset, you must extend the `AbstractDataSet`. For a complete guide, please read [the section on custom datasets](../extend_kedro/custom_datasets.md). + + +## Versioning + +In order to enable versioning, you need to update the `catalog.yml` config file and set the `versioned` attribute to `true` for the given dataset. If this is a custom dataset, the implementation must also: + 1. extend `kedro.io.core.AbstractVersionedDataSet` AND + 2. add `version` namedtuple as an argument to its `__init__` method AND + 3. call `super().__init__()` with positional arguments `filepath`, `version`, and, optionally, with `glob` and `exists` functions if it uses a non-local filesystem (see [kedro_datasets.pandas.CSVDataSet](/kedro_datasets.pandas.CSVDataSet) as an example) AND + 4. modify its `_describe`, `_load` and `_save` methods respectively to support versioning (see [`kedro_datasets.pandas.CSVDataSet`](/kedro_datasets.pandas.CSVDataSet) for an example implementation) + +```{note} +If a new version of a dataset is created mid-run, for instance by an external system adding new files, it will not interfere in the current run, i.e. the load version stays the same throughout subsequent loads. 
+``` + +An example dataset could look similar to the below: + +```python +from pathlib import Path, PurePosixPath + +import pandas as pd + +from kedro.io import AbstractVersionedDataSet + + +class MyOwnDataSet(AbstractVersionedDataSet): + def __init__(self, filepath, version, param1, param2=True): + super().__init__(PurePosixPath(filepath), version) + self._param1 = param1 + self._param2 = param2 + + def _load(self) -> pd.DataFrame: + load_path = self._get_load_path() + return pd.read_csv(load_path) + + def _save(self, df: pd.DataFrame) -> None: + save_path = self._get_save_path() + df.to_csv(save_path) + + def _exists(self) -> bool: + path = self._get_load_path() + return Path(path).exists() + + def _describe(self): + return dict(version=self._version, param1=self._param1, param2=self._param2) +``` + +With `catalog.yml` specifying: + +```yaml +my_dataset: + type: .MyOwnDataSet + filepath: data/01_raw/my_data.csv + versioned: true + param1: # param1 is a required argument + # param2 will be True by default +``` + +### `version` namedtuple + +Versioned dataset `__init__` method must have an optional argument called `version` with a default value of `None`. If provided, this argument must be an instance of [`kedro.io.core.Version`](/kedro.io.Version). Its `load` and `save` attributes must either be `None` or contain string values representing exact load and save versions: + +* If `version` is `None`, then the dataset is considered *not versioned*. +* If `version.load` is `None`, then the latest available version will be used to load the dataset, otherwise a string representing exact load version must be provided. +* If `version.save` is `None`, then a new save version string will be generated by calling `kedro.io.core.generate_timestamp()`, otherwise a string representing the exact save version must be provided. + +### Versioning using the YAML API + +The easiest way to version a specific dataset is to change the corresponding entry in the `catalog.yml` file. For example, if the following dataset was defined in the `catalog.yml` file: + +```yaml +cars: + type: pandas.CSVDataSet + filepath: data/01_raw/company/car_data.csv + versioned: true +``` + +The `DataCatalog` will create a versioned `CSVDataSet` called `cars`. The actual csv file location will look like `data/01_raw/company/car_data.csv//car_data.csv`, where `` corresponds to a global save version string formatted as `YYYY-MM-DDThh.mm.ss.sssZ`. Every time the `DataCatalog` is instantiated, it generates a new global save version, which is propagated to all versioned datasets it contains. + +The `catalog.yml` file only allows you to version your datasets, but does not allow you to choose which version to load or save. This is deliberate because we have chosen to separate the data catalog from any runtime configuration. If you need to pin a dataset version, you can either [specify the versions in a separate `yml` file and call it at runtime](../nodes_and_pipelines/run_a_pipeline.md#configure-kedro-run-arguments) or [instantiate your versioned datasets using Code API and define a version parameter explicitly](#versioning-using-the-code-api). + +By default, the `DataCatalog` will load the latest version of the dataset. However, you can also specify an exact load version. 
In order to do that, pass a dictionary with exact load versions to `DataCatalog.from_config`: + +```python +load_versions = {"cars": "2019-02-13T14.35.36.518Z"} +io = DataCatalog.from_config(catalog_config, credentials, load_versions=load_versions) +cars = io.load("cars") +``` + +The last row in the example above would attempt to load a CSV file from `data/01_raw/company/car_data.csv/2019-02-13T14.35.36.518Z/car_data.csv`: + +* `load_versions` configuration has an effect only if a dataset versioning has been enabled in the catalog config file - see the example above. + +* We recommend that you do not override `save_version` argument in `DataCatalog.from_config` unless strongly required to do so, since it may lead to inconsistencies between loaded and saved versions of the versioned datasets. + +```{warning} +The `DataCatalog` does not re-generate save versions between instantiations. Therefore, if you call `catalog.save('cars', some_data)` twice, then the second call will fail, since it tries to overwrite a versioned dataset using the same save version. To mitigate this, reload your data catalog by calling `%reload_kedro` line magic. This limitation does not apply to `load` operation. +``` + +### Versioning using the Code API + +Although we recommend enabling versioning using the `catalog.yml` config file as described in the section above, you might require more control over load and save versions of a specific dataset. To achieve this, you can instantiate `Version` and pass it as a parameter to the dataset initialisation: + +```python +from kedro.io import DataCatalog, Version +from kedro_datasets.pandas import CSVDataSet +import pandas as pd + +data1 = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) +data2 = pd.DataFrame({"col1": [7], "col2": [8], "col3": [9]}) +version = Version( + load=None, # load the latest available version + save=None, # generate save version automatically on each save operation +) + +test_data_set = CSVDataSet( + filepath="data/01_raw/test.csv", save_args={"index": False}, version=version +) +io = DataCatalog({"test_data_set": test_data_set}) + +# save the dataset to data/01_raw/test.csv//test.csv +io.save("test_data_set", data1) +# save the dataset into a new file data/01_raw/test.csv//test.csv +io.save("test_data_set", data2) + +# load the latest version from data/test.csv/*/test.csv +reloaded = io.load("test_data_set") +assert data2.equals(reloaded) +``` + +```{note} +In the example above, we did not fix any versions. If we do, then the behaviour of load and save operations becomes slightly different: +``` + +```python +version = Version( + load="my_exact_version", # load exact version + save="my_exact_version", # save to exact version +) + +test_data_set = CSVDataSet( + filepath="data/01_raw/test.csv", save_args={"index": False}, version=version +) +io = DataCatalog({"test_data_set": test_data_set}) + +# save the dataset to data/01_raw/test.csv/my_exact_version/test.csv +io.save("test_data_set", data1) +# load from data/01_raw/test.csv/my_exact_version/test.csv +reloaded = io.load("test_data_set") +assert data1.equals(reloaded) + +# raises DataSetError since the path +# data/01_raw/test.csv/my_exact_version/test.csv already exists +io.save("test_data_set", data2) +``` + +```{warning} +We do not recommend passing exact load and/or save versions, since it might lead to inconsistencies between operations. 
For example, if versions for load and save operations do not match, a save operation would result in a `UserWarning` indicating that save and load versions do not match. Load after save might also return an error if the corresponding load version is not found: +``` + +```python +version = Version( + load="exact_load_version", # load exact version + save="exact_save_version", # save to exact version +) + +test_data_set = CSVDataSet( + filepath="data/01_raw/test.csv", save_args={"index": False}, version=version +) +io = DataCatalog({"test_data_set": test_data_set}) + +io.save("test_data_set", data1) # emits a UserWarning due to version inconsistency + +# raises DataSetError since the data/01_raw/test.csv/exact_load_version/test.csv +# file does not exist +reloaded = io.load("test_data_set") +``` + +### Supported datasets + +Currently, the following datasets support versioning: + +- `kedro_datasets.matplotlib.MatplotlibWriter` +- `kedro_datasets.holoviews.HoloviewsWriter` +- `kedro_datasets.networkx.NetworkXDataSet` +- `kedro_datasets.pandas.CSVDataSet` +- `kedro_datasets.pandas.ExcelDataSet` +- `kedro_datasets.pandas.FeatherDataSet` +- `kedro_datasets.pandas.HDFDataSet` +- `kedro_datasets.pandas.JSONDataSet` +- `kedro_datasets.pandas.ParquetDataSet` +- `kedro_datasets.pickle.PickleDataSet` +- `kedro_datasets.pillow.ImageDataSet` +- `kedro_datasets.text.TextDataSet` +- `kedro_datasets.spark.SparkDataSet` +- `kedro_datasets.yaml.YAMLDataSet` +- `kedro_datasets.api.APIDataSet` +- `kedro_datasets.tensorflow.TensorFlowModelDataSet` +- `kedro_datasets.json.JSONDataSet` + +```{note} +Although HTTP(S) is a supported file system in the dataset implementations, it does not support versioning. +``` + +## Partitioned dataset + +These days, distributed systems play an increasingly important role in ETL data pipelines. They significantly increase the processing throughput, enabling us to work with much larger volumes of input data. However, these benefits sometimes come at a cost. When dealing with the input data generated by such distributed systems, you might encounter a situation where your Kedro node needs to read the data from a directory full of uniform files of the same type (e.g. JSON, CSV, Parquet, etc.) rather than from a single file. Tools like `PySpark` and the corresponding [SparkDataSet](/kedro_datasets.spark.SparkDataSet) cater for such use cases, but the use of Spark is not always feasible. + +This is why Kedro provides a built-in [PartitionedDataSet](/kedro.io.PartitionedDataSet), with the following features: + +* `PartitionedDataSet` can recursively load/save all or specific files from a given location. +* It is platform agnostic, and can work with any filesystem implementation supported by [fsspec](https://filesystem-spec.readthedocs.io/) including local, S3, GCS, and many more. +* It implements a [lazy loading](https://en.wikipedia.org/wiki/Lazy_loading) approach, and does not attempt to load any partition data until a processing node explicitly requests it. +* It supports lazy saving by using `Callable`s. + +```{note} +In this section, each individual file inside a given location is called a partition. +``` + +### Partitioned dataset definition + +`PartitionedDataSet` definition can be put in your `catalog.yml` file like any other regular dataset definition. 
The definition represents the following structure: + +```yaml +# conf/base/catalog.yml + +my_partitioned_dataset: + type: PartitionedDataSet + path: s3://my-bucket-name/path/to/folder # path to the location of partitions + dataset: pandas.CSVDataSet # shorthand notation for the dataset which will handle individual partitions + credentials: my_credentials + load_args: + load_arg1: value1 + load_arg2: value2 +``` + +```{note} +Like any other dataset, `PartitionedDataSet` can also be instantiated programmatically in Python: +``` + +```python +from kedro_datasets.pandas import CSVDataSet +from kedro.io import PartitionedDataSet + +my_credentials = {...} # credentials dictionary + +my_partitioned_dataset = PartitionedDataSet( + path="s3://my-bucket-name/path/to/folder", + dataset=CSVDataSet, + credentials=my_credentials, + load_args={"load_arg1": "value1", "load_arg2": "value2"}, +) +``` + +Alternatively, if you need more granular configuration of the underlying dataset, its definition can be provided in full: + +```yaml +# conf/base/catalog.yml + +my_partitioned_dataset: + type: PartitionedDataSet + path: s3://my-bucket-name/path/to/folder + dataset: # full dataset config notation + type: pandas.CSVDataSet + load_args: + delimiter: "," + save_args: + index: false + credentials: my_credentials + load_args: + load_arg1: value1 + load_arg2: value2 + filepath_arg: filepath # the argument of the dataset to pass the filepath to + filename_suffix: ".csv" +``` + +Here is an exhaustive list of the arguments supported by `PartitionedDataSet`: + +| Argument | Required | Supported types | Description | +| ----------------- | ------------------------------ | ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | Yes | `str` | Path to the folder containing partitioned data. If path starts with the protocol (e.g., `s3://`) then the corresponding `fsspec` concrete filesystem implementation will be used. If protocol is not specified, local filesystem will be used | +| `dataset` | Yes | `str`, `Type[AbstractDataSet]`, `Dict[str, Any]` | Underlying dataset definition, for more details see the section below | +| `credentials` | No | `Dict[str, Any]` | Protocol-specific options that will be passed to `fsspec.filesystemcall`, for more details see the section below | +| `load_args` | No | `Dict[str, Any]` | Keyword arguments to be passed into `find()` method of the corresponding filesystem implementation | +| `filepath_arg` | No | `str` (defaults to `filepath`) | Argument name of the underlying dataset initializer that will contain a path to an individual partition | +| `filename_suffix` | No | `str` (defaults to an empty string) | If specified, partitions that don't end with this string will be ignored | + +#### Dataset definition + +Dataset definition should be passed into the `dataset` argument of the `PartitionedDataSet`. The dataset definition is used to instantiate a new dataset object for each individual partition, and use that dataset object for load and save operations. Dataset definition supports shorthand and full notations. + +##### Shorthand notation + +Requires you only to specify a class of the underlying dataset either as a string (e.g. 
`pandas.CSVDataSet` or a fully qualified class path like `kedro_datasets.pandas.CSVDataSet`) or as a class object that is a subclass of the [AbstractDataSet](/kedro.io.AbstractDataSet). + +##### Full notation + +Full notation allows you to specify a dictionary with the full underlying dataset definition _except_ the following arguments: +* The argument that receives the partition path (`filepath` by default) - if specified, a `UserWarning` will be emitted stating that this value will be overridden by individual partition paths +* `credentials` key - specifying it will result in a `DataSetError` being raised; dataset credentials should be passed into the `credentials` argument of the `PartitionedDataSet` rather than the underlying dataset definition - see the section below on [partitioned dataset credentials](#partitioned-dataset-credentials) for details +* `versioned` flag - specifying it will result in a `DataSetError` being raised; versioning cannot be enabled for the underlying datasets + +#### Partitioned dataset credentials + +```{note} +Support for `dataset_credentials` key in the credentials for `PartitionedDataSet` is now deprecated. The dataset credentials should be specified explicitly inside the dataset config. +``` + +Credentials management for `PartitionedDataSet` is somewhat special, because it might contain credentials for both `PartitionedDataSet` itself _and_ the underlying dataset that is used for partition load and save. Top-level credentials are passed to the underlying dataset config (unless such config already has credentials configured), but not the other way around - dataset credentials are never propagated to the filesystem. + +Here is the full list of possible scenarios: + +| Top-level credentials | Underlying dataset credentials | Example `PartitionedDataSet` definition | Description | +| --------------------- | ------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Undefined | Undefined | `PartitionedDataSet(path="s3://bucket-name/path/to/folder", dataset="pandas.CSVDataSet")` | Credentials are not passed to the underlying dataset or the filesystem | +| Undefined | Specified | `PartitionedDataSet(path="s3://bucket-name/path/to/folder", dataset={"type": "pandas.CSVDataSet", "credentials": {"secret": True}})` | Underlying dataset credentials are passed to the `CSVDataSet` constructor, filesystem is instantiated without credentials | +| Specified | Undefined | `PartitionedDataSet(path="s3://bucket-name/path/to/folder", dataset="pandas.CSVDataSet", credentials={"secret": True})` | Top-level credentials are passed to the underlying `CSVDataSet` constructor and the filesystem | +| Specified | `None` | `PartitionedDataSet(path="s3://bucket-name/path/to/folder", dataset={"type": "pandas.CSVDataSet", "credentials": None}, credentials={"dataset_secret": True})` | Top-level credentials are passed to the filesystem, `CSVDataSet` is instantiated without credentials - this way you can stop the top-level credentials from propagating into the dataset config | +| Specified | Specified | `PartitionedDataSet(path="s3://bucket-name/path/to/folder", dataset={"type": "pandas.CSVDataSet", "credentials": {"dataset_secret": True}}, 
credentials={"secret": True})` | Top-level credentials are passed to the filesystem, underlying dataset credentials are passed to the `CSVDataSet` constructor | + +### Partitioned dataset load + +Let's assume that the Kedro pipeline that you are working with contains the node, defined as follows: + +```python +from kedro.pipeline import node + +node(concat_partitions, inputs="my_partitioned_dataset", outputs="concatenated_result") +``` + +The underlying node function `concat_partitions` might look like this: + +```python +from typing import Any, Callable, Dict +import pandas as pd + + +def concat_partitions(partitioned_input: Dict[str, Callable[[], Any]]) -> pd.DataFrame: + """Concatenate input partitions into one pandas DataFrame. + + Args: + partitioned_input: A dictionary with partition ids as keys and load functions as values. + + Returns: + Pandas DataFrame representing a concatenation of all loaded partitions. + """ + result = pd.DataFrame() + + for partition_key, partition_load_func in sorted(partitioned_input.items()): + partition_data = partition_load_func() # load the actual partition data + # concat with existing result + result = pd.concat([result, partition_data], ignore_index=True, sort=True) + + return result +``` + +As you can see from the above example, on load `PartitionedDataSet` _does not_ automatically load the data from the located partitions. Instead, `PartitionedDataSet` returns a dictionary with partition IDs as keys and the corresponding load functions as values. It allows the node that consumes the `PartitionedDataSet` to implement the logic that defines what partitions need to be loaded, and how this data is going to be processed. + +Partition ID _does not_ represent the whole partition path, but only a part of it that is unique for a given partition _and_ filename suffix: + +* Example 1: if `path=s3://my-bucket-name/folder` and partition is stored in `s3://my-bucket-name/folder/2019-12-04/data.csv`, then its Partition ID is `2019-12-04/data.csv`. + + +* Example 2: if `path=s3://my-bucket-name/folder` and `filename_suffix=".csv"` and partition is stored in `s3://my-bucket-name/folder/2019-12-04/data.csv`, then its Partition ID is `2019-12-04/data`. + +`PartitionedDataSet` implements caching on load operation, which means that if multiple nodes consume the same `PartitionedDataSet`, they will all receive the same partition dictionary even if some new partitions were added to the folder after the first load has been completed. This is done deliberately to guarantee the consistency of load operations between the nodes and avoid race conditions. To reset the cache, call the `release()` method of the partitioned dataset object. + +### Partitioned dataset save + +`PartitionedDataSet` also supports a save operation. Let's assume the following configuration: + +```yaml +# conf/base/catalog.yml + +new_partitioned_dataset: + type: PartitionedDataSet + path: s3://my-bucket-name + dataset: pandas.CSVDataSet + filename_suffix: ".csv" +``` + +node definition: + +```python +from kedro.pipeline import node + +node(create_partitions, inputs=None, outputs="new_partitioned_dataset") +``` + +and underlying node function `create_partitions`: + +```python +from typing import Any, Dict +import pandas as pd + + +def create_partitions() -> Dict[str, Any]: + """Create new partitions and save using PartitionedDataSet. + + Returns: + Dictionary with the partitions to create. 
+ """ + return { + # create a file "s3://my-bucket-name/part/foo.csv" + "part/foo": pd.DataFrame({"data": [1, 2]}), + # create a file "s3://my-bucket-name/part/bar.csv.csv" + "part/bar.csv": pd.DataFrame({"data": [3, 4]}), + } +``` + +```{note} +Writing to an existing partition may result in its data being overwritten, if this case is not specifically handled by the underlying dataset implementation. You should implement your own checks to ensure that no existing data is lost when writing to a `PartitionedDataSet`. The simplest safety mechanism could be to use partition IDs with a high chance of uniqueness: for example, the current timestamp. +``` + +### Partitioned dataset lazy saving +`PartitionedDataSet` also supports lazy saving, where the partition's data is not materialised until it is time to write. +To use this, simply return `Callable` types in the dictionary: + +```python +from typing import Any, Dict, Callable +import pandas as pd + + +def create_partitions() -> Dict[str, Callable[[], Any]]: + """Create new partitions and save using PartitionedDataSet. + + Returns: + Dictionary of the partitions to create to a function that creates them. + """ + return { + # create a file "s3://my-bucket-name/part/foo.csv" + "part/foo": lambda: pd.DataFrame({"data": [1, 2]}), + # create a file "s3://my-bucket-name/part/bar.csv" + "part/bar": lambda: pd.DataFrame({"data": [3, 4]}), + } +``` + +```{note} +When using lazy saving, the dataset will be written _after_ the `after_node_run` [hook](../hooks/introduction). +``` + +### Incremental loads with `IncrementalDataSet` + +[IncrementalDataSet](/kedro.io.IncrementalDataSet) is a subclass of `PartitionedDataSet`, which stores the information about the last processed partition in the so-called `checkpoint`. `IncrementalDataSet` addresses the use case when partitions have to be processed incrementally, i.e. each subsequent pipeline run should only process the partitions which were not processed by the previous runs. + +This checkpoint, by default, is persisted to the location of the data partitions. For example, for `IncrementalDataSet` instantiated with path `s3://my-bucket-name/path/to/folder`, the checkpoint will be saved to `s3://my-bucket-name/path/to/folder/CHECKPOINT`, unless [the checkpoint configuration is explicitly overwritten](#checkpoint-configuration). + +The checkpoint file is only created _after_ [the partitioned dataset is explicitly confirmed](#incremental-dataset-confirm). + +#### Incremental dataset load + +Loading `IncrementalDataSet` works similarly to [`PartitionedDataSet`](#partitioned-dataset-load) with several exceptions: +1. `IncrementalDataSet` loads the data _eagerly_, so the values in the returned dictionary represent the actual data stored in the corresponding partition, rather than a pointer to the load function. `IncrementalDataSet` considers a partition relevant for processing if its ID satisfies the comparison function, given the checkpoint value. +2. `IncrementalDataSet` _does not_ raise a `DataSetError` if load finds no partitions to return - an empty dictionary is returned instead. An empty list of available partitions is part of a normal workflow for `IncrementalDataSet`. + +#### Incremental dataset save + +The `IncrementalDataSet` save operation is identical to the [save operation of the `PartitionedDataSet`](#partitioned-dataset-save). + +#### Incremental dataset confirm + +```{note} +The checkpoint value *is not* automatically updated when a new set of partitions is successfully loaded or saved. 
+``` + +Partitioned dataset checkpoint update is triggered by an explicit `confirms` instruction in one of the nodes downstream. It can be the same node, which processes the partitioned dataset: + +```python +from kedro.pipeline import node + +# process and then confirm `IncrementalDataSet` within the same node +node( + process_partitions, + inputs="my_partitioned_dataset", + outputs="my_processed_dataset", + confirms="my_partitioned_dataset", +) +``` + +Alternatively, confirmation can be deferred to one of the nodes downstream, allowing you to implement extra validations before the loaded partitions are considered successfully processed: + +```python +from kedro.pipeline import node, pipeline + +pipeline( + [ + node( + func=process_partitions, + inputs="my_partitioned_dataset", + outputs="my_processed_dataset", + ), + # do something else + node( + func=confirm_partitions, + # note that the node may not require 'my_partitioned_dataset' as an input + inputs="my_processed_dataset", + outputs=None, + confirms="my_partitioned_dataset", + ), + # ... + node( + func=do_something_else_with_partitions, + # will return the same partitions even though they were already confirmed + inputs=["my_partitioned_dataset", "my_processed_dataset"], + outputs=None, + ), + ] +) +``` + +Important notes about the confirmation operation: + +* Confirming a partitioned dataset does not affect any subsequent loads within the same run. All downstream nodes that input the same partitioned dataset as input will all receive the _same_ partitions. Partitions that are created externally during the run will also not affect the dataset loads and won't appear in the list of loaded partitions until the next run or until the [`release()`](/kedro.io.IncrementalDataSet) method is called on the dataset object. +* A pipeline cannot contain more than one node confirming the same dataset. + + +#### Checkpoint configuration + +`IncrementalDataSet` does not require explicit configuration of the checkpoint unless there is a need to deviate from the defaults. To update the checkpoint configuration, add a `checkpoint` key containing the valid dataset configuration. This may be required if, say, the pipeline has read-only permissions to the location of partitions (or write operations are undesirable for any other reason). In such cases, `IncrementalDataSet` can be configured to save the checkpoint elsewhere. The `checkpoint` key also supports partial config updates where only some checkpoint attributes are overwritten, while the defaults are kept for the rest: + +```yaml +my_partitioned_dataset: + type: IncrementalDataSet + path: s3://my-bucket-name/path/to/folder + dataset: pandas.CSVDataSet + checkpoint: + # update the filepath and load_args, but keep the dataset type unchanged + filepath: gcs://other-bucket/CHECKPOINT + load_args: + k1: v1 +``` + +#### Special checkpoint config keys + +Along with the standard dataset attributes, `checkpoint` config also accepts two special optional keys: +* `comparison_func` (defaults to `operator.gt`) - a fully qualified import path to the function that will be used to compare a partition ID with the checkpoint value, to determine whether a partition should be processed. Such functions must accept two positional string arguments - partition ID and checkpoint value - and return `True` if such partition is considered to be past the checkpoint. 
It might be useful to specify your own `comparison_func` if you need to customise the checkpoint filtration mechanism - for example, you might want to implement windowed loading, where you always want to load the partitions representing the last calendar month. See the example config specifying a custom comparison function: + +```yaml +my_partitioned_dataset: + type: IncrementalDataSet + path: s3://my-bucket-name/path/to/folder + dataset: pandas.CSVDataSet + checkpoint: + comparison_func: my_module.path.to.custom_comparison_function # the path must be importable +``` + +* `force_checkpoint` - if set, the partitioned dataset will use this value as the checkpoint instead of loading the corresponding checkpoint file. This might be useful if you need to roll back the processing steps and reprocess some (or all) of the available partitions. See the example config forcing the checkpoint value: + +```yaml +my_partitioned_dataset: + type: IncrementalDataSet + path: s3://my-bucket-name/path/to/folder + dataset: pandas.CSVDataSet + checkpoint: + force_checkpoint: 2020-01-01/data.csv +``` + +```{note} +Specification of `force_checkpoint` is also supported via the shorthand notation, as follows: +``` + +```yaml +my_partitioned_dataset: + type: IncrementalDataSet + path: s3://my-bucket-name/path/to/folder + dataset: pandas.CSVDataSet + checkpoint: 2020-01-01/data.csv +``` + +```{note} +If you need to force the partitioned dataset to load all available partitions, set `checkpoint` to an empty string: +``` + +```yaml +my_partitioned_dataset: + type: IncrementalDataSet + path: s3://my-bucket-name/path/to/folder + dataset: pandas.CSVDataSet + checkpoint: "" +``` diff --git a/docs/source/10_deployment/11_airflow_astronomer.md b/docs/source/deployment/airflow_astronomer.md similarity index 53% rename from docs/source/10_deployment/11_airflow_astronomer.md rename to docs/source/deployment/airflow_astronomer.md index 740cfc9d7d..307e9ab903 100644 --- a/docs/source/10_deployment/11_airflow_astronomer.md +++ b/docs/source/deployment/airflow_astronomer.md @@ -1,29 +1,42 @@ -# How to deploy your Kedro pipeline on Apache Airflow with Astronomer +# Apache Airflow +Apache Airflow is a popular open-source workflow management platform. It is a suitable engine to orchestrate and execute a pipeline authored with Kedro because workflows in Airflow are modelled and organised as [DAGs](https://en.wikipedia.org/wiki/Directed_acyclic_graph). + +## How to run a Kedro pipeline on Apache Airflow using a Kubernetes cluster + +The `kedro-airflow-k8s` plugin from GetInData | Part of Xebia enables you to run a Kedro pipeline on Airflow with a Kubernetes cluster. The plugin can be used together with `kedro-docker` to prepare a docker image for pipeline execution. At present, the plugin is available for versions of Kedro < 0.18 only. + +Consult the [GitHub repository for `kedro-airflow-k8s`](https://github.com/getindata/kedro-airflow-k8s) for further details, or take a look at the [documentation](https://kedro-airflow-k8s.readthedocs.io/). -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` -This tutorial explains how to deploy a Kedro project on [Apache Airflow](https://airflow.apache.org/) with [Astronomer](https://www.astronomer.io/). Apache Airflow is an extremely popular open-source workflow management platform. 
Workflows in Airflow are modelled and organised as [DAGs](https://en.wikipedia.org/wiki/Directed_acyclic_graph), making it a suitable engine to orchestrate and execute a pipeline authored with Kedro. [Astronomer](https://www.astronomer.io/docs/cloud/stable/develop/cli-quickstart) is a managed Airflow platform which allows users to spin up and run an Airflow cluster easily in production. Additionally, it also provides a set of tools to help users get started with Airflow locally in the easiest way possible. +## How to run a Kedro pipeline on Apache Airflow with Astronomer -The following discusses how to run the [example Iris classification pipeline](../02_get_started/05_example_project) on a local Airflow cluster with Astronomer. +The following tutorial uses a different approach and shows how to deploy a Kedro project on [Apache Airflow](https://airflow.apache.org/) with [Astronomer](https://www.astronomer.io/). -## Strategy +[Astronomer](https://docs.astronomer.io/astro/install-cli) is a managed Airflow platform which allows users to spin up and run an Airflow cluster easily in production. Additionally, it also provides a set of tools to help users get started with Airflow locally in the easiest way possible. + +The tutorial discusses how to run the [example Iris classification pipeline](../get_started/new_project.md#create-a-new-project-containing-example-code) on a local Airflow cluster with Astronomer. You may also consider using our [`astro-airflow-iris` starter](https://github.com/kedro-org/kedro-starters/tree/main/astro-airflow-iris) which provides a template containing the boilerplate code that the tutorial describes: + +```shell +kedro new --starter=astro-airflow-iris +``` -The general strategy to deploy a Kedro pipeline on Apache Airflow is to run every Kedro node as an [Airflow task](https://airflow.apache.org/docs/apache-airflow/stable/concepts/tasks.html) while the whole pipeline is converted into a [DAG](https://airflow.apache.org/docs/apache-airflow/stable/concepts/dags.html) for orchestration purpose. This approach mirrors the principles of running Kedro in a [distributed](03_distributed) environment. -## Prerequisites +### Strategy -To follow along with this tutorial, make sure you have the following: +The general strategy to deploy a Kedro pipeline on Apache Airflow is to run every Kedro node as an [Airflow task](https://airflow.apache.org/docs/apache-airflow/stable/concepts/tasks.html) while the whole pipeline is converted into a [DAG](https://airflow.apache.org/docs/apache-airflow/stable/concepts/dags.html) for orchestration purpose. This approach mirrors the principles of [running Kedro in a distributed environment](distributed.md). -* An Airflow cluster: you can follow [Astronomer's quickstart guide](https://www.astronomer.io/docs/cloud/stable/get-started/quickstart) to set one up. -* The [Astro CLI](https://www.astronomer.io/docs/cloud/stable/get-started/quickstart#step-4-install-the-astronomer-cli) installed +### Prerequisites + +To follow this tutorial, ensure you have the following: + +* An Airflow cluster: you can follow [Astronomer's quickstart guide](https://docs.astronomer.io/astro/create-deployment) to set one up. +* The [Astro CLI installed](https://docs.astronomer.io/astro/install-cli) * `kedro>=0.17` installed -## Project Setup +### Tutorial project setup -1. [Initialise an Airflow project with Astro](https://www.astronomer.io/docs/cloud/stable/get-started/quickstart#step-5-initialize-an-airflow-project). Let's call it `kedro-airflow-iris` +1. 
[Initialise an Airflow project with Astro](https://docs.astronomer.io/astro/create-project). Let's call it `kedro-airflow-iris` ```shell mkdir kedro-airflow-iris @@ -56,7 +69,6 @@ To follow along with this tutorial, make sure you have the following: ├── data ├── docs ├── include - ├── logs ├── notebooks ├── packages.txt ├── plugins @@ -66,17 +78,17 @@ To follow along with this tutorial, make sure you have the following: └── src ``` -4. Install [`kedro-airflow~=0.4`](https://github.com/quantumblacklabs/kedro-airflow). We will use this plugin to convert the Kedro pipeline into an Airflow DAG. +4. Install [`kedro-airflow~=0.4`](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow). We will use this plugin to convert the Kedro pipeline into an Airflow DAG. ```shell pip install kedro-airflow~=0.4 ``` -5. Run `kedro install` to install all dependencies. +5. Run `pip install -r src/requirements.txt` to install all dependencies. -## Deployment process +### Deployment process -### Step 1. Create new configuration environment to prepare a compatible `DataCatalog` +#### Step 1. Create new configuration environment to prepare a compatible `DataCatalog` * Create a `conf/airflow` directory in your Kedro project * Create a `catalog.yml` file in this directory with the following content @@ -107,7 +119,7 @@ example_predictions: This ensures that all datasets are persisted so all Airflow tasks can read them without the need to share memory. In the example here we assume that all Airflow tasks share one disk, but for distributed environment you would need to use non-local filepaths. -### Step 2. Package the Kedro pipeline as an Astronomer-compliant Docker image +#### Step 2. Package the Kedro pipeline as an Astronomer-compliant Docker image * **Step 2.1**: Package the Kedro pipeline as a Python package so you can install it into the container later on: @@ -131,13 +143,13 @@ FROM quay.io/astronomer/ap-airflow:2.0.0-buster-onbuild RUN pip install --user dist/new_kedro_project-0.1-py3-none-any.whl ``` -### Step 3. Convert the Kedro pipeline into an Airflow DAG with `kedro airflow` +#### Step 3. Convert the Kedro pipeline into an Airflow DAG with `kedro airflow` ```shell kedro airflow create --target-dir=dags/ --env=airflow ``` -### Step 4. Launch the local Airflow cluster with Astronomer +#### Step 4. Launch the local Airflow cluster with Astronomer ```shell astro dev start @@ -148,11 +160,3 @@ If you visit the Airflow UI, you should now see the Kedro pipeline as an Airflow ![](../meta/images/kedro_airflow_dag.png) ![](../meta/images/kedro_airflow_dag_run.png) - -## Final thought - -This tutorial walks you through the manual process of deploying an existing Kedro project on Apache Airflow with Astronomer. However, if you are starting out, consider using our `astro-iris` starter which provides all the aforementioned boilerplate out of the box: - -```shell -kedro new --starter=astro-iris -``` diff --git a/docs/source/deployment/amazon_sagemaker.md b/docs/source/deployment/amazon_sagemaker.md new file mode 100644 index 0000000000..c6134244d7 --- /dev/null +++ b/docs/source/deployment/amazon_sagemaker.md @@ -0,0 +1,9 @@ +# Amazon SageMaker + +Amazon SageMaker provides the components used for machine learning in a single toolset that supports both classical machine learning libraries like [`scikit-learn`](https://scikit-learn.org/) or [`XGBoost`](https://xgboost.readthedocs.io/), and Deep Learning frameworks such as [`TensorFlow`](https://www.tensorflow.org/) or [`PyTorch`](https://pytorch.org/). 
+ +Amazon SageMaker is a fully-managed service and its features are covered by the [official service documentation](https://docs.aws.amazon.com/sagemaker/index.html). + +## The `kedro-sagemaker` plugin + +The `kedro-sagemaker` plugin from GetInData | Part of Xebia enables you to run a Kedro pipeline on Amazon Sagemaker. Consult the [GitHub repository for `kedro-sagemaker`](https://github.com/getindata/kedro-sagemaker) for further details, or take a look at the [documentation](https://kedro-sagemaker.readthedocs.io/). diff --git a/docs/source/10_deployment/04_argo.md b/docs/source/deployment/argo.md similarity index 67% rename from docs/source/10_deployment/04_argo.md rename to docs/source/deployment/argo.md index e43557227a..f66b809b0e 100644 --- a/docs/source/10_deployment/04_argo.md +++ b/docs/source/deployment/argo.md @@ -1,6 +1,11 @@ -# Deployment with Argo Workflows +# Argo Workflows (outdated documentation that needs review) + +``` {important} +This page contains outdated documentation that has not been tested against recent Kedro releases. If you successfully use Argo Workflows with a recent version of Kedro, consider telling us the steps you took on [Slack](https://slack.kedro.org) or [GitHub](https://github.com/kedro-org/kedro/issues). +``` + +
This page explains how to convert your Kedro pipeline to use Argo Workflows, an open-source container-native workflow engine for orchestrating parallel jobs on Kubernetes. -This page explains how to convert your Kedro pipeline to use [Argo Workflows](https://github.com/argoproj/argo-workflows), an open source container-native workflow engine for orchestrating parallel jobs on [Kubernetes](https://kubernetes.io/). ## Why would you use Argo Workflows? @@ -14,15 +19,15 @@ Here are the main reasons to use Argo Workflows: ## Prerequisites -To use Argo Workflows, make sure you have the following prerequisites in place: +To use Argo Workflows, ensure you have the following prerequisites in place: -- Argo Workflows is [installed](https://github.com/argoproj/argo/blob/master/README.md#quickstart) on your Kubernetes cluster -- Argo CLI is [installed](https://github.com/argoproj/argo/releases) on you machine -- A `name` attribute is set for each Kedro [node](/kedro.pipeline.node) since it is used to build a DAG -- All node input/output DataSets must be configured in `catalog.yml` and refer to an external location (e.g. [AWS S3](../05_data/01_data_catalog.md#using-the-data-catalog-with-the-yaml-api)); you cannot use the `MemoryDataSet` in your workflow +- [Argo Workflows is installed](https://github.com/argoproj/argo/blob/master/README.md#quickstart) on your Kubernetes cluster +- [Argo CLI is installed](https://github.com/argoproj/argo/releases) on your machine +- A `name` attribute is set for each [Kedro node](/kedro.pipeline.node) since it is used to build a DAG +- [All node input/output DataSets must be configured in `catalog.yml`](../data/data_catalog.md#use-the-data-catalog-with-the-yaml-api) and refer to an external location (e.g. AWS S3); you cannot use the `MemoryDataSet` in your workflow -```eval_rst -.. note:: Each node will run in its own container. +```{note} +Each node will run in its own container. ``` ## How to run your Kedro pipeline using Argo Workflows @@ -31,9 +36,9 @@ To use Argo Workflows, make sure you have the following prerequisites in place: First, you need to containerise your Kedro project, using any preferred container solution (e.g. [`Docker`](https://www.docker.com/)), to build an image to use in Argo Workflows. -For the purpose of this walk-through, we are going to assume a `Docker` workflow. We recommend the [`Kedro-Docker`](https://github.com/quantumblacklabs/kedro-docker) plugin to streamline the process. [Instructions for Kedro-Docker are in the plugin's README.md](https://github.com/quantumblacklabs/kedro-docker/blob/master/README.md). +For the purpose of this walk-through, we are going to assume a `Docker` workflow. We recommend the [`Kedro-Docker`](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) plugin to streamline the process. [Instructions for Kedro-Docker are in the plugin's README.md](https://github.com/kedro-org/kedro-plugins/blob/main/README.md). -After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./02_single_machine.md#how-to-use-container-registry). +After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./single_machine.md#how-to-use-container-registry). 
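+
+Before generating the spec, it is worth confirming that every node in your pipeline declares an explicit `name`, since the spec generation below builds the Argo DAG from node names. The following is a minimal sketch only; the function, dataset and node names are illustrative placeholders rather than part of this walk-through's project:
+
+```python
+import pandas as pd
+
+from kedro.pipeline import node, pipeline
+
+
+def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame:
+    # placeholder transformation - replace with your project's logic
+    return companies.dropna()
+
+
+example_pipeline = pipeline(
+    [
+        node(
+            func=preprocess_companies,
+            inputs="companies",
+            outputs="preprocessed_companies",
+            name="preprocess_companies_node",  # used to build the Argo DAG
+        ),
+    ]
+)
+```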
### Create Argo Workflows spec @@ -65,16 +70,16 @@ def generate_argo_config(image, pipeline_name, env): project_path = Path.cwd() metadata = bootstrap_project(project_path) - project_name = metadata.project_name + package_name = metadata.package_name pipeline_name = pipeline_name or "__default__" pipeline = pipelines.get(pipeline_name) tasks = get_dependencies(pipeline.node_dependencies) - output = template.render(image=image, project_name=project_name, tasks=tasks) + output = template.render(image=image, package_name=package_name, tasks=tasks) - (SEARCH_PATH / f"argo-{project_name}.yml").write_text(output) + (SEARCH_PATH / f"argo-{package_name}.yml").write_text(output) def get_dependencies(dependencies): @@ -113,7 +118,7 @@ Add the following Argo Workflows spec template to `/templates/argo apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: - generateName: {{ project_name }}- + generateName: {{ package_name }}- spec: entrypoint: dag templates: @@ -165,19 +170,19 @@ spec: ``` -```eval_rst -.. note:: The Argo Workflows is defined as the dependencies between tasks using a directed-acyclic graph (DAG). +```{note} +The Argo Workflows is defined as the dependencies between tasks using a directed-acyclic graph (DAG). ``` -For the purpose of this walk-through, we are going to use AWS S3 bucket for DataSets therefore `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables must be set to have an ability to communicate with S3. The `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` values should be stored in [Kubernetes Secrets](https://kubernetes.io/docs/concepts/configuration/secret/) (an example Kubernetes Secrets spec is given [below](#submit-argo-workflows-spec-to-kubernetes)). +For the purpose of this walk-through, we will use an AWS S3 bucket for DataSets; therefore `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables must be set to have an ability to communicate with S3. The `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` values should be stored in [Kubernetes Secrets](https://kubernetes.io/docs/concepts/configuration/secret/) (an example [Kubernetes Secrets spec is given below](#submit-argo-workflows-spec-to-kubernetes)). -The spec template is written with using [Jinja templating language](https://jinja.palletsprojects.com/en/2.11.x/) so you need to install the Jinja Python package: +The spec template is written with the [Jinja templating language](https://jinja.palletsprojects.com/en/2.11.x/), so you must install the Jinja Python package: ```console $ pip install Jinja2 ``` -Finally, run the helper script from project's directory to build the Argo Workflows spec (the spec will be saved to `/templates/argo-.yml` file). +Finally, run the helper script from project's directory to build the Argo Workflows spec (the spec will be saved to `/templates/argo-.yml` file). ```console $ cd @@ -219,11 +224,11 @@ Now, you are ready to submit the Argo Workflows spec as follows: ```console $ cd -$ argo submit --watch templates/argo-.yml +$ argo submit --watch templates/argo-.yml ``` -```eval_rst -.. note:: The Argo Workflows should be submitted to the same namespace as the Kubernetes Secrets. Please refer to the Argo CLI help to get more details about the usage. +```{note} +The Argo Workflows should be submitted to the same namespace as the Kubernetes Secrets. Please refer to the Argo CLI help to get more details about the usage. 
``` In order to clean up your Kubernetes cluster you can use the following commands: @@ -237,6 +242,7 @@ $ kubectl delete -f secret.yml As an alternative, you can use [Kedro-Argo plugin](https://pypi.org/project/kedro-argo/) to convert a Kedro project to Argo Workflows. -```eval_rst -.. warning:: The plugin is not supported by the Kedro team and we can't guarantee its workability. +```{warning} +The plugin is not supported by the Kedro team and we can't guarantee its workability. ``` +
diff --git a/docs/source/10_deployment/07_aws_batch.md b/docs/source/deployment/aws_batch.md similarity index 75% rename from docs/source/10_deployment/07_aws_batch.md rename to docs/source/deployment/aws_batch.md index ff59c2ee93..976d5e9e5a 100644 --- a/docs/source/10_deployment/07_aws_batch.md +++ b/docs/source/deployment/aws_batch.md @@ -1,19 +1,24 @@ -# Deployment with AWS Batch +# AWS Batch (outdated documentation that needs review) + +``` {important} +This page contains outdated documentation that has not been tested against recent Kedro releases. If you successfully use AWS Batch with a recent version of Kedro, consider telling us the steps you took on [Slack](https://slack.kedro.org) or [GitHub](https://github.com/kedro-org/kedro/issues). +``` +
## Why would you use AWS Batch? [AWS Batch](https://aws.amazon.com/batch/) is optimised for batch computing and applications that scale with the number of jobs running in parallel. It manages job execution and compute resources, and dynamically provisions the optimal quantity and type. AWS Batch can assist with planning, scheduling, and executing your batch computing workloads, using [Amazon EC2](https://aws.amazon.com/ec2/) On-Demand and [Spot Instances](https://aws.amazon.com/ec2/spot/), and it has native integration with [CloudWatch](https://aws.amazon.com/cloudwatch/) for log collection. -AWS Batch helps you run massively parallel Kedro pipelines in a cost-effective way, and allows you to parallelise the pipeline execution across a number of compute instances. Each Batch job is run in an isolated Docker container environment. +AWS Batch helps you run massively parallel Kedro pipelines in a cost-effective way, and allows you to parallelise the pipeline execution across multiple compute instances. Each Batch job is run in an isolated Docker container environment. -The following sections are a guide on how to deploy a Kedro project to AWS Batch, and uses the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) as primary example. The guide assumes that you have already completed the tutorial, and that the project was created with the project name **Kedro Tutorial**. +The following sections are a guide on how to deploy a Kedro project to AWS Batch, and uses the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) as primary example. The guide assumes that you have already completed the tutorial, and that the project was created with the project name **Kedro Tutorial**. ## Prerequisites -To use AWS Batch, make sure you have the following prerequisites in place: +To use AWS Batch, ensure you have the following prerequisites in place: - An [AWS account set up](https://aws.amazon.com/premiumsupport/knowledge-center/create-and-activate-aws-account/). -- A `name` attribute is set for each Kedro [node](/kedro.pipeline.node). Each node will run in its own Batch job, so having sensible node names will make it easier to `kedro run --node `. -- All node input/output `DataSets` must be configured in `catalog.yml` and refer to an external location (e.g. [AWS S3](../05_data/01_data_catalog.md#using-the-data-catalog-with-the-yaml-api)). A clean way to do this is to create a new configuration environment `conf/aws_batch` containing a `catalog.yml` file with the appropriate configuration, as illustrated below. +- A `name` attribute is set for each [Kedro node](/kedro.pipeline.node). Each node will run in its own Batch job, so having sensible node names will make it easier to `kedro run --node=`. +- [All node input/output `DataSets` must be configured in `catalog.yml`](../data/data_catalog.md#use-the-data-catalog-with-the-yaml-api) and refer to an external location (e.g. AWS S3). A clean way to do this is to create a new configuration environment `conf/aws_batch` containing a `catalog.yml` file with the appropriate configuration, as illustrated below.
Click to expand @@ -71,11 +76,11 @@ y_test: ### Containerise your Kedro project -First, you need to containerise your Kedro project, using any preferred container solution (e.g. [`Docker`](https://www.docker.com/)), to build an image to use in AWS Batch. +First, you need to containerise your Kedro project, using any preferred container solution (e.g. [Docker](https://www.docker.com/)), to build an image to use in AWS Batch. -For the purpose of this walk-through, we are going to assume a `Docker` workflow. We recommend using the [`Kedro-Docker`](https://github.com/quantumblacklabs/kedro-docker) plugin to streamline the process. [Instructions for using this are in the plugin's README.md](https://github.com/quantumblacklabs/kedro-docker/blob/master/README.md). +For the purpose of this walk-through, we are going to assume a Docker workflow. We recommend using the [Kedro-Docker plugin](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) to streamline the process. [Instructions for using this are in the plugin's README.md](https://github.com/kedro-org/kedro-plugins/blob/main/README.md). -After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./02_single_machine.md#how-to-use-container-registry), for instance [AWS ECR](https://aws.amazon.com/ecr/). You can find instructions on how to push your Docker image to ECR [in Amazon's ECR documentation](https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html). +After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./single_machine.md#how-to-use-container-registry), for instance [AWS ECR](https://aws.amazon.com/ecr/). You can find instructions on how to push your Docker image to ECR [in Amazon's ECR documentation](https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html). Alternatively, once you've created a container repository, click the `View Push Commands` button in the top-right corner of the [ECR dashboard](https://console.aws.amazon.com/ecr). @@ -95,8 +100,8 @@ Job definitions provide the template for resources needed for running a job. Cre Next you need a compute environment where the work will be executed. Create a _managed_, on-demand one named `spaceflights_env` and let it choose to create new service and instance roles if you don't have any yet. Having a managed environment means that AWS will automatically handle the scaling of your instances. -```eval_rst -.. note:: This compute environment won't contain any instances until you trigger the pipeline run. Therefore, creating it does not incur any immediate costs. +```{note} +This compute environment won't contain any instances until you trigger the pipeline run. Therefore, creating it does not incur any immediate costs. ``` #### Create AWS Batch job queue @@ -105,10 +110,10 @@ A job queue is the bridge between the submitted jobs and the compute environment ### Configure the credentials -Ensure you have the necessary AWS credentials in place before moving on, so that your pipeline can access and interact with the AWS services. Check out [the AWS CLI documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html#cli-configure-quickstart-config) for instructions on how to set this up. +Ensure you have the necessary AWS credentials in place before moving on, so that your pipeline can access and interact with the AWS services. 
Check out [the AWS CLI documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) for instructions on how to set this up. -```eval_rst -.. note:: You should configure the ``default region`` to match the region where you've created the Batch resources. +```{note} +You should configure the default region to match the region where you've created the Batch resources. ``` @@ -118,12 +123,14 @@ Now that all the resources are in place, it's time to submit jobs to Batch progr #### Create a custom runner -Create a new Python package `runner` in your `src` folder, i.e. `kedro_tutorial/src/kedro_tutorial/runner/`. Make sure there is an `__init__.py` file at this location and add another file named `batch_runner.py`, which will contain the implementation of your custom runner, `AWSBatchRunner`. The `AWSBatchRunner` will submit and monitor jobs asynchronously, surfacing any errors that occur on Batch. +Create a new Python package `runner` in your `src` folder, i.e. `kedro_tutorial/src/kedro_tutorial/runner/`. Make sure there is an `__init__.py` file at this location, and add another file named `batch_runner.py`, which will contain the implementation of your custom runner, `AWSBatchRunner`. The `AWSBatchRunner` will submit and monitor jobs asynchronously, surfacing any errors that occur on Batch. -Make sure the `__init__.py` file in the `runner` folder includes the following import: +Make sure the `__init__.py` file in the `runner` folder includes the following import and declaration: ```python -from .batch_runner import AWSBatchRunner # NOQA +from .batch_runner import AWSBatchRunner + +__all__ = ["AWSBatchRunner"] ``` Copy the contents of the script below into `batch_runner.py`: @@ -163,7 +170,11 @@ class AWSBatchRunner(ThreadRunner): return super()._get_required_workers_count(pipeline) def _run( # pylint: disable=too-many-locals,useless-suppression - self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None + self, + pipeline: Pipeline, + catalog: DataCatalog, + hook_manager: PluginManager, + session_id: str = None, ) -> None: nodes = pipeline.nodes node_dependencies = pipeline.node_dependencies @@ -204,7 +215,7 @@ class AWSBatchRunner(ThreadRunner): node, node_to_job, node_dependencies[node], - run_id, + session_id, ) futures.add(future) @@ -218,7 +229,7 @@ Next you will want to add the implementation of the `_submit_job()` method refer * Correctly specified upstream dependencies * A unique job name -* The corresponding command to run, namely `kedro run --node `. +* The corresponding command to run, namely `kedro run --node=`. Once submitted, the method tracks progress and surfaces any errors if the jobs end in `FAILED` state. @@ -230,11 +241,11 @@ def _submit_job( node: Node, node_to_job: Dict[Node, str], node_dependencies: Set[Node], - run_id: str, + session_id: str, ) -> Node: self._logger.info("Submitting the job for node: %s", str(node)) - job_name = f"kedro_{run_id}_{node.name}".replace(".", "-") + job_name = f"kedro_{session_id}_{node.name}".replace(".", "-") depends_on = [{"jobId": node_to_job[dep]} for dep in node_dependencies] command = ["kedro", "run", "--node", node.name] @@ -286,35 +297,29 @@ def _track_batch_job(job_id: str, client: Any) -> None: #### Set up Batch-related configuration -You'll need to set the Batch-related configuration that the runner will use. 
Add a `parameters.yml` file inside the `conf/aws_batch/` directory created as part of the prerequistes steps, which will include the following keys: +You'll need to set the Batch-related configuration that the runner will use. Add a `parameters.yml` file inside the `conf/aws_batch/` directory created as part of the prerequistes with the following keys: ```yaml aws_batch: - job_queue: "spaceflights_queue" - job_definition: "kedro_run" - max_workers: 2 + job_queue: "spaceflights_queue" + job_definition: "kedro_run" + max_workers: 2 ``` #### Update CLI implementation -You're nearly there! Before you can use the new runner, you need to add a `cli.py` file at the same level as `settings.py`, using [the template we provide](../07_extend_kedro/01_common_use_cases.md#use-case-3-how-to-add-or-modify-cli-commands). Add the following `run()` function to your `cli.py` file to make sure the runner class is instantiated correctly: +You're nearly there! Before you can use the new runner, you need to add a `cli.py` file at the same level as `settings.py`, using [the template we provide](../development/commands_reference.md#customise-or-override-project-specific-kedro-commands). Update the `run()` function in the newly-created `cli.py` file to make sure the runner class is instantiated correctly: ```python -def run(tag, env, parallel, ...): +def run(tag, env, ...): """Run the pipeline.""" - if parallel and runner: - raise KedroCliError( - "Both --parallel and --runner options cannot be used together. " - "Please use either --parallel or --runner." - ) runner = runner or "SequentialRunner" - if parallel: - runner = "ParallelRunner" tag = _get_values_as_tuple(tag) if tag else tag node_names = _get_values_as_tuple(node_names) if node_names else node_names with KedroSession.create(env=env, extra_params=params) as session: + context = session.load_context() runner_instance = _instantiate_runner(runner, is_async, context) session.run( tags=tag, @@ -323,6 +328,7 @@ def run(tag, env, parallel, ...): from_nodes=from_nodes, to_nodes=to_nodes, from_inputs=from_inputs, + to_outputs=to_outputs, load_versions=load_version, pipeline_name=pipeline, ) @@ -347,9 +353,10 @@ def _instantiate_runner(runner, is_async, project_context): You're now ready to trigger the run. Execute the following command: ```bash -kedro run --env aws_batch --runner kedro_tutorial.runner.AWSBatchRunner +kedro run --env=aws_batch --runner=kedro_tutorial.runner.AWSBatchRunner ``` You should start seeing jobs appearing on your Jobs dashboard, under the `Runnable` tab - meaning they're ready to start as soon as the resources are provisioned in the compute environment. -AWS Batch has native integration with CloudWatch, where you can check the logs for a particular job. You can either click on the Batch job in the [Jobs](https://console.aws.amazon.com/batch/home/jobs) tab and click `View logs` in the pop-up panel, or go to [CloudWatch dashboard](https://console.aws.amazon.com/cloudwatch), click `Log groups` in the side bar and find `/aws/batch/job`. +AWS Batch has native integration with CloudWatch, where you can check the logs for a particular job. You can either click on [the Batch job in the Jobs tab](https://console.aws.amazon.com/batch/home/jobs) and click `View logs` in the pop-up panel, or go to [CloudWatch dashboard](https://console.aws.amazon.com/cloudwatch), click `Log groups` in the side bar and find `/aws/batch/job`. +
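+
+For reference, the `_instantiate_runner()` helper called in the `run()` command above can be sketched roughly as follows. Treat this as an illustrative sketch rather than the exact implementation: it assumes the Batch settings live under the `aws_batch` key of `parameters.yml`, as configured earlier, and that built-in runners are resolved from `kedro.runner`:
+
+```python
+from kedro.utils import load_obj
+
+
+def _instantiate_runner(runner, is_async, project_context):
+    # resolve either a built-in runner (e.g. "SequentialRunner") or a custom
+    # runner given by its full import path (e.g. "kedro_tutorial.runner.AWSBatchRunner")
+    runner_class = load_obj(runner, "kedro.runner")
+    runner_kwargs = dict(is_async=is_async)
+
+    if runner.endswith("AWSBatchRunner"):
+        # forward the job queue, job definition and max_workers settings
+        # from conf/aws_batch/parameters.yml to the custom runner
+        batch_kwargs = project_context.params.get("aws_batch") or {}
+        runner_kwargs.update(batch_kwargs)
+
+    return runner_class(**runner_kwargs)
+```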
diff --git a/docs/source/10_deployment/10_aws_step_functions.md b/docs/source/deployment/aws_step_functions.md similarity index 79% rename from docs/source/10_deployment/10_aws_step_functions.md rename to docs/source/deployment/aws_step_functions.md index dfa9e20e7f..380f303067 100644 --- a/docs/source/10_deployment/10_aws_step_functions.md +++ b/docs/source/deployment/aws_step_functions.md @@ -1,33 +1,31 @@ -# How to deploy your Kedro pipeline with AWS Step Functions +# AWS Step Functions +This tutorial explains how to deploy a Kedro project with [AWS Step Functions](https://aws.amazon.com/step-functions/?step-functions.sort-by=item.additionalFields.postDateTime&step-functions.sort-order=desc) in order to run a Kedro pipeline in production on AWS [Serverless Computing](https://aws.amazon.com/serverless/) platform. -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` +## Why would you run a Kedro pipeline with AWS Step Functions? -This tutorial explains how to deploy a Kedro project with [AWS Step Functions](https://aws.amazon.com/step-functions/?step-functions.sort-by=item.additionalFields.postDateTime&step-functions.sort-order=desc) in order to run a Kedro pipeline in production on AWS [Serverless Computing](https://aws.amazon.com/serverless/) platform. +A major problem when data pipelines move to production is to build and maintain the underlying compute infrastructure, or [servers](https://en.wikipedia.org/wiki/Server_(computing)). [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) hands the provisioning and management of distributed computing resources to cloud providers, enabling data engineers and data scientists to focus on their business problems. -## Why would you run a Kedro pipeline with AWS Step Functions +[Azure Functions](https://docs.microsoft.com/en-us/azure/azure-functions/) and [AWS Lambda](https://aws.amazon.com/lambda/) are good examples of this solution, but others are available. Services like [AWS Step Functions](https://aws.amazon.com/step-functions/) offer a managed orchestration capability that makes it easy to sequence serverless functions and multiple cloud-native services into business-critical applications. -A major problem when data pipelines move to production is to build and maintain the underlying compute infrastructure, or [servers](https://en.wikipedia.org/wiki/Server_(computing)). However, [serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) can address some aspects of this problem, whereby cloud providers allocate machine resources on demand, allowing data engineers and data scientists to focus on their business problems. [Azure Functions](https://docs.microsoft.com/en-us/azure/azure-functions/) and [AWS Lambda](https://aws.amazon.com/lambda/) are good examples of this solution, but others are available. +From a Kedro perspective, this means the ability to run each node and retain the pipeline's correctness and reliability through a managed orchestrator without the concerns of managing underlying infrastructure. Another benefit of running a Kedro pipeline in a serverless computing platform is the ability to take advantage of other services from the same provider, such as the use of the [feature store for Amazon SageMaker](https://aws.amazon.com/sagemaker/feature-store/) to store features data. 
-In addition to on-demand compute, services like [AWS Step Functions](https://aws.amazon.com/step-functions/) offer a managed orchestration capability that makes it easy to sequence serverless functions and multiple cloud-native services into business-critical applications. From a Kedro perspective, this means the ability to run each node and retain the pipeline's correctness and reliability through a managed orchestrator without the concerns of managing underlying infrastructure. +The following discusses how to run the Kedro pipeline from the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) on [AWS Step Functions](https://aws.amazon.com/step-functions/). -The following discusses how to run the Kedro pipeline from the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) on [AWS Step Functions](https://aws.amazon.com/step-functions/). ## Strategy -The general strategy to deploy a Kedro pipeline on AWS Step Functions is to run every Kedro node as an [AWS Lambda](https://aws.amazon.com/lambda/) function. The whole pipeline is converted into an [AWS Step Functions State Machine](https://docs.aws.amazon.com/step-functions/latest/dg/tutorial-creating-lambda-state-machine.html) for orchestration purpose. This approach mirrors the principles of running Kedro in a [distributed](03_distributed) environment. +The general strategy to deploy a Kedro pipeline on AWS Step Functions is to run every Kedro node as an [AWS Lambda](https://aws.amazon.com/lambda/) function. The whole pipeline is converted into an [AWS Step Functions State Machine](https://docs.aws.amazon.com/step-functions/latest/dg/tutorial-creating-lambda-state-machine.html) for orchestration. This approach mirrors the principles of [running Kedro in a distributed environment](distributed). ## Prerequisites -To use AWS Step Functions, make sure you have the following: +To use AWS Step Functions, ensure you have the following: - An [AWS account set up](https://aws.amazon.com/premiumsupport/knowledge-center/create-and-activate-aws-account/) - [Configured AWS credentials](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html) on your local machine -- Generated Kedro project called **Spaceflights Step Functions** using [Kedro Spaceflights starter](https://github.com/quantumblacklabs/kedro-starters/tree/master/spaceflights/). +- Generated Kedro project called **Spaceflights Step Functions** using [Kedro Spaceflights starter](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights/). - The final project directory's name should be `spaceflights-step-functions`. - - You should complete the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) to understand the project's structure. + - You should complete the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) to understand the project's structure. * In this tutorial, we will also be using [AWS Cloud Development Kit (CDK)](https://aws.amazon.com/cdk/) to write our deployment script. To install the `cdk` command, please consult [AWS guide](https://docs.aws.amazon.com/cdk/latest/guide/cli.html). 
The official method of installation is using [npm](https://www.npmjs.com/): @@ -44,7 +42,7 @@ The deployment process for a Kedro pipeline on AWS Step Functions consists of th * Develop the Kedro pipeline locally as normal * Create a new configuration environment in which we ensure all nodes' inputs and outputs have a persistent location on S3, since `MemoryDataSet` can't be shared between AWS Lambda functions * Package the Kedro pipeline as an [AWS Lambda-compliant Docker image](https://docs.aws.amazon.com/lambda/latest/dg/lambda-images.html) -* Write a script to convert and deploy each Kedro node as an AWS Lambda function. Each function will use the same pipeline Docker image created in the previous step and run a single Kedro node associated with it. This follows the principles laid out in our [distributed deployment guide](03_distributed). +* Write a script to convert and deploy each Kedro node as an AWS Lambda function. Each function will use the same pipeline Docker image created in the previous step and run a single Kedro node associated with it. This follows the principles laid out in our [distributed deployment guide](distributed). * The script above will also convert and deploy the entire Kedro pipeline as an AWS Step Functions State Machine. The final deployed AWS Step Functions State Machine will have the following visualisation in AWS Management Console: @@ -113,10 +111,10 @@ y_test: ### Step 2. Package the Kedro pipeline as an AWS Lambda-compliant Docker image -In December 2020, AWS [announced](https://aws.amazon.com/blogs/aws/new-for-aws-lambda-container-image-support/) that an AWS Lambda function can now use a container image up to **10 GB in size** as its deployment package, besides the original zip method. As it has a few [requirements](https://docs.aws.amazon.com/lambda/latest/dg/images-create.html#images-reqs) for the container image to work properly, you will need to build your own custom Docker container image to both contain the Kedro pipeline and to comply with Lambda's requirements. +In December 2020, [AWS announced that an AWS Lambda function can now use a container image up to **10 GB in size**](https://aws.amazon.com/blogs/aws/new-for-aws-lambda-container-image-support/) as its deployment package, besides the original zip method. As it has a few [requirements for the container image to work properly](https://docs.aws.amazon.com/lambda/latest/dg/images-create.html#images-reqs), you must build your own custom Docker container image, both to contain the Kedro pipeline and to comply with Lambda's requirements. -```eval_rst -.. note:: All of the following steps should be done in the Kedro project's root directory. +```{note} +All the following steps should be done in the Kedro project's root directory. ``` * **Step 2.1**: Package the Kedro pipeline as a Python package so you can install it into the container later on: @@ -125,7 +123,7 @@ In December 2020, AWS [announced](https://aws.amazon.com/blogs/aws/new-for-aws-l $ kedro package ``` -For more information, please visit the guide on [packaging Kedro as a Python package](../03_tutorial/05_package_a_project). +For more information, please visit the guide on [packaging Kedro as a Python package](../tutorial/package_a_project). 
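Before moving on to the handler in the next step, it may help to see the contract each deployed function follows: the state machine passes the node name in the Lambda event payload, and the handler created in Step 2.2 reads it from `event["node_name"]`. The sketch below shows how you could invoke one such function manually with `boto3` once it is deployed; the function name is a hypothetical assumption and this is not part of the official deployment steps.

```python
# Hedged sketch: manually invoking one node's Lambda with the event shape
# that the handler in Step 2.2 expects. The function name is hypothetical.
import json

import boto3

lambda_client = boto3.client("lambda")

response = lambda_client.invoke(
    FunctionName="spaceflights-split-data-node",  # hypothetical deployed function
    InvocationType="RequestResponse",
    Payload=json.dumps({"node_name": "split_data_node"}).encode(),
)
print(response["Payload"].read().decode())
```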
* **Step 2.2**: Create a `lambda_handler.py` file: @@ -136,7 +134,7 @@ from unittest.mock import patch def handler(event, context): from kedro.framework.project import configure_project - configure_project("spaceflights_steps_function") + configure_project("spaceflights_step_functions") node_to_run = event["node_name"] # Since _multiprocessing.SemLock is not implemented on lambda yet, @@ -178,14 +176,14 @@ ARG FUNCTION_DIR ARG RUNTIME_VERSION # Create the function directory RUN mkdir -p ${FUNCTION_DIR} -RUN mkdir -p ${FUNCTION_DIR}/{conf,logs} +RUN mkdir -p ${FUNCTION_DIR}/conf # Add handler function COPY lambda_handler.py ${FUNCTION_DIR} # Add conf/ directory COPY conf ${FUNCTION_DIR}/conf # Install Kedro pipeline -COPY dist/spaceflights_steps_function-0.1-py3-none-any.whl . -RUN python${RUNTIME_VERSION} -m pip install --no-cache-dir spaceflights_steps_function-0.1-py3-none-any.whl --target ${FUNCTION_DIR} +COPY dist/spaceflights_step_functions-0.1-py3-none-any.whl . +RUN python${RUNTIME_VERSION} -m pip install --no-cache-dir spaceflights_step_functions-0.1-py3-none-any.whl --target ${FUNCTION_DIR} # Install Lambda Runtime Interface Client for Python RUN python${RUNTIME_VERSION} -m pip install --no-cache-dir awslambdaric --target ${FUNCTION_DIR} @@ -202,7 +200,7 @@ ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] CMD [ "lambda_handler.handler" ] ``` -This `Dockerfile` is adapted from the official guide on [how to create a custom image](https://docs.aws.amazon.com/lambda/latest/dg/images-create.html#images-create-from-alt) for Lambda to include Kedro-specific steps. +This `Dockerfile` is adapted from the official guide on [how to create a custom image](https://docs.aws.amazon.com/lambda/latest/dg/images-create.html#images-types) for Lambda to include Kedro-specific steps. * **Step 2.4**: Build the Docker image and push it to AWS Elastic Container Registry (ECR): @@ -388,8 +386,4 @@ If you go into the state machine and click on `Start Execution`, you will be abl ## Limitations -Generally speaking, the [limitations](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) on AWS Lambda have improved dramatically in recent years. However, it's still worth noting that each Lambda function has a 15-minute timeout, 10GB maximum memory limit and 10GB container image code package size limit. This means, for example, if you have a node that takes longer than 15 minutes to run, you should switch to some other AWS services, such as [AWS Batch](07_aws_batch) or [AWS ECS](https://aws.amazon.com/ecs/), to execute that node. - -## Final thought - -One major benefit of running a Kedro pipeline in a serverless computing platform is the ability to take advantage of other services from the same provider. For example, AWS has recently announced a [Feature Store for SageMaker](https://aws.amazon.com/sagemaker/feature-store/). We could easily use it as the Features layer in Kedro's [Data Engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention). +Generally speaking, the [limitations on AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) have improved dramatically in recent years. However, it's still worth noting that each Lambda function has a 15-minute timeout, 10GB maximum memory limit and 10GB container image code package size limit.
This means, for example, if you have a node that takes longer than 15 minutes to run, you should switch to another AWS service, such as [AWS Batch](aws_batch) or [AWS ECS](https://aws.amazon.com/ecs/), to execute that node. diff --git a/docs/source/deployment/azure.md b/docs/source/deployment/azure.md new file mode 100644 index 0000000000..efe0d2c2c1 --- /dev/null +++ b/docs/source/deployment/azure.md @@ -0,0 +1,8 @@ +# Azure ML pipelines + +## `kedro-azureml` plugin + +For deployment to Azure ML pipelines, you should [consult the documentation](https://kedro-azureml.readthedocs.io/en/stable/source/03_quickstart.html) for the [`kedro-azureml` plugin](https://github.com/getindata/kedro-azureml) from GetInData | Part of Xebia, which enables you to run your code on Azure ML Pipelines in a fully managed fashion. + +The plugin supports both docker-based and code-upload workflows. +It also supports distributed training in PyTorch, TensorFlow and MPI, and works well with Azure ML's native MLflow integration. diff --git a/docs/source/deployment/dask.md b/docs/source/deployment/dask.md new file mode 100644 index 0000000000..9c5734d744 --- /dev/null +++ b/docs/source/deployment/dask.md @@ -0,0 +1,329 @@ +# Dask + +This page explains how to distribute execution of the nodes composing your Kedro pipeline using [Dask](https://docs.dask.org/en/stable/), a flexible, open-source library for parallel computing in Python. + +Dask offers both a default, single-machine scheduler and a more sophisticated, distributed scheduler. The newer [`dask.distributed`](http://distributed.dask.org/en/stable/) scheduler is often preferable, even on single workstations, and is the focus of our deployment guide. For more information on the various ways to set up Dask on varied hardware, see [the official Dask how-to guide](https://docs.dask.org/en/stable/how-to/deploy-dask-clusters.html). + +## Why would you use Dask? + +`Dask.distributed` is a lightweight library for distributed computing in Python. It complements the existing PyData analysis stack, which forms the basis of many Kedro pipelines. It's also pure Python, which eases installation and simplifies debugging. For further motivation on why people choose to adopt Dask, and, more specifically, `dask.distributed`, see [Why Dask?](https://docs.dask.org/en/stable/why.html) and [the `dask.distributed` documentation](http://distributed.dask.org/en/stable/#motivation), respectively. + +## Prerequisites + +The only additional requirement, beyond what was already required by your Kedro pipeline, is to [install `dask.distributed`](http://distributed.dask.org/en/stable/install.html). To review the full installation instructions, including how to set up Python virtual environments, see our [Get Started guide](../get_started/install.md#installation-prerequisites). + +## How to distribute your Kedro pipeline using Dask + +### Create a custom runner + +Create a new Python package `runner` in your `src` folder, i.e. `kedro_tutorial/src/kedro_tutorial/runner/`. Make sure there is an `__init__.py` file at this location, and add another file named `dask_runner.py`, which will contain the implementation of your custom runner, `DaskRunner`. The `DaskRunner` will submit and monitor tasks asynchronously, surfacing any errors that occur during execution.
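Before copying the full runner below, it may help to see the two `dask.distributed` primitives it is built on: `Client.submit`, which accepts futures as arguments so that Dask resolves inter-task dependencies for you, and `as_completed`, which yields results as tasks finish. The standalone sketch below is purely illustrative and is not part of the runner code that follows.

```python
# Standalone illustration of the dask.distributed pattern the DaskRunner relies on:
# futures passed as arguments express dependencies; as_completed surfaces results.
from distributed import Client, as_completed


def add(x, y):
    return x + y


if __name__ == "__main__":
    client = Client()  # starts a local cluster by default
    a = client.submit(add, 1, 2)
    b = client.submit(add, a, 10)  # `a` is a future: Dask runs it first
    for future, result in as_completed([a, b], with_results=True):
        print(result)  # prints 3, then 13
    client.close()
```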
+ +Make sure the `__init__.py` file in the `runner` folder includes the following import and declaration: + +```python +from .dask_runner import DaskRunner + +__all__ = ["DaskRunner"] +``` + +Copy the contents of the script below into `dask_runner.py`: + +```python +"""``DaskRunner`` is an ``AbstractRunner`` implementation. It can be +used to distribute execution of ``Node``s in the ``Pipeline`` across +a Dask cluster, taking into account the inter-``Node`` dependencies. +""" +from collections import Counter +from itertools import chain +from typing import Any, Dict + +from distributed import Client, as_completed, worker_client +from kedro.framework.hooks.manager import ( + _create_hook_manager, + _register_hooks, + _register_hooks_setuptools, +) +from kedro.framework.project import settings +from kedro.io import AbstractDataSet, DataCatalog +from kedro.pipeline import Pipeline +from kedro.pipeline.node import Node +from kedro.runner import AbstractRunner, run_node +from pluggy import PluginManager + + +class _DaskDataSet(AbstractDataSet): + """``_DaskDataSet`` publishes/gets named datasets to/from the Dask + scheduler.""" + + def __init__(self, name: str): + self._name = name + + def _load(self) -> Any: + try: + with worker_client() as client: + return client.get_dataset(self._name) + except ValueError: + # Upon successfully executing the pipeline, the runner loads + # free outputs on the scheduler (as opposed to on a worker). + return Client.current().get_dataset(self._name) + + def _save(self, data: Any) -> None: + with worker_client() as client: + client.publish_dataset(data, name=self._name, override=True) + + def _exists(self) -> bool: + return self._name in Client.current().list_datasets() + + def _release(self) -> None: + Client.current().unpublish_dataset(self._name) + + def _describe(self) -> Dict[str, Any]: + return dict(name=self._name) + + +class DaskRunner(AbstractRunner): + """``DaskRunner`` is an ``AbstractRunner`` implementation. It can be + used to distribute execution of ``Node``s in the ``Pipeline`` across + a Dask cluster, taking into account the inter-``Node`` dependencies. + """ + + def __init__(self, client_args: Dict[str, Any] = {}, is_async: bool = False): + """Instantiates the runner by creating a ``distributed.Client``. + + Args: + client_args: Arguments to pass to the ``distributed.Client`` + constructor. + is_async: If True, the node inputs and outputs are loaded and saved + asynchronously with threads. Defaults to False. + """ + super().__init__(is_async=is_async) + Client(**client_args) + + def __del__(self): + Client.current().close() + + def create_default_data_set(self, ds_name: str) -> _DaskDataSet: + """Factory method for creating the default dataset for the runner. + + Args: + ds_name: Name of the missing dataset. + + Returns: + An instance of ``_DaskDataSet`` to be used for all + unregistered datasets. + """ + return _DaskDataSet(ds_name) + + @staticmethod + def _run_node( + node: Node, + catalog: DataCatalog, + is_async: bool = False, + session_id: str = None, + *dependencies: Node, + ) -> Node: + """Run a single `Node` with inputs from and outputs to the `catalog`. + + Wraps ``run_node`` to accept the set of ``Node``s that this node + depends on. When ``dependencies`` are futures, Dask ensures that + the upstream node futures are completed before running ``node``. + + A ``PluginManager`` instance is created on each worker because the + ``PluginManager`` can't be serialised. + + Args: + node: The ``Node`` to run.
+ catalog: A ``DataCatalog`` containing the node's inputs and outputs. + is_async: If True, the node inputs and outputs are loaded and saved + asynchronously with threads. Defaults to False. + session_id: The session id of the pipeline run. + dependencies: The upstream ``Node``s to allow Dask to handle + dependency tracking. Their values are not actually used. + + Returns: + The node argument. + """ + hook_manager = _create_hook_manager() + _register_hooks(hook_manager, settings.HOOKS) + _register_hooks_setuptools(hook_manager, settings.DISABLE_HOOKS_FOR_PLUGINS) + + return run_node(node, catalog, hook_manager, is_async, session_id) + + def _run( + self, + pipeline: Pipeline, + catalog: DataCatalog, + hook_manager: PluginManager, + session_id: str = None, + ) -> None: + nodes = pipeline.nodes + load_counts = Counter(chain.from_iterable(n.inputs for n in nodes)) + node_dependencies = pipeline.node_dependencies + node_futures = {} + + client = Client.current() + for node in nodes: + dependencies = ( + node_futures[dependency] for dependency in node_dependencies[node] + ) + node_futures[node] = client.submit( + DaskRunner._run_node, + node, + catalog, + self._is_async, + session_id, + *dependencies, + ) + + for i, (_, node) in enumerate( + as_completed(node_futures.values(), with_results=True) + ): + self._logger.info("Completed node: %s", node.name) + self._logger.info("Completed %d out of %d tasks", i + 1, len(nodes)) + + # Decrement load counts, and release any datasets we + # have finished with. This is particularly important + # for the shared, default datasets we created above. + for data_set in node.inputs: + load_counts[data_set] -= 1 + if load_counts[data_set] < 1 and data_set not in pipeline.inputs(): + catalog.release(data_set) + for data_set in node.outputs: + if load_counts[data_set] < 1 and data_set not in pipeline.outputs(): + catalog.release(data_set) + + def run_only_missing( + self, pipeline: Pipeline, catalog: DataCatalog + ) -> Dict[str, Any]: + """Run only the missing outputs from the ``Pipeline`` using the + datasets provided by ``catalog``, and save results back to the + same objects. + + Args: + pipeline: The ``Pipeline`` to run. + catalog: The ``DataCatalog`` from which to fetch data. + Raises: + ValueError: Raised when ``Pipeline`` inputs cannot be + satisfied. + + Returns: + Any node outputs that cannot be processed by the + ``DataCatalog``. These are returned in a dictionary, where + the keys are defined by the node outputs. + """ + free_outputs = pipeline.outputs() - set(catalog.list()) + missing = {ds for ds in catalog.list() if not catalog.exists(ds)} + to_build = free_outputs | missing + to_rerun = pipeline.only_nodes_with_outputs(*to_build) + pipeline.from_inputs( + *to_build + ) + + # We also need any missing datasets that are required to run the + # `to_rerun` pipeline, including any chains of missing datasets. + unregistered_ds = pipeline.data_sets() - set(catalog.list()) + # Some of the unregistered datasets could have been published to + # the scheduler in a previous run, so we need not recreate them. 
+ missing_unregistered_ds = { + ds_name + for ds_name in unregistered_ds + if not self.create_default_data_set(ds_name).exists() + } + output_to_unregistered = pipeline.only_nodes_with_outputs( + *missing_unregistered_ds + ) + input_from_unregistered = to_rerun.inputs() & missing_unregistered_ds + to_rerun += output_to_unregistered.to_outputs(*input_from_unregistered) + + # We need to add any previously-published, unregistered datasets + # to the catalog passed to the `run` method, so that it does not + # think that the `to_rerun` pipeline's inputs are not satisfied. + catalog = catalog.shallow_copy() + for ds_name in unregistered_ds - missing_unregistered_ds: + catalog.add(ds_name, self.create_default_data_set(ds_name)) + + return self.run(to_rerun, catalog) +``` + +### Update CLI implementation + +You're nearly there! Before you can use the new runner, you need to add a `cli.py` file at the same level as `settings.py`, using [the template we provide](../development/commands_reference.md#customise-or-override-project-specific-kedro-commands). Update the `run()` function in the newly-created `cli.py` file to make sure the runner class is instantiated correctly: + +```python +def run(tag, env, ...): + """Run the pipeline.""" + runner = runner or "SequentialRunner" + + tag = _get_values_as_tuple(tag) if tag else tag + node_names = _get_values_as_tuple(node_names) if node_names else node_names + + with KedroSession.create(env=env, extra_params=params) as session: + context = session.load_context() + runner_instance = _instantiate_runner(runner, is_async, context) + session.run( + tags=tag, + runner=runner_instance, + node_names=node_names, + from_nodes=from_nodes, + to_nodes=to_nodes, + from_inputs=from_inputs, + to_outputs=to_outputs, + load_versions=load_version, + pipeline_name=pipeline, + ) +``` + +where the helper function `_instantiate_runner()` looks like this: + +```python +def _instantiate_runner(runner, is_async, project_context): + runner_class = load_obj(runner, "kedro.runner") + runner_kwargs = dict(is_async=is_async) + + if runner.endswith("DaskRunner"): + client_args = project_context.params.get("dask_client") or {} + runner_kwargs.update(client_args=client_args) + + return runner_class(**runner_kwargs) +``` + +### Deploy + +You're now ready to trigger the run. Without any further configuration, the underlying Dask [`Client`](http://distributed.dask.org/en/stable/api.html#distributed.Client) creates a [`LocalCluster`](http://distributed.dask.org/en/stable/api.html#distributed.LocalCluster) in the background and connects to that: + +```bash +kedro run --runner=kedro_tutorial.runner.DaskRunner +``` + +#### Set up Dask and related configuration + +To connect to an existing Dask cluster, you'll need to set the Dask-related configuration that the runner will use. Create the `conf/dask/` directory and add a `parameters.yml` file inside of it with the following keys: + +```yaml +dask_client: + address: 127.0.0.1:8786 +``` + +Next, [set up scheduler and worker processes on your local computer](http://distributed.dask.org/en/stable/quickstart.html#setup-dask-distributed-the-hard-way): + +```bash +$ dask-scheduler +Scheduler started at 127.0.0.1:8786 + +$ PYTHONPATH=$PWD/src dask-worker 127.0.0.1:8786 +$ PYTHONPATH=$PWD/src dask-worker 127.0.0.1:8786 +$ PYTHONPATH=$PWD/src dask-worker 127.0.0.1:8786 +``` + +```{note} +The above code snippet assumes each worker is started from the root directory of the Kedro project in a Python environment where all required dependencies are installed. 
+``` + +You're once again ready to trigger the run. Execute the following command: + +```bash +kedro run --env=dask --runner=kedro_tutorial.runner.DaskRunner +``` + +You should start seeing tasks appearing on [Dask's diagnostics dashboard](http://127.0.0.1:8787/status): + +![Dask's diagnostics dashboard](../meta/images/dask_diagnostics_dashboard.png) diff --git a/docs/source/deployment/databricks/databricks_deployment_workflow.md b/docs/source/deployment/databricks/databricks_deployment_workflow.md new file mode 100644 index 0000000000..799a5044c1 --- /dev/null +++ b/docs/source/deployment/databricks/databricks_deployment_workflow.md @@ -0,0 +1,321 @@ +# Use a Databricks job to deploy a Kedro project + +Databricks jobs are a way to execute code on Databricks clusters, allowing you to run data processing tasks, ETL jobs, or machine learning workflows. In this guide, we explain how to package and run a Kedro project as a job on Databricks. + +## What are the advantages of packaging a Kedro project to run on Databricks? + +Packaging your Kedro project and running it on Databricks enables you to execute your pipeline without a notebook. This approach is particularly well-suited for production, as it provides a structured and reproducible way to run your code. + +Here are some typical use cases for running a packaged Kedro project as a Databricks job: + +- **Data engineering pipeline**: the output of your Kedro project is a file or set of files containing cleaned and processed data. +- **Machine learning with MLflow**: your Kedro project runs an ML model; metrics about your experiments are tracked in MLflow. +- **Automated and scheduled runs**: your Kedro project should be [run on Databricks automatically](https://docs.databricks.com/workflows/jobs/schedule-jobs.html#add-a-job-schedule). +- **CI/CD integration**: you have a CI/CD pipeline that produces a packaged Kedro project. + +Running your packaged project as a Databricks job is very different from running it from a Databricks notebook. The Databricks job cluster has to be provisioned and started for each run, which is significantly slower than running it as a notebook on a cluster that has already been started. In addition, there is no way to change your project's code once it has been packaged. Instead, you must change your code, create a new package, and then upload it to Databricks again. + +For those reasons, the packaging approach is unsuitable for development projects where rapid iteration is necessary. For guidance on developing a Kedro project for Databricks in a rapid build-test loop, see the [development workflow guide](./databricks_ide_development_workflow.md). + +## What this page covers + +- [Set up your Kedro project for deployment on Databricks](#set-up-your-project-for-deployment-to-databricks). +- [Run your project as a job using the Databricks workspace UI](#deploy-and-run-your-kedro-project-using-the-workspace-ui). +- [Resources for automating your Kedro deployments to Databricks](#resources-for-automatically-deploying-to-databricks). + +## Prerequisites + +- An active [Databricks deployment](https://docs.databricks.com/getting-started/index.html). +- [`conda` installed](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) on your local machine in order to create a virtual environment with a specific version of Python (>= 3.7 is required). If you have Python >= 3.7 installed, you can use other software to create a virtual environment. 
+ +## Set up your project for deployment to Databricks + +The sequence of steps described in this section is as follows: + +1. [Note your Databricks username and host](#note-your-databricks-username-and-host) +2. [Install Kedro and the databricks CLI in a new virtual environment](#install-kedro-and-the-databricks-cli-in-a-new-virtual-environment) +3. [Authenticate the Databricks CLI](#authenticate-the-databricks-cli) +4. [Create a new Kedro project](#create-a-new-kedro-project) +5. [Create an entry point for Databricks](#create-an-entry-point-for-databricks) +6. [Package your project](#package-your-project) +7. [Upload project data and configuration to DBFS](#upload-project-data-and-configuration-to-dbfs) + +### Note your Databricks username and host + +Note your Databricks **username** and **host** as you will need them for the remainder of this guide. + +Find your Databricks username in the top right of the workspace UI and the host in the browser's URL bar, up to the first slash (e.g., `https://adb-123456789123456.1.azuredatabricks.net/`): + +![Find Databricks host and username](../../meta/images/find_databricks_host_and_username.png) + +```{note} +Your Databricks host must include the protocol (`https://`). +``` + +### Install Kedro and the databricks CLI in a new virtual environment + +The following commands will create a new `conda` environment, activate it, and then install Kedro and the Databricks CLI. + +In your local development environment, create a virtual environment for this tutorial using `conda`: + +```bash +conda create --name iris-databricks python=3.10 +``` + +Once it is created, activate it: + +```bash +conda activate iris-databricks +``` + +With your `conda` environment activated, install Kedro and the Databricks CLI: + +```bash +pip install kedro databricks-cli +``` + +### Authenticate the Databricks CLI + +**Now, you must authenticate the Databricks CLI with your Databricks instance.** + +[Refer to the Databricks documentation](https://docs.databricks.com/dev-tools/cli/index.html#set-up-authentication) for a complete guide on how to authenticate your CLI. The key steps are: + +1. Create a personal access token for your user on your Databricks instance. +2. Run `databricks configure --token`. +3. Enter your token and Databricks host when prompted. +4. Run `databricks fs ls dbfs:/` at the command line to verify your authentication. + +### Create a new Kedro project + +Create a Kedro project by using the following command in your local environment: + +```bash +kedro new --starter=databricks-iris +``` + +This command creates a new Kedro project using the `databricks-iris` starter template. Name your new project `iris-databricks` for consistency with the rest of this guide. + +### Create an entry point for Databricks + +The default entry point of a Kedro project uses a Click command line interface (CLI), which is not compatible with Databricks. To run your project as a Databricks job, you must define a new entry point specifically for use on Databricks. + +The `databricks-iris` starter has this entry point pre-built, so there is no extra work to do here, but generally you must **create an entry point manually for your own projects using the following steps**: + +1. **Create an entry point script**: Create a new file in `<project_root>/src/iris_databricks` named `databricks_run.py`.
Copy the following code to this file: + +```python +import argparse +import logging + +from kedro.framework.project import configure_project +from kedro.framework.session import KedroSession + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--env", dest="env", type=str) + parser.add_argument("--conf-source", dest="conf_source", type=str) + parser.add_argument("--package-name", dest="package_name", type=str) + + args = parser.parse_args() + env = args.env + conf_source = args.conf_source + package_name = args.package_name + + # https://kb.databricks.com/notebooks/cmd-c-on-object-id-p0.html + logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR) + logging.getLogger("py4j.py4j.clientserver").setLevel(logging.ERROR) + + configure_project(package_name) + with KedroSession.create(env=env, conf_source=conf_source) as session: + session.run() + + +if __name__ == "__main__": + main() +``` + +2. **Define a new entry point**: Open `<project_root>/src/setup.py` in a text editor or IDE and add a new line in the definition of the `entry_point` tuple, so that it becomes: + +```python +entry_point = (..., "databricks_run = <package_name>.databricks_run:main") +``` + +Remember to replace `<package_name>` with the correct package name for your project. + +This process adds an entry point to your project which can be used to run it on Databricks. + +```{note} +Because you are no longer using the default entry-point for Kedro, you will not be able to run your project with the options it usually provides. Instead, the `databricks_run` entry point in the above code and in the `databricks-iris` starter contains a simple implementation of three options: +- `--package-name` (required): the package name (defined in `setup.py`) of your packaged project. +- `--env`: specifies a [Kedro configuration environment](../../configuration/configuration_basics.md#configuration-environments) to load for your run. +- `--conf-source`: specifies the location of the `conf/` directory to use with your Kedro project. +``` + +### Package your project + +To package your Kedro project for deployment on Databricks, you must create a Wheel (`.whl`) file, which is a binary distribution of your project. In the root directory of your Kedro project, run the following command: + +```bash +kedro package +``` + +This command generates a `.whl` file in the `dist` directory within your project's root directory. + +### Upload project data and configuration to DBFS + +```{note} +A Kedro project's configuration and data do not get included when it is packaged. They must be stored somewhere accessible to allow your packaged project to run. +``` + +Your packaged Kedro project needs access to data and configuration in order to run. Therefore, you will need to upload your project's data and configuration to a location accessible to Databricks. In this guide, we will store the data on the Databricks File System (DBFS). + +The `databricks-iris` starter contains a [catalog](../../data/data_catalog.md#the-data-catalog) that is set up to access data stored in DBFS (`/conf/`). You will point your project to use configuration stored on DBFS using the `--conf-source` option when you create your job on Databricks. + +There are several ways to upload data to DBFS: you can use the [DBFS API](https://learn.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/dbfs), the [`dbutils` module](https://docs.databricks.com/dev-tools/databricks-utils.html) in a Databricks notebook or the [Databricks CLI](https://docs.databricks.com/dev-tools/cli/dbfs-cli.html).
In this guide, it is recommended to use the Databricks CLI because of the convenience it offers. + +- **Upload your project's data and config**: at the command line in your local environment, use the following Databricks CLI commands to upload your project's locally stored data and configuration to DBFS: + +```bash +databricks fs cp --recursive <project_root>/data/ dbfs:/FileStore/iris-databricks/data +databricks fs cp --recursive <project_root>/conf/ dbfs:/FileStore/iris-databricks/conf +``` + +The `--recursive` flag ensures that the entire folder and its contents are uploaded. You can list the contents of the destination folder in DBFS using the following command: + +```bash +databricks fs ls dbfs:/FileStore/iris-databricks/data +``` + +You should see the contents of the project's `data/` directory printed to your terminal: + +```bash +01_raw +02_intermediate +03_primary +04_feature +05_model_input +06_models +07_model_output +08_reporting +``` + +## Deploy and run your Kedro project using the workspace UI + +To run your packaged project on Databricks, log in to your Databricks account and perform the following steps in the workspace: + +1. [Create a new job](#create-a-new-job) +2. [Create a new job cluster specific to your job](#create-a-new-job-cluster-specific-to-your-job) +3. [Configure the job](#configure-the-job) +4. [Run the job](#run-the-job) + +### Create a new job + +In the Databricks workspace, navigate to the `Workflows` tab and click `Create Job` **or** click the `New` button, then `Job`: + +![Create Databricks job](../../meta/images/databricks_create_new_job.png) + +### Create a new job cluster specific to your job + +Create a dedicated [job cluster](https://docs.databricks.com/clusters/index.html) to run your job by clicking on the drop-down menu in the `Cluster` field and then clicking `Add new job cluster`: + +**Do not use the default `Job_cluster`; it has not been configured to run this job.** + +![Create Databricks job cluster](../../meta/images/databricks_create_job_cluster.png) + +Once you click `Add new job cluster`, the configuration page for this cluster appears. + +Configure the job cluster with the following settings: + +- In the `name` field enter `kedro_deployment_demo`. +- Select the radio button for `Single node`. +- Select the runtime `12.2 LTS` in the `Databricks runtime version` field. +- Leave all other settings with their default values in place. + +The final configuration for the job cluster should look the same as the following: + +![Configure Databricks job cluster](../../meta/images/databricks_configure_job_cluster.png) + +### Configure the job + +Configure the job with the following settings: + +- Enter `iris-databricks` in the `Name` field. +- In the dropdown menu for the `Type` field, select `Python wheel`. +- In the `Package name` field, enter `iris_databricks`. This is the name of your package as defined in your project's `src/setup.py` file. +- In the `Entry Point` field, enter `databricks_run`. This is the name of the [entry point](#create-an-entry-point-for-databricks) to run your package from. +- Ensure the job cluster you created in step two is selected in the dropdown menu for the `Cluster` field. +- In the `Dependent libraries` field, click `Add` and upload [your project's `.whl` file](#package-your-project), making sure that the radio buttons for `Upload` and `Python Whl` are selected for the `Library Source` and `Library Type` fields.
+- In the `Parameters` field, enter the following list of runtime options: + +```bash +["--conf-source", "/dbfs/FileStore/iris-databricks/conf", "--package-name", "iris_databricks"] +``` + +The final configuration for your job should look the same as the following: + +![Configure Databricks job](../../meta/images/databricks_configure_new_job.png) + +Click `Create` and then `Confirm and create` in the following pop-up asking you to name the job. + +### Run the job + +Click `Run now` in the top-right corner of your new job's page to start a run of the job. The status of your run can be viewed in the `Runs` tab of your job's page. Navigate to the `Runs` tab and track the progress of your run: + +![Databricks job status](../../meta/images/databricks_job_status.png) + +This page also shows an overview of all past runs of your job. As you only just started your job run, its status will be `Pending`. A status of `Pending` indicates that the cluster is being started and your code is waiting to run. + +The following things happen when you run your job: + +- The job cluster is provisioned and started (job status: `Pending`). +- The packaged Kedro project and all its dependencies are installed (job status: `Pending`). +- The packaged Kedro project is run from the specified `databricks_run` entry point (job status: `In Progress`). +- The packaged code finishes executing and the job cluster is stopped (job status: `Succeeded`). + +A run will take roughly six to seven minutes. + +When the status of your run is `Succeeded`, your job has successfully finished executing. You can view the logging output created by the run by clicking on the link with the text `Go to the latest successful run` to take you to the `main run` view. You should see logs similar to the following: + +```bash +... +2023-06-06 12:56:14,399 - iris_databricks.nodes - INFO - Model has an accuracy of 0.972 on test data. +2023-06-06 12:56:14,403 - kedro.runner.sequential_runner - INFO - Completed 3 out of 3 tasks +2023-06-06 12:56:14,404 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. +``` + +By following these steps, you packaged your Kedro project and manually ran it as a job on Databricks using the workspace UI. + +## Resources for automatically deploying to Databricks + +Up to this point, this page has described a manual workflow for deploying and running a project on Databricks. The process can be automated in two ways: + +- [Use the Databricks API](#how-to-use-the-databricks-api-to-automatically-deploy-a-kedro-project). + +- [Use the Databricks CLI](#how-to-use-the-databricks-cli-to-automatically-deploy-a-kedro-project). + +Both of these methods enable you to store information about your job declaratively in the same version control system as the rest of your project. For each method, the information stored declaratively is the same as what is entered manually in the [above section on creating and running a job in Databricks](#deploy-and-run-your-kedro-project-using-the-workspace-ui). + +These methods can be integrated into a CI/CD pipeline to automatically deploy a packaged Kedro project to Databricks as a job. + +### How to use the Databricks API to automatically deploy a Kedro project + +The Databricks API enables you to programmatically interact with Databricks services, including job creation and execution. You can use the Jobs API to automate the deployment of your Kedro project to Databricks. The following steps outline how to use the Databricks API to do this: + +1.
[Set up your Kedro project for deployment on Databricks](#set-up-your-project-for-deployment-to-databricks) +2. Create a JSON file containing your job's configuration. +3. Use the Jobs API's [`/create` endpoint](https://docs.databricks.com/workflows/jobs/jobs-api-updates.html#create) to create a new job. +4. Use the Jobs API's [`/runs/submit` endpoint](https://docs.databricks.com/workflows/jobs/jobs-api-updates.html#runs-submit) to run your newly created job. + +### How to use the Databricks CLI to automatically deploy a Kedro project + +The Databricks Command Line Interface (CLI) is another way to automate deployment of your Kedro project. The following steps outline how to use the Databricks CLI to automate the deployment of a Kedro project: + +1. [Set up your Kedro project for deployment on Databricks.](#set-up-your-project-for-deployment-to-databricks) +2. Install the Databricks CLI and authenticate it with your workspace. +3. Create a JSON file containing your job's configuration. +4. Use the [`jobs create` command](https://docs.databricks.com/dev-tools/cli/jobs-cli.html#create-a-job) to create a new job. +5. Use the [`jobs run-now` command](https://docs.databricks.com/dev-tools/cli/jobs-cli.html#run-a-job) to run your newly created job. + +## Summary + +This guide demonstrated how to deploy a packaged Kedro project on Databricks. This is a structured and reproducible way to run your Kedro projects on Databricks that can be automated and integrated into CI/CD pipelines. diff --git a/docs/source/deployment/databricks/databricks_ide_development_workflow.md b/docs/source/deployment/databricks/databricks_ide_development_workflow.md new file mode 100644 index 0000000000..dc723189c9 --- /dev/null +++ b/docs/source/deployment/databricks/databricks_ide_development_workflow.md @@ -0,0 +1,264 @@ +# Use an IDE, dbx and Databricks Repos to develop a Kedro project + +This guide demonstrates a workflow for developing Kedro projects on Databricks using your local environment for development, then using dbx and Databricks Repos to sync code for testing on Databricks. + +By working in your local environment, you can take advantage of features within an IDE that are not available on Databricks notebooks: + +- Auto-completion and suggestions for code, improving your development speed and accuracy. +- Linters like Pylint or Flake8 can be integrated to catch potential issues in your code. +- Static type checkers like Mypy can check types in your code, helping to identify potential type-related issues early in the development process. + +To set up these features, look for instructions specific to your IDE (for instance, [VS Code](https://code.visualstudio.com/docs/python/linting)). + +If you prefer to develop your projects in notebooks rather than in an IDE, you should follow our guide on [how to develop a Kedro project within a Databricks workspace](./databricks_notebooks_development_workflow.md) instead.
+ +## What this page covers + +The main steps in this tutorial are as follows: + +- [Create a virtual environment and install and configure dbx.](#install-kedro-and-dbx-in-a-new-virtual-environment) +- [Create a new Kedro project using the `databricks-iris` starter.](#create-a-new-kedro-project) +- [Create a Repo on Databricks and sync your project using dbx.](#create-a-repo-on-databricks) +- [Upload project data to a location accessible by Kedro when run on Databricks (such as DBFS).](#upload-project-data-to-dbfs) +- [Create a Databricks notebook to run your project.](#create-a-new-databricks-notebook) +- [Modify your project in your local environment and test the changes on Databricks in an iterative loop.](#modify-your-project-and-test-the-changes) + +## Prerequisites + +- An active [Databricks deployment](https://docs.databricks.com/getting-started/index.html). +- A [Databricks cluster](https://docs.databricks.com/clusters/configure.html) configured with a recent version (>= 11.3 is recommended) of the Databricks runtime. +- [Conda installed](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) on your local machine in order to create a virtual environment with a specific version of Python (>= 3.8 is required). If you have Python >= 3.8 installed, you can use other software to create a virtual environment. + +## Set up your project + +### Note your Databricks username and host + +Note your Databricks **username** and **host** as you will need it for the remainder of this guide. + +Find your Databricks username in the top right of the workspace UI and the host in the browser's URL bar, up to the first slash (e.g., `https://adb-123456789123456.1.azuredatabricks.net/`): + +![Find Databricks host and username](../../meta/images/find_databricks_host_and_username.png) + +```{note} +Your databricks host must include the protocol (`https://`). +``` + +### Install Kedro and dbx in a new virtual environment + +In your local development environment, create a virtual environment for this tutorial using Conda: + +```bash +conda create --name iris-databricks python=3.10 +``` + +Once it is created, activate it: + +```bash +conda activate iris-databricks +``` + +With your Conda environment activated, install Kedro and dbx: + +```bash +pip install kedro dbx --upgrade +``` + +### Authenticate the Databricks CLI + +**Now, you must authenticate the Databricks CLI with your Databricks instance.** + +[Refer to the Databricks documentation](https://docs.databricks.com/dev-tools/cli/index.html#set-up-authentication) for a complete guide on how to authenticate your CLI. The key steps are: + +1. Create a personal access token for your user on your Databricks instance. +2. Run `databricks configure --token`. +3. Enter your token and Databricks host when prompted. +4. Run `databricks fs ls dbfs:/` at the command line to verify your authentication. + +```{note} +dbx is an extension of the Databricks CLI, a command-line program for interacting with Databricks without using its UI. You will use dbx to sync your project's code with Databricks. While Git can sync code to Databricks Repos, dbx is preferred for development as it avoids creating new commits for every change, even if those changes do not work. 
+``` + +### Create a new Kedro project + +Create a Kedro project with the `databricks-iris` starter using the following command in your local environment: + +```bash +kedro new --starter=databricks-iris +``` + +Name your new project `iris-databricks` for consistency with the rest of this guide. This command creates a new Kedro project using the `databricks-iris` starter template. + +### Create a Repo on Databricks + +Create a new Repo on Databricks by navigating to the `New` tab in the Databricks workspace UI side bar and clicking `Repo` in the drop-down menu that appears. + +In this guide, you will not sync your project with a remote Git provider, so uncheck `Create repo by cloning a Git repository` and enter `iris-databricks` as the name of your new repository: + +![Create a new Repo on Databricks](../../meta/images/databricks_repo_creation.png) + +### Sync code with your Databricks Repo using dbx + +The next step is to use dbx to sync your project to your Repo. + +**Open a new terminal instance**, activate your conda environment, navigate to your project directory and start `dbx sync`: + +```bash +conda activate iris-databricks +cd <project_root> +dbx sync repo --dest-repo iris-databricks --source . +``` + +This command will sync your local directory (`--source .`) with your Repo (`--dest-repo iris-databricks`) on Databricks. When started for the first time, `dbx sync` will write output similar to the following to your terminal: + +```bash +... +[dbx][2023-04-13 21:59:48.148] Putting /Repos/<databricks_username>/iris-databricks/src/tests/__init__.py +[dbx][2023-04-13 21:59:48.168] Putting /Repos/<databricks_username>/iris-databricks/src/tests/test_pipeline.py +[dbx][2023-04-13 21:59:48.189] Putting /Repos/<databricks_username>/iris-databricks/src/tests/test_run.py +[dbx][2023-04-13 21:59:48.928] Done. Watching for changes... +``` + +**Keep the second terminal (running dbx sync) alive during development; closing it stops syncing new changes.** + +`dbx sync` will automatically sync any further changes made in your local project directory with your Databricks Repo while it runs. + +```{note} +Syncing with dbx is one-way only, meaning changes you make using the Databricks Repos code editor will not be reflected in your local environment. Only make changes to your project in your local environment while syncing, not in the editor that Databricks Repos provides. +``` + +### Create a `conf/local` directory in your Databricks Repo + +Kedro requires your project to have a `conf/local` directory in order to run successfully, even if it is empty. `dbx sync` does not copy the contents of your local `conf/local` directory to your Databricks Repo, so you must create it manually. + +Open the Databricks workspace UI and, using the panel on the left, navigate to `Repos -> <databricks_username> -> iris-databricks -> conf`, right-click and select `Create -> Folder` as in the image below: + +![Create a conf folder in Databricks Repo](../../meta/images/databricks_conf_folder_creation.png) + +Name the new folder `local`. In this guide, we have no local credentials to store and so we will leave the newly created folder empty. Your `conf/local` and `local` directories should now look like the following: + +![Final conf folder](../../meta/images/final_conf_folder.png) + +### Upload project data to DBFS + +When run on Databricks, Kedro cannot access data stored in your project's directory. Therefore, you will need to upload your project's data to an accessible location. In this guide, we will store the data on the Databricks File System (DBFS).
+ +The `databricks-iris` starter contains a [catalog](../../data/data_catalog.md#the-data-catalog) that is set up to access data stored in DBFS (`/conf/`). You will point your project to use configuration stored on DBFS using the `--conf-source` option when you create your job on Databricks. + +There are several ways to upload data to DBFS. In this guide, it is recommended to use the [Databricks CLI](https://docs.databricks.com/dev-tools/cli/dbfs-cli.html) because of the convenience it offers. At the command line in your local environment, use the following Databricks CLI command to upload your locally stored data to DBFS: + +```bash +databricks fs cp --recursive <project_root>/data/ dbfs:/FileStore/iris-databricks/data +``` + +The `--recursive` flag ensures that the entire folder and its contents are uploaded. You can list the contents of the destination folder in DBFS using the following command: + +```bash +databricks fs ls dbfs:/FileStore/iris-databricks/data +``` + +You should see the contents of the project's `data/` directory printed to your terminal: + +```bash +01_raw +02_intermediate +03_primary +04_feature +05_model_input +06_models +07_model_output +08_reporting +``` + +### Create a new Databricks notebook + +Now that your project is available on Databricks, you can run it on a cluster using a notebook. + +To run the Python code from your Databricks Repo, [create a new Python notebook](https://docs.databricks.com/notebooks/notebooks-manage.html#create-a-notebook) in your workspace. Name it `iris-databricks` for traceability and attach it to your cluster: + +![Create a new notebook on Databricks](../../meta/images/databricks_notebook_creation.png) + +### Run your project + +Open your newly-created notebook and create **four new cells** inside it. You will fill these cells with code that runs your project. When copying the following code snippets, remember to replace `<databricks_username>` with your username on Databricks such that `project_root` correctly points to your project's location. + +1. Before you import and run your Python code, you'll need to install your project's dependencies on the cluster attached to your notebook. Your project has a `requirements.txt` file for this purpose. Add the following code to the first new cell to install the dependencies: + +```ipython +%pip install -r "/Workspace/Repos/<databricks_username>/iris-databricks/src/requirements.txt" +``` + +2. To run your project in your notebook, you must load the Kedro IPython extension. Add the following code to the second new cell to load the IPython extension: + +```ipython +%load_ext kedro.ipython +``` + +3. Loading the extension allows you to use the `%reload_kedro` line magic to load your Kedro project. Add the following code to the third new cell to load your Kedro project: + +```ipython +%reload_kedro /Workspace/Repos/<databricks_username>/iris-databricks +``` + +4. Loading your Kedro project with the `%reload_kedro` line magic will define four global variables in your notebook: `context`, `session`, `catalog` and `pipelines`. You will use the `session` variable to run your project.
Add the following code to the fourth new cell to run your Kedro project: + +```ipython +session.run() +``` + +After completing these steps, your notebook should match the following image: + +![Databricks completed notebook](../../meta/images/databricks_finished_notebook.png) + +Run the completed notebook using the `Run All` button in the top right of the UI: + +![Databricks notebook run all](../../meta/images/databricks_run_all.png) + +On your first run, you will be prompted to consent to analytics; type `y` or `N` in the field that appears and press `Enter`: + +![Databricks notebook telemetry consent](../../meta/images/databricks_telemetry_consent.png) + +You should see logging output while the cell is running. After execution finishes, you should see output similar to the following: + +```bash +... +2023-06-06 17:21:53,221 - iris_databricks.nodes - INFO - Model has an accuracy of 0.960 on test data. +2023-06-06 17:21:53,222 - kedro.runner.sequential_runner - INFO - Completed 3 out of 3 tasks +2023-06-06 17:21:53,224 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. +``` + +## Modify your project and test the changes + +Now that your project has run successfully once, you can make changes using the convenience and power of your local development environment. In this section, you will modify the project to use a different ratio of training data to test data and check the effect of this change on Databricks. + +### Modify the training / test split ratio + +The `databricks-iris` starter uses a default 80-20 ratio of training data to test data when training the classifier. In this section, you will change this ratio to 70-30 by editing your project in your local environment, syncing it with the Databricks Repo using `dbx`, and then running the modified project on Databricks to observe the different result. + +Open the file `<project_root>/conf/base/parameters.yml` in your local environment. Edit the line `train_fraction: 0.8` to `train_fraction: 0.7` and save your changes. Look in the terminal where `dbx sync` is running; you should see it automatically sync your changes with your Databricks Repo: + +```bash +... +[dbx][2023-04-14 18:29:39.235] Putting /Repos/<databricks_username>/iris-databricks/conf/base/parameters.yml +[dbx][2023-04-14 18:29:40.820] Done
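For context on why this one-line parameter edit changes the reported accuracy: the starter's split node consumes `train_fraction` when dividing the data between training and test sets. The sketch below is a rough, hypothetical illustration of such a node; it is not the `databricks-iris` starter's actual implementation.

```python
# Rough sketch (not the starter's actual code) of a split node driven by
# the `train_fraction` parameter edited above.
from typing import Dict, Tuple

import pandas as pd


def split_data(data: pd.DataFrame, parameters: Dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
    train_fraction = parameters["train_fraction"]  # 0.7 after the edit above
    train = data.sample(frac=train_fraction, random_state=42)
    test = data.drop(train.index)
    return train, test
```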
This approach improves development efficiency and provides access to powerful development features, such as auto-completion, linting, and static type checking, that are not available when working exclusively with Databricks notebooks. diff --git a/docs/source/deployment/databricks/databricks_notebooks_development_workflow.md b/docs/source/deployment/databricks/databricks_notebooks_development_workflow.md new file mode 100644 index 0000000000..5867163ab9 --- /dev/null +++ b/docs/source/deployment/databricks/databricks_notebooks_development_workflow.md @@ -0,0 +1,297 @@ +# Use a Databricks workspace to develop a Kedro project + +This guide demonstrates a workflow for developing Kedro projects on Databricks using only a Databricks Repo and a Databricks notebook. You will learn how to develop and test your Kedro projects entirely within the Databricks workspace. + +This method of developing a Kedro project for use on Databricks is ideal for developers who prefer developing their projects in notebooks rather than an in an IDE. It also avoids the overhead of setting up and syncing a local environment with Databricks. If you want to take advantage of the powerful features of an IDE to develop your project, consider following the [guide for developing a Kedro project for Databricks using your local environment](./databricks_ide_development_workflow.md). + +In this guide, you will store your project's code in a repository on [GitHub](https://github.com/). Databricks integrates with many [Git providers](https://docs.databricks.com/repos/index.html#supported-git-providers), including GitLab and Azure Devops. The steps to create a Git repository and sync it with Databricks also generally apply to these Git providers, though the exact details may vary. + +## What this page covers + +This tutorial introduces a Kedro project development workflow using only the Databricks workspace. The main steps in this workflow are: + +- [Create a new Kedro project using the `databricks-iris` starter.](#create-a-new-kedro-project) +- [Create a Databricks notebook to run your project.](#create-a-new-databricks-notebook) +- [Copy project data to DBFS.](#copy-project-data-to-dbfs-using-dbutils) +- [Modify your project in the Databricks workspace](#modify-your-project-and-test-the-changes) + +## Prerequisites + +- An active [Databricks deployment](https://docs.databricks.com/getting-started/index.html). +- A [Databricks cluster](https://docs.databricks.com/clusters/configure.html) configured with a recent version (>= 11.3 is recommended) of the Databricks runtime. +- Python >= 3.7 installed. +- Git installed. +- A [GitHub](https://github.com/) account. +- A Python environment management system installed, [venv](https://docs.python.org/3/library/venv.html), [virtualenv](https://virtualenv.pypa.io/en/latest/) or [Conda](https://docs.conda.io/en/latest/) are popular choices. + +## Set up your project + +### Install Kedro in a new virtual environment + +In your local development environment, create a virtual environment for this tutorial. 
Any environment management system can be used, though the following commands use Conda: + +```bash +conda create --name iris-databricks python=3.10 +``` + +Once it is created, activate it: + +```bash +conda activate iris-databricks +``` + +With your Conda environment activated, install Kedro: + +```bash +pip install kedro +``` + +### Create a new Kedro project + +Create a Kedro project with the `databricks-iris` starter using the following command in your local environment: + +```bash +kedro new --starter=databricks-iris +``` + +Name your new project `iris-databricks` for consistency with the rest of this guide. This command creates a new Kedro project using the `databricks-iris` starter template. + +### Create a GitHub repository + +Now you should [create a new repository in GitHub](https://docs.github.com/en/github/getting-started-with-github/create-a-repo) using the official guide. Keep the repository private and don't commit to it yet. For consistency with the rest of this guide, name your GitHub repository `iris-databricks`. + +### Create a GitHub personal access token + +To synchronise your project between your local development environment and Databricks, you will use a private GitHub repository, which you will create in the next step. For authentication, you will need to create a GitHub personal access token. [Create this token in your GitHub developer settings](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token). + +The main steps are: + +- Verify your email and navigate to "Settings" under your profile photo. +- Select "Developer settings" then "Fine-grained tokens" and click on "Generate new token". +- Select a name and expiration time for your token, choose an expiration time. +- Select which repositories your token will allow access to and define the token permissions. + +### Push your Kedro project to the GitHub repository + +At the command line, initialise Git in your project root directory: + +```bash +# change the directory to the project root +cd iris-databricks/ +# initialise git +git init +``` + +Then, create the first commit: + +```bash +# add all files to git staging area +git add . +# create the first commit +git commit -m "first commit" +``` + +To connect to your GitHub repository from your local environment, use one of two options: + +- **SSH:** If you choose to connect with SSH, you will also need to configure [the SSH connection to GitHub](https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh), unless you already have [an existing SSH key configured for GitHub](https://docs.github.com/en/github/authenticating-to-github/checking-for-existing-ssh-keys) +- **HTTPS:** If using HTTPS, you will be asked for your GitHub username and password when you push your first commit. Use your GitHub username and your [personal access token](#create-a-github-personal-access-token) generated in the previous step as the password, [do _not_ use your original GitHub password](https://docs.github.com/en/rest/overview/authenticating-to-the-rest-api#authenticating-with-username-and-password). 
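If you choose the HTTPS option and prefer not to enter the token on every push, you can optionally cache it with one of Git's built-in credential helpers before continuing. The following is a minimal sketch, assuming Git's standard credential helpers are available (on Windows you may prefer Git Credential Manager instead):

```bash
# cache the token in memory for one hour
git config --global credential.helper 'cache --timeout=3600'

# or, store it on disk in plain text (more convenient, less secure)
# git config --global credential.helper store
```

The next time Git prompts for credentials, enter your GitHub username and personal access token once; pushes within the cache window reuse them.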
+ +With one of these two options chosen, run the following commands: + +```bash +# configure a new remote +# for HTTPS run: +git remote add origin https://github.com//iris-databricks.git +# or for SSH run: +git remote add origin git@github.com:/iris-databricks.git + +# verify the new remote URL +git remote -v + +# push the first commit +git push --set-upstream origin main +``` + +### Create a repo on Databricks + +You will now create a repo on Databricks using the following steps: + +1. **Create a new repo:** + +- Navigate to the `Repos` tab in the Databricks workspace UI and click `Add Repo`. +- Keep the `Add Repo` popup open for the following steps. + +![Create a new Databricks repo](../../meta/images/databricks_new_repo_popup.png) + +2. **Specify your GitHub repo:** + +- In the `Git repository URL` field, enter your GitHub repository's URL. This will automatically populate the `Git provider` and `Repository name` fields also. + +![Specify GitHub repo details](../../meta/images/databricks_specify_github_repo.png) + +3. **Authenticate Databricks with GitHub:** + +- Click on the `Git credential` field. +- In the `Git provider` field, select `GitHub` in the dropdown menu. +- In the `Git provider username or email` field, enter the username or email address of your GitHub account. +- In the `Token` field, enter your [GitHub personal access token](#create-a-github-personal-access-token). +- Click the `Save` button to save your new Git credential. + +![Authenticate Databricks with GitHub](../../meta/images/databricks_authenticate_repo.png) + +4. **Finish the Repo creation process:** + +- Click `Create Repo`. Your GitHub repository is cloned to Databricks and the popup window closes. + +### Create a new Databricks notebook + +Now that your project is available in a Databricks Repo, you can run it on a cluster using a notebook. + +To run the Python code from your Databricks repo, [create a new Python notebook](https://docs.databricks.com/notebooks/notebooks-manage.html#create-a-notebook) in your workspace. Name it `iris-databricks` for traceability and attach it to your cluster: + +![Create a new notebook on Databricks](../../meta/images/databricks_notebook_creation.png) + +### Copy project data to DBFS using dbutils + +On Databricks, Kedro cannot access data stored directly in your project's directory. As a result, you'll need to move your project's data to a location accessible by Databricks. You can store your project's data in the Databricks File System (DBFS), where it is accessible. + +A number of methods exist for moving data to DBFS. However, in this guide, you will use your new notebook and `dbutils`. + +To move your locally stored data to DBFS, open your `iris-databricks` notebook and in the first cell enter the following python code: + +```python +dbutils.fs.cp( + "file:///Workspace/Repos//iris-databricks/data/", + "dbfs:/FileStore/iris-databricks/data", + recurse=True, +) +``` + +Run this cell to copy the complete directory and its contents from your Repo to DBFS. + +To ensure that your data was copied correctly, you can list the contents of the destination directory in DBFS. Create a new cell underneath the first cell and enter the following code: + +```python +dbutils.fs.ls("dbfs:/FileStore/iris-databricks/data") +``` + +Run this command to displays the contents of your project's `data/` directory. 
You can expect to see the following structure: + +```bash +[FileInfo(path='dbfs:/FileStore/iris-databricks/data/01_raw', name='01_raw', size=...), + FileInfo(path='dbfs:/FileStore/iris-databricks/data/02_intermediate', name='02_intermediate', size=...), + FileInfo(path='dbfs:/FileStore/iris-databricks/data/03_primary', name='03_primary', size=...), + FileInfo(path='dbfs:/FileStore/iris-databricks/data/04_feature', name='04_feature', size=...), + FileInfo(path='dbfs:/FileStore/iris-databricks/data/05_model_input', name='05_model_input', size=...), + FileInfo(path='dbfs:/FileStore/iris-databricks/data/06_models', name='06_models', size=...), + FileInfo(path='dbfs:/FileStore/iris-databricks/data/07_model_output', name='07_model_output', size=...), + FileInfo(path='dbfs:/FileStore/iris-databricks/data/08_reporting', name='08_reporting', size=...)] +``` + +After these cells have successfully run, you should comment the code inside them so their operations are not unnecessarily performed during notebook runs. The cells should appear as below: + +**Cell 1:** + +```ipython +#dbutils.fs.cp( +# "file:///Workspace/Repos//iris-databricks/data", +# "dbfs:/FileStore/iris-databricks/data", +# recurse=True, +#) +``` + +**Cell 2:** + +```ipython +#dbutils.fs.ls("dbfs:/FileStore/iris-databricks/data") +``` + +### Run your project + +Create **four new cells** inside your notebook. You will fill these cells with code that runs your project. When copying the following code snippets, remember to replace `` with your username on Databricks such that `project_root` correctly points to your project's location. + +1. Before you import and run your Python code, you'll need to install your project's dependencies on the cluster attached to your notebook. Your project has a `requirements.txt` file for this purpose. Add the following code to the first new cell to install the dependencies: + +```ipython +%pip install -r "/Workspace/Repos//iris-databricks/src/requirements.txt" +``` + +2. To run your project in your notebook, you must load the Kedro IPython extension. Add the following code to the second new cell to load the IPython extension: + +```ipython +%load_ext kedro.ipython +``` + +3. Loading the extension allows you to use the `%reload_kedro` line magic to load your Kedro project. Add the following code to the third new cell to load your Kedro project: + +```ipython +%reload_kedro /Workspace/Repos//iris-databricks +``` + +4. Loading your Kedro project with the `%reload_kedro` line magic will define four global variables in your notebook: `context`, `session`, `catalog` and `pipelines`. You will use the `session` variable to run your project. Add the following code to the fourth new cell to run your Kedro project: + +```ipython +session.run() +``` + +After completing these steps, your notebook should match the following image: + +![Databricks completed notebook](../../meta/images/databricks_notebooks_workflow_finished_notebook.png) + +Run the completed notebook using the `Run All` button in the top right of the UI: + +![Databricks notebook run all](../../meta/images/databricks_run_all.png) + +On the first run of your Kedro project, you will be prompted to consent to analytics, type `y` or `N` in the field that appears and press `Enter`: + +![Databricks notebook telemetry consent](../../meta/images/databricks_telemetry_consent.png) + +You should see logging output while the cell is running. After execution finishes, you should see output similar to the following: + +```bash +... 
+2023-06-06 12:55:22,705 - iris_databricks.nodes - INFO - Model has an accuracy of 0.953 on test data. +2023-06-06 12:55:22,709 - kedro.runner.sequential_runner - INFO - Completed 3 out of 3 tasks +2023-06-06 12:55:22,709 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. +``` + +## Modify your project and test the changes + +Now that your project has run successfully once, you can make changes using the Databricks UI. In this section, you will modify the project to use a different ratio of training data to test data and check the effect of this change. + +### Modify the training / test split ratio + +The `databricks-iris` starter uses a default 80-20 ratio of training data to test data when training the classifier. You will edit this ratio to 70-30 and re-run your project to view the different result. + +In the Databricks workspace, click on the `Repos` tab in the side bar and navigate to `/iris-databricks/conf/base/`. Open the the file `parameters.yml` by double-clicking it. This will take you to a built-in file editor. Edit the line `train_fraction: 0.8` to `train_fraction: 0.7`, your changes will automatically be saved. + +![Databricks edit file](../../meta/images/databricks_edit_file.png) + +### Re-run your project + +Return to your Databricks notebook. Re-run the third and fourth cells in your notebook (containing the code `%reload_kedro ...` and `session.run()`). The project will now run again, producing output similar to the following: + +```bash +... +2023-06-06 12:56:14,399 - iris_databricks.nodes - INFO - Model has an accuracy of 0.972 on test data. +2023-06-06 12:56:14,403 - kedro.runner.sequential_runner - INFO - Completed 3 out of 3 tasks +2023-06-06 12:56:14,404 - kedro.runner.sequential_runner - INFO - Pipeline execution completed successfully. +``` + +You can see that your model's accuracy has changed now that you are using a different classifier to produce the result. + +```{note} +If your cluster terminates, you must re-run your entire notebook, as libraries installed using `%pip install ...` are ephemeral. If not, repeating this step is only necessary if your project's dependencies change. +``` + +### Managing your Databricks Repo + +Your Databricks Repo now has untracked changes that are not synced with your GitHub repository. To track your changes and sync your Repo, you can use the corresponding [Git operations in Databricks Repos](https://docs.databricks.com/repos/git-operations-with-repos.html). A basic overview of the steps to achieve this is: + +- Commit your changes in your Databricks Repo. +- Push the changes to the GitHub repository linked to your Databricks Repo. +- Check that the latest commits are visible in your GitHub repository. + +## Summary + +This guide demonstrated a development workflow on Databricks using only the Databricks workspace. This approach is ideal for users who prefer to develop using notebooks and avoids having to set up and sync a local environment with Databricks. diff --git a/docs/source/deployment/databricks/databricks_visualisation.md b/docs/source/deployment/databricks/databricks_visualisation.md new file mode 100644 index 0000000000..33cf4e5090 --- /dev/null +++ b/docs/source/deployment/databricks/databricks_visualisation.md @@ -0,0 +1,32 @@ +# Visualise a Kedro project in Databricks notebooks + +[Kedro-Viz](../../visualisation/kedro-viz_visualisation.md) is a tool that enables you to visualise your Kedro pipeline and metrics generated from your data science experiments. 
It is a standalone web application that runs on a web browser, it can be run on a local machine or in a Databricks notebook. + +For Kedro-Viz to run with your Kedro project, you need to ensure that both the packages are installed in the same scope (notebook-scoped vs. cluster library). This means that if you `%pip install kedro` from inside your notebook then you should also `%pip install kedro-viz` from inside your notebook. +If your cluster comes with Kedro installed on it as a library already then you should also add Kedro-Viz as a [cluster library](https://docs.microsoft.com/en-us/azure/databricks/libraries/cluster-libraries). + +To run Kedro-Viz in a Databricks notebook you must first launch the Kedro IPython extension: + +```ipython +%load_ext kedro.ipython +``` + +And load your Kedro project from where it is stored in either the Databricks workspace or in a Repo: + +```ipython +%reload_kedro /iris-databricks +``` + +Kedro-Viz can then be launched in a new browser tab with the `%run_viz` line magic: + +```ipython +%run_viz +``` + +This command presents you with a link to the Kedro-Viz web application. + +![databricks_viz_link](../../meta/images/databricks_viz_link.png) + +Clicking this link opens a new browser tab running Kedro-Viz for your project. + +![databricks_viz_demo](../../meta/images/databricks_viz_demo.png) diff --git a/docs/source/deployment/databricks/index.md b/docs/source/deployment/databricks/index.md new file mode 100644 index 0000000000..a4e349d732 --- /dev/null +++ b/docs/source/deployment/databricks/index.md @@ -0,0 +1,52 @@ +# Databricks + +Databricks offers integration with Kedro through three principal workflows, which range across a spectrum and combine local development with Databricks. + +Let's break down the advantages and use cases of each workflow to help you make an informed decision and choose the workflow that best fits your project's needs. + +**I want to work within a Databricks workspace** + +The workflow documented in ["Use a Databricks workspace to develop a Kedro project"](./databricks_notebooks_development_workflow.md) is for those who prefer to develop and test their projects directly within Databricks notebooks. + +To avoid the overhead of setting up and syncing a local development environment with Databricks, choose this as your workflow. You gain the flexibility for quick iteration, although switching to a [job-based deployment workflow](./databricks_deployment_workflow.md) might be necessary when you transition into a production deployment. + +**I want a hybrid workflow model combining local IDE with Databricks** + + +The workflow documented in ["Use an IDE, dbx and Databricks Repos to develop a Kedro project"](./databricks_ide_development_workflow.md) is for those that prefer to work in a local IDE. + +If you're in the early stages of learning Kedro, or your project requires constant testing and adjustments, choose this workflow. You can use your IDE's capabilities for faster, error-free development, while testing on Databricks. Later you can make the transition into a production deployment with this approach, although you may prefer to switch to use [job-based deployment](./databricks_deployment_workflow.md) and fully optimise your workflow for production. 
+ +**I want to deploy a packaged Kedro project to Databricks** + +The workflow documented in ["Use a Databricks job to deploy a Kedro project"](./databricks_deployment_workflow.md) is the go-to choice when dealing with complex project requirements that need a high degree of structure and reproducibility. It's your best bet for a production setup, given its support for CI/CD, automated/scheduled runs and other advanced use cases. It might not be the ideal choice for projects requiring quick iterations due to its relatively rigid nature. + +--- +Here's a flowchart to guide your choice of workflow: + +```{mermaid} +:alt: mermaid-Decision making diagram for deploying Kedro projects to Databricks + +flowchart TD + A[Start] --> B{Do you prefer developing your projects in notebooks?} + B -->|Yes| C[Use a Databricks workspace to develop a Kedro project] + B -->|No| D{Are you a beginner with Kedro?} + D -->|Yes| E[Use an IDE, dbx and Databricks Repos to develop a Kedro project] + D -->|No| F{Do you have advanced project requirements
e.g. CI/CD, scheduling, production-ready, complex pipelines, etc.?} + F -->|Yes| G{Is rapid development needed for your project needs?} + F -->|No| H[Use an IDE, dbx and Databricks Repos to develop a Kedro project] + G -->|Yes| I[Use an IDE, dbx and Databricks Repos to develop a Kedro project] + G -->|No| J[Use a Databricks job to deploy a Kedro project] +``` + +Remember, the best choice of workflow is the one that aligns best with your project's requirements, whether that's quick development, notebook-based coding, or a production-ready setup. Make sure to consider these factors alongside your comfort level with Kedro when making your decision. + + +```{toctree} +:maxdepth: 1 + +databricks_notebooks_development_workflow.md +databricks_ide_development_workflow.md +databricks_deployment_workflow +databricks_visualisation +``` diff --git a/docs/source/10_deployment/03_distributed.md b/docs/source/deployment/distributed.md similarity index 71% rename from docs/source/10_deployment/03_distributed.md rename to docs/source/deployment/distributed.md index dbb912c6a0..2b005afe42 100644 --- a/docs/source/10_deployment/03_distributed.md +++ b/docs/source/deployment/distributed.md @@ -3,26 +3,26 @@ This topic explains how to deploy Kedro in a distributed system. Distributed applications refer to software that runs on multiple computers within a network at the same time and can be stored on servers or with cloud computing. Unlike traditional applications that run on a single machine, distributed applications run on multiple systems simultaneously for a single task or job. -You may select to use a distributed system if your Kedro pipelines are very compute-intensive because you can benefit from the cloud's elasticity and scalability to manage compute resources. +You may select to use a distributed system if your Kedro pipelines are very compute-intensive to benefit from the cloud's elasticity and scalability to manage compute resources. - As a distributed deployment strategy, we recommend the following series of steps: +As a distributed deployment strategy, we recommend the following series of steps: ## 1. Containerise the pipeline For better dependency management, we encourage you to containerise the entire pipeline/project. We recommend using [Docker](https://www.docker.com/), but you're free to use any preferred container solution available to you. For the purpose of this walk-through, we are going to assume a `Docker` workflow. -Firstly make sure your project requirements are up to date by running: +Firstly make sure your [project requirements are up-to-date](../kedro_project_setup/dependencies.md) by running: ```bash -kedro build-reqs +pip-compile --output-file=/src/requirements.txt --input-file=/src/requirements.txt ``` -We then recommend the [`Kedro-Docker`](https://github.com/quantumblacklabs/kedro-docker) plugin to streamline the process of building the image. [Instructions for using this are in the plugin's README.md](https://github.com/quantumblacklabs/kedro-docker/blob/master/README.md). +We then recommend the [`Kedro-Docker`](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) plugin to streamline the process of building the image. [Instructions for using this are in the plugin's README.md](https://github.com/kedro-org/kedro-plugins/blob/main/README.md). 
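In practice, the plugin workflow is only a handful of commands. A minimal sketch, assuming the plugin's default Dockerfile and image naming (see the plugin's README for the full set of options):

```bash
# install the plugin into the project environment
pip install kedro-docker

# generate a default Dockerfile and .dockerignore in the project root
kedro docker init

# build the image for the project
kedro docker build

# smoke-test the containerised pipeline locally before pushing it to a registry
kedro docker run
```

Running the image locally first is a cheap way to validate the container before transferring it to a registry in the next step.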
-After you’ve built the Docker image for your project locally, you would typically have to transfer the image to a container registry, such as DockerHub or AWS Elastic Container Registry, to be able to pull it on your remote servers. You can find instructions on how to do so [in our guide for single-machine deployment](./02_single_machine.md#how-to-use-container-registry). +After you’ve built the Docker image for your project locally, you would typically have to transfer the image to a container registry, such as DockerHub or AWS Elastic Container Registry, to be able to pull it on your remote servers. You can find instructions on how to do so [in our guide for single-machine deployment](./single_machine.md#how-to-use-container-registry). -## 2. Convert your Kedro pipeline into targeted platform's primitives +## 2. Convert your Kedro pipeline into targeted platform primitives A Kedro pipeline benefits from a structure that's normally easy to translate (at least semantically) into the language that different platforms would understand. A DAG of `nodes` can be converted into a series of tasks where each node maps to an individual task, whether it being a Kubeflow operator, an AWS Batch job, etc, and the dependencies are the same as those mapped in `Pipeline.node_dependencies`. @@ -33,11 +33,11 @@ To perform the conversion programmatically, you will need to develop a script. M A `node` typically corresponds to a unit of compute, which can be run by parameterising the basic `kedro run`: ```bash -kedro run --node +kedro run --node= ``` We encourage you to play with different ways of parameterising your runs as you see fit. Use names, tags, custom flags, in preference to making a code change to execute different behaviour. All your jobs/tasks/operators/etc. should have the same version of the code, i.e. same Docker image, to run on. ## 4. (Optional) Create starters -This is an optional step, but it may speed up your work in the long term. If you find yourself having to deploy in a similar environment or to a similar platform fairly often, you may want to build your own [Kedro starter](../02_get_started/06_starters.md). That way you will be able to re-use any deployment scripts written as part of step 2. +This is an optional step, but it may speed up your work in the long term. If you find yourself having to deploy in a similar environment or to a similar platform fairly often, you may want to [build your own Kedro starter](../kedro_project_setup/starters.md). That way you will be able to re-use any deployment scripts written as part of step 2. diff --git a/docs/source/deployment/index.md b/docs/source/deployment/index.md new file mode 100644 index 0000000000..baf2ca74fd --- /dev/null +++ b/docs/source/deployment/index.md @@ -0,0 +1,64 @@ +# Deployment + +In this section we provide guides for different deployment methods; your choice will depend on a range of factors. + +If you decide to deploy your Kedro project onto a single machine, you should consult our [guide to single-machine deployment](single_machine.md), and decide whether to: + +* [use Docker for container-based deployment](./single_machine.md#container-based) +* [use package-based deployment](./single_machine.md#package-based) +* [use the CLI to clone and deploy your codebase to a server](./single_machine.md#cli-based) + +If your pipeline is sizeable, you may want to run it across separate machines, so will need to consult our [guide to distributed deployment](distributed.md). 
+ +```{mermaid} +:alt: mermaid-Decision making diagram for deploying Kedro projects + +flowchart TD + A{Can your Kedro pipeline run on a single machine?} -- YES --> B[Consult the single-machine deployment guide]; + B --> C{Do you have Docker on your machine?}; + C -- YES --> D[Use a container-based approach]; + C -- NO --> E[Use the CLI or package mode]; + A -- NO --> F[Consult the distributed deployment guide]; + F --> G["What distributed platform are you using?

<br><br>Check out the guides for:<br><br>  • Airflow<br>  • Amazon SageMaker<br>  • AWS Step functions<br>  • Azure<br>  • Dask<br>  • Databricks<br>  • Kubeflow Workflows<br>  • Prefect<br>  • Vertex AI<br>
  • "]; + style G text-align:left +``` + +This following pages provide information for deployment to, or integration with, the following: + +* [Airflow](airflow_astronomer.md) +* [Amazon SageMaker](amazon_sagemaker.md) +* [AWS Step functions](aws_step_functions.md) +* [Azure](azure.md) +* [Dask](dask.md) +* [Databricks](./databricks/index.md) +* [Kubeflow Workflows](kubeflow.md) +* [Prefect](prefect.md) +* [Vertex AI](vertexai.md) + +``` {warning} +We also have legacy documentation pages for the following deployment targets, but these have not been tested against recent Kedro releases and we cannot guarantee them: + +* for [Argo Workflows](argo.md) +* for [AWS Batch](aws_batch.md) +``` + + + +```{toctree} +:maxdepth: 1 +:hidden: + +single_machine +distributed +airflow_astronomer +amazon_sagemaker +aws_step_functions +azure +dask +databricks/index +kubeflow +prefect +vertexai +argo +aws_batch +``` diff --git a/docs/source/deployment/kubeflow.md b/docs/source/deployment/kubeflow.md new file mode 100644 index 0000000000..44c426597e --- /dev/null +++ b/docs/source/deployment/kubeflow.md @@ -0,0 +1,15 @@ +# Kubeflow Pipelines + + +## Why would you use Kubeflow Pipelines? +Kubeflow Pipelines is an end-to-end (E2E) orchestration tool to deploy, scale and manage your machine learning systems within Docker containers. You can schedule and compare runs, and examine detailed reports on each run. + +Here are the main reasons to use Kubeflow Pipelines: + +- It is cloud-agnostic and can run on any Kubernetes cluster +- Kubeflow is tailored towards machine learning workflows for model deployment, experiment tracking, and hyperparameter tuning +- You can re-use components and pipelines to create E2E solutions + + +## The `kedro-kubeflow` plugin +The `kedro-kubeflow` plugin from GetInData | Part of Xebia enables you to run a Kedro pipeline on Kubeflow Pipelines. Consult the [GitHub repository for `kedro-kubeflow`](https://github.com/getindata/kedro-kubeflow) for further details, or take a look at the [documentation](https://kedro-kubeflow.readthedocs.io/). diff --git a/docs/source/deployment/prefect.md b/docs/source/deployment/prefect.md new file mode 100644 index 0000000000..64d1018984 --- /dev/null +++ b/docs/source/deployment/prefect.md @@ -0,0 +1,265 @@ +# Prefect + +This page explains how to run your Kedro pipeline using [Prefect 2.0](https://www.prefect.io/products/core/), an open-source workflow management system. + +The scope of this documentation is the deployment to a self hosted [Prefect Server](https://docs.prefect.io/2.10.17/host/), which is an open-source backend that makes it easy to monitor and execute your Prefect flows and automatically extends Prefect 2.0. We will use an [Agent that dequeues submitted flow runs from a Work Queue](https://docs.prefect.io/2.10.17/tutorial/deployments/#why-workpools-and-workers). + +```{note} +This deployment has been tested using Kedro 0.18.10 with Prefect version 2.10.17. If you want to deploy with Prefect 1.0, we recommend you review [earlier versions of Kedro's Prefect deployment documentation](https://docs.kedro.org/en/0.18.9/deployment/prefect.html). 
+``` + +## Prerequisites + +To use Prefect 2.0 and Prefect Server, ensure you have the following prerequisites in place: + +- [Prefect 2.0 is installed](https://docs.prefect.io/2.10.17/getting-started/installation/#installing-the-latest-version) on your machine + +## Setup + +Configure your `PREFECT_API_URL` to point to your local Prefect instance: + +```bash +prefect config set PREFECT_API_URL="http://127.0.0.1:4200/api" +``` + +For each new Kedro project you create, you need to decide whether to opt into [usage analytics](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry). Your decision is recorded in the `.telemetry` file stored in the project root. + +```{important} +When you run a Kedro project locally, you are asked on the first `kedro` command for the project, but in this use case, the project will hang unless you follow these instructions. +``` + +Create a `.telemetry` file manually and put it in the **root of your Kedro project** and add your preference to give or decline consent. To do this, specify either `true` (to give consent) or `false`. The example given below accepts Kedro's usage analytics. + +```text +consent: true +``` + +Run a Prefect Server instance: + +```bash +prefect server start +``` + +In a separate terminal, [create a work pool](https://docs.prefect.io/2.10.17/concepts/work-pools/#work-pool-configuration) to organize the work and [create a work queue](https://docs.prefect.io/2.10.17/concepts/work-pools/#work-queues) for your agent to pull from: + +```bash +prefect work-pool create --type prefect-agent +prefect work-queue create --pool +``` + +Now run a Prefect Agent that subscribes to a work queue inside the work pool you created: + +```bash +prefect agent start --pool --work-queue +``` + +## How to run your Kedro pipeline using Prefect 2.0 + +### Convert your Kedro pipeline to Prefect 2.0 flow + +To build a [Prefect flow](https://docs.prefect.io/core/concepts/flows.html) for your Kedro pipeline programmatically and register it with the Prefect API, use the following Python script, which should be stored in your project’s **root directory**: + +```python +# /register_prefect_flow.py +import click +from pathlib import Path +from typing import Dict, List, Union, Callable + +from kedro.framework.hooks.manager import _create_hook_manager +from kedro.framework.project import pipelines +from kedro.framework.session import KedroSession +from kedro.framework.startup import bootstrap_project +from kedro.io import DataCatalog, MemoryDataSet +from kedro.pipeline.node import Node +from kedro.runner import run_node + +from prefect import flow, task, get_run_logger +from prefect.deployments import Deployment + + +@click.command() +@click.option("-p", "--pipeline", "pipeline_name", default="__default__") +@click.option("--env", "-e", type=str, default="base") +@click.option("--deployment_name", "deployment_name", default="example") +@click.option("--work_pool_name", "work_pool_name", default="default") +@click.option("--work_queue_name", "work_queue_name", default="default") +@click.option("--version", "version", default="1.0") +def prefect_deploy( + pipeline_name, env, deployment_name, work_pool_name, work_queue_name, version +): + """Register a Kedro pipeline as a Prefect flow.""" + + # Pipeline name to execute + pipeline_name = pipeline_name or "__default__" + + # Use standard deployment configuration for local execution. 
If you require a different + # infrastructure, check the API docs for Deployments at: https://docs.prefect.io/latest/api-ref/prefect/deployments/ + deployment = Deployment.build_from_flow( + flow=my_flow, + name=deployment_name, + path=str(Path.cwd()), + version=version, + parameters={ + "pipeline_name": pipeline_name, + "env": env, + }, + infra_overrides={"env": {"PREFECT_LOGGING_LEVEL": "DEBUG"}}, + work_pool_name=work_pool_name, + work_queue_name=work_queue_name, + ) + + deployment.apply() + + +@flow(name="my_flow") +def my_flow(pipeline_name: str, env: str): + logger = get_run_logger() + project_path = Path.cwd() + + metadata = bootstrap_project(project_path) + logger.info("Project name: %s", metadata.project_name) + + logger.info("Initializing Kedro...") + execution_config = kedro_init( + pipeline_name=pipeline_name, project_path=project_path, env=env + ) + + logger.info("Building execution layers...") + execution_layers = init_kedro_tasks_by_execution_layer( + pipeline_name, execution_config + ) + + for layer in execution_layers: + logger.info("Running layer...") + for node_task in layer: + logger.info("Running node...") + node_task() + + +@task() +def kedro_init( + pipeline_name: str, + project_path: Path, + env: str, +): + """ + Initializes a Kedro session and returns the DataCatalog and + KedroSession + """ + # bootstrap project within task / flow scope + + logger = get_run_logger() + logger.info("Bootstrapping project") + bootstrap_project(project_path) + + session = KedroSession.create( + project_path=project_path, + env=env, + ) + # Note that for logging inside a Prefect task logger is used. + logger.info("Session created with ID %s", session.session_id) + pipeline = pipelines.get(pipeline_name) + logger.info("Loading context...") + context = session.load_context() + catalog = context.catalog + logger.info("Registering datasets...") + unregistered_ds = pipeline.data_sets() - set(catalog.list()) # NOQA + for ds_name in unregistered_ds: + catalog.add(ds_name, MemoryDataSet()) + return {"catalog": catalog, "sess_id": session.session_id} + + +def init_kedro_tasks_by_execution_layer( + pipeline_name: str, + execution_config: Union[None, Dict[str, Union[DataCatalog, str]]] = None, +) -> List[List[Callable]]: + """ + Inits the Kedro tasks ordered topologically in groups, which implies that an earlier group + is the dependency of later one. + + Args: + pipeline_name (str): The pipeline name to execute + execution_config (Union[None, Dict[str, Union[DataCatalog, str]]], optional): + The required execution config for each node. Defaults to None. + + Returns: + List[List[Callable]]: A list of topologically ordered task groups + """ + + pipeline = pipelines.get(pipeline_name) + + execution_layers = [] + + # Return a list of the pipeline nodes in topologically ordered groups, + # i.e. if node A needs to be run before node B, it will appear in an + # earlier group. 
+ for layer in pipeline.grouped_nodes: + execution_layer = [] + for node in layer: + # Use a function for task instantiation which avoids duplication of + # tasks + task = instantiate_task(node, execution_config) + execution_layer.append(task) + execution_layers.append(execution_layer) + + return execution_layers + + +def kedro_task( + node: Node, task_dict: Union[None, Dict[str, Union[DataCatalog, str]]] = None +): + run_node( + node, + task_dict["catalog"], + _create_hook_manager(), + task_dict["sess_id"], + ) + + +def instantiate_task( + node: Node, + execution_config: Union[None, Dict[str, Union[DataCatalog, str]]] = None, +) -> Callable: + """ + Function that wraps a Node inside a task for future execution + + Args: + node: Kedro node for which a Prefect task is being created. + execution_config: The configurations required for the node to execute + that includes catalogs and session id + + Returns: Prefect task for the passed node + + """ + return task(lambda: kedro_task(node, execution_config)).with_options(name=node.name) + + +if __name__ == "__main__": + prefect_deploy() +``` + +Then, run the deployment script in other terminal: + +```bash +python register_prefect_flow.py --work_pool_name --work_queue_name +``` + +```{note} +Be sure that your Prefect Server is up and running. Verify that the deployment script arguments match the work pool and work queue names. +``` + +### Run Prefect flow + +Now, having the flow registered, you can use [Prefect Server UI](https://docs.prefect.io/2.10.17/host/) to orchestrate and monitor it. + +Navigate to http://localhost:4200/deployments to see your registered flow. + +![prefect_2_flow_deployment](../meta/images/prefect_2_flow_deployment.png) + +Click on the flow to open it and then trigger your flow using the "RUN" > "QUICK RUN" button and leave the parameters by default. If you want to run a specific pipeline you can replace the `__default__` value. + +```{note} +Be sure that both your Prefect Server and Agent are up and running. +``` + +![prefect_2_flow_details](../meta/images/prefect_2_flow_details.png) diff --git a/docs/source/10_deployment/02_single_machine.md b/docs/source/deployment/single_machine.md similarity index 59% rename from docs/source/10_deployment/02_single_machine.md rename to docs/source/deployment/single_machine.md index 8c255a1600..0964a6a968 100644 --- a/docs/source/10_deployment/02_single_machine.md +++ b/docs/source/deployment/single_machine.md @@ -1,15 +1,15 @@ # Single-machine deployment This topic explains how to deploy Kedro on a production server. You can use three alternative methods to deploy your Kedro pipelines: -- Container based using [Kedro-Docker](https://github.com/quantumblacklabs/kedro-docker) -- Package based using [`kedro package`](../09_development/03_commands_reference.md#deploy-the-project) -- CLI based using the [Kedro CLI](../09_development/03_commands_reference.md) +- [Container-based deployment](#container-based) +- [Package-based deployment](#package-based) +- [CLI-based deployment](#cli-based) -## Container based +## Container-based This approach uses containers, such as [`Docker`](https://www.docker.com/) or any other container solution, to build an image and run the entire Kedro project in your preferred environment. -For the purpose of this walk-through, we are going to assume a `Docker` workflow. 
We recommend the [`Kedro-Docker`](https://github.com/quantumblacklabs/kedro-docker) plugin to streamline the process, and [usage instructions are in the plugin's README.md](https://github.com/quantumblacklabs/kedro-docker/blob/master/README.md). After you’ve built the Docker image for your project locally, transfer the image to the production server. You can do this as follows: +For the purpose of this walk-through, we are going to assume a Docker workflow. We recommend the [Kedro-Docker plugin](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) to streamline the process, and [usage instructions are in the plugin's README.md](https://github.com/kedro-org/kedro-plugins/blob/main/README.md). After you’ve built the Docker image for your project locally, transfer the image to the production server. You can do this as follows: ### How to use container registry A container registry allows you to store and share container images. [Docker Hub](https://www.docker.com/products/docker-hub) is one example of a container registry you can use for deploying your Kedro project. If you have a [Docker ID](https://docs.docker.com/docker-id) you can use it to push and pull your images from the Docker server using the following steps. @@ -32,45 +32,45 @@ Pull the image from Docker hub onto your production server: docker pull / ``` -```eval_rst -.. note:: Repositories on Docker Hub are set to public visibility by default. You can change your project to private on the Docker Hub website. +```{note} +Repositories on Docker Hub are set to public visibility by default. You can change your project to private on the Docker Hub website. ``` The procedure for using other container registries, like AWS ECR or GitLab Container Registry, will be almost identical to the steps described above. However, authentication will be different for each solution. -## Package based -If you prefer not to use containerisation, you can instead package your Kedro project by running the following in your project’s root directory: +## Package-based +If you prefer not to use containerisation, you can instead package your Kedro project using [`kedro package`](../development/commands_reference.md#deploy-the-project). + +Run the following in your project’s root directory: ```console kedro package ``` -Kedro builds the package into the `dist/` folder of your project, and creates one `.egg` file and one `.whl` file, which are [Python packaging formats for binary distribution](https://packaging.python.org/overview/). +Kedro builds the package into the `dist/` folder of your project, and creates a `.whl` file, which is [a Python packaging format for binary distribution](https://packaging.python.org/overview/). -The resulting package only contains the Python source code of your Kedro pipeline, not any of the `conf/`, `data/` and `logs/` subfolders nor the `pyproject.toml` file. This means that you can distribute the project to run elsewhere, such as on a separate computer with different configuration, data and logging. When distributed, the packaged project must be run from within a directory that contains the `pyproject.toml` file and `conf/` subfolder (and `data/` and `logs/` if your pipeline loads/saves local data or uses logging). This means that you will have to create these directories on the remote servers manually. +The resulting `.whl` package only contains the Python source code of your Kedro pipeline, not any of the `conf/` and `data/` subfolders nor the `pyproject.toml` file. 
+The project configuration is packaged separately in a `tar.gz` file. This compressed version of the config files excludes any files inside your `local` directory. +This means that you can distribute the project to run elsewhere, such as on a separate computer with different configuration, data and logging. When distributed, the packaged project must be run from within a directory that contains the `pyproject.toml` file and `conf/` subfolder (and `data/` if your pipeline loads/saves local data). This means that you will have to create these directories on the remote servers manually. -Recipients of the `.egg` and `.whl` files need to have Python and `pip` set up on their machines, but do not need to have Kedro installed. The project is installed to the root of a folder with the relevant `conf/`, `data/` and `logs/` subfolders, by navigating to the root and calling: +Recipients of the `.whl` file need to have Python and `pip` set up on their machines, but do not need to have Kedro installed. The project is installed to the root of a folder with the relevant `conf/` and `data/` subfolders, by navigating to the root and calling: ```console pip install ``` -Or when using the .egg file: - -```console -easy_install -``` - After having installed your project on the remote server, run the Kedro project as follows from the root of the project: ```console -python -m project_name.run +python -m project_name ``` -## CLI based -If neither containers nor packages are viable options for your project, you can also run it on a production server by cloning your project codebase to the server. You will need to follow these steps to get your project running: +## CLI-based +If neither containers nor packages are viable options for your project, you can also run it on a production server by cloning your project codebase to the server using the [Kedro CLI](../development/commands_reference.md). + +You will need to follow these steps to get your project running: -#### Use GitHub workflow to copy your project +### Use GitHub workflow to copy your project This workflow posits that development of the Kedro project is done on a local environment under version control by Git. Commits are pushed to a remote server (e.g. GitHub, GitLab, Bitbucket, etc.). Deployment of the (latest) code on a production server is accomplished through cloning and the periodic pulling of changes from the Git remote. The pipeline is then executed on the server. @@ -96,7 +96,7 @@ Finally clone the project to the server: git clone ``` -#### Install and run the Kedro project +### Install and run the Kedro project Once you have copied your Kedro project to the server, you need to follow these steps to install all project requirements and run the project. Install Kedro on the server using pip: @@ -114,7 +114,7 @@ conda install -c conda-forge kedro Install the project’s dependencies, by running the following in the project's root directory: ```console -kedro install +pip install -r src/requirements.txt ``` After having installed your project on the remote server you can run the Kedro project as follows from the root of the project: diff --git a/docs/source/deployment/vertexai.md b/docs/source/deployment/vertexai.md new file mode 100644 index 0000000000..97e2f7bfda --- /dev/null +++ b/docs/source/deployment/vertexai.md @@ -0,0 +1,6 @@ +# VertexAI + +Vertex AI pipelines is a Google Cloud Platform service that aims to deliver Kubeflow Pipelines functionality in a fully managed fashion. 
+ +## The `kedro-vertexai` plugin +The `kedro-vertexai` plugin from GetInData | Part of Xebia enables you to run a Kedro pipeline on Vertex AI Pipelines. Consult the [GitHub repository for `kedro-vertexai`](https://github.com/getindata/kedro-vertexai) for further details, or take a look at the [documentation](https://kedro-vertexai.readthedocs.io/). diff --git a/docs/source/development/automated_testing.md b/docs/source/development/automated_testing.md new file mode 100644 index 0000000000..6efcfa73b9 --- /dev/null +++ b/docs/source/development/automated_testing.md @@ -0,0 +1,167 @@ +# Automated Testing + +An important step towards achieving high code quality and maintainability in your Kedro project is the use of automated tests. Let's look at how you can set this up. +## Introduction + +Software testing is the process of checking that the code you have written fulfills its requirements. Software testing can either be **manual** or **automated**. In the context of Kedro: +- **Manual testing** is when you run part or all of your project and check that the results are what you expect. +- **Automated testing** is writing new code (using libraries called _testing frameworks_) that runs part or all of your project and automatically checks the results against what you expect. + +As a project grows larger, new code will increasingly rely on existing code. As these interdependencies grow, making changes in one part of the code base can unexpectedly break the intended functionality in another part. + +The major disadvantage of manual testing is that it is time-consuming. Manual tests are usually run once, directly after new functionality has been added. It is impractical to repeat manual tests for the entire code base each time a change is made, which means this strategy often misses breaking changes. + +The solution to this problem is automated testing. Automated testing allows many tests across the whole code base to be run in seconds, every time a new feature is added or an old one is changed. In this way, breaking changes can be discovered during development rather than in production. + +## Set up automated testing with `pytest` + +There are many testing frameworks available for Python. One of the most popular is `pytest` (see the [project's home page](https://docs.pytest.org/en/7.1.x/) for a quick overview). `pytest` is often used in Python projects for its short, readable tests and powerful set of features. + +Let's look at how you can start working with `pytest` in your Kedro project. + +### Install `pytest` + +Install `pytest` as you would install other packages with `pip`, making sure your [project's virtual environment is active](../get_started/install.md#create-a-virtual-environment-for-your-kedro-project). + +```bash +pip install pytest +``` + +### Create a `/tests` directory + +Now that `pytest` is installed, you will need a place to put your tests. Create a `/tests` folder in the `/src` directory of your project. + +```bash +mkdir /src/tests +``` + +### Test directory structure + +The subdirectories in your project's `/tests` directory should mirror the directory structure of your project's `/src/` directory. All files in the `/tests` folder should be named `test_.py`. See an example `/src` folder below. + +``` +src +│ ... +└─── +│ └───pipelines +│ └───dataprocessing +│ │ ... +│ │ nodes.py +│ │ ... +│ +└───tests +│ └───pipelines +│ └───dataprocessing +│ │ ... +│ │ test_nodes.py +│ │ ... 
+``` + +### Create an example test + +Now that you have a place to put your tests, you can create an example test in the new file `/src/tests/test_run.py`. The example test simply checks that the project_path attribute of a specially-defined `KedroContext` object has been correctly set. + +``` +import pytest +from kedro.config import ConfigLoader +from kedro.framework.context import KedroContext +from kedro.framework.hooks import _create_hook_manager + + +@pytest.fixture +def config_loader(): + return ConfigLoader(conf_source=str(Path.cwd())) + + +@pytest.fixture +def project_context(config_loader): + return KedroContext( + package_name=, + project_path=Path.cwd(), + config_loader=config_loader, + hook_manager=_create_hook_manager(), + ) + +class TestProjectContext: + def test_project_path(self, project_context): + assert project_context.project_path == Path.cwd() +``` + +This test is redundant, but it introduces a few of `pytest`'s core features and demonstrates the layout of a test file: +- [Fixtures](https://docs.pytest.org/en/7.1.x/explanation/fixtures.html#about-fixtures) are used to define resources used in tests. +- Tests are implemented in methods or functions beginning with `test_` and classes beginning with `Test`. +- The `assert` statement is used to compare the result of the test with an expected value. + +Tests should be named as descriptively as possible, especially if you are working with other people. For example, it is easier to understand the purpose of a test with the name `test_node_passes_with_valid_input` than a test with the name `test_passes`. + +You can read more about the [basics of using `pytest` on the getting started page](https://docs.pytest.org/en/7.1.x/getting-started.html). For help writing your own tests and using all of the features of `pytest`, see the [project documentation](https://docs.pytest.org/). + +### Run your tests + +To run your tests, run `pytest` from within your project's root directory. + +```bash +cd +pytest +``` + +If you created the example test in the previous section, you should see the following output in your shell. + +``` +============================= test session starts ============================== +... +collected 1 item + +src/tests/test_run.py . [100%] + +============================== 1 passed in 0.38s =============================== +``` + +This output indicates that one test ran successfully in the file `src/tests/test_run.py`. + +## Add test coverage reports with `pytest-cov` + +It can be useful to see how much of your project is covered by tests. For this, you can install and configure the [`pytest-cov`](https://pypi.org/project/pytest-cov/) plugin for `pytest`, which is based on the popular [`coverage.py` library](https://coverage.readthedocs.io/). + +### Install `pytest-cov` + +Install `pytest` as you would install other packages with pip, making sure your [project's virtual environment is active](../get_started/install.md#create-a-virtual-environment-for-your-kedro-project). + +```bash +pip install pytest-cov +``` + +### Configure `pytest` to use `pytest-cov` + +To configure `pytest` to generate a coverage report using `pytest-cov`, you can add the following lines to your `/pyproject.toml` file (creating it if it does not exist). + +``` +[tool.pytest.ini_options] +addopts = """ +--cov-report term-missing \ +--cov src/ -ra""" +``` + +### Run `pytest` with `pytest-cov` + +Running `pytest` in the spaceflights starter with `pytest-cov` installed results in the following additional report. 
+ +``` +Name Stmts Miss Cover Missing +-------------------------------------------------------------------------------------- +src/spaceflights/__init__.py 1 1 0% 4 +src/spaceflights/__main__.py 30 30 0% 4-47 +src/spaceflights/pipeline_registry.py 7 7 0% 2-16 +src/spaceflights/pipelines/__init__.py 0 0 100% +src/spaceflights/pipelines/data_processing/__init__.py 1 1 0% 3 +src/spaceflights/pipelines/data_processing/nodes.py 25 25 0% 1-67 +src/spaceflights/pipelines/data_processing/pipeline.py 5 5 0% 1-8 +src/spaceflights/pipelines/data_science/__init__.py 1 1 0% 3 +src/spaceflights/pipelines/data_science/nodes.py 20 20 0% 1-55 +src/spaceflights/pipelines/data_science/pipeline.py 8 8 0% 1-40 +src/spaceflights/settings.py 0 0 100% +-------------------------------------------------------------------------------------- +TOTAL 98 98 0% +``` + +This is the simplest report that `coverage.py` (via `pytest-cov`) will produce. It gives an overview of how many of the executable statements in each project file are covered by tests. For detail on the full set of features offered, see the [`coverage.py` docs](https://coverage.readthedocs.io/). diff --git a/docs/source/development/commands_reference.md b/docs/source/development/commands_reference.md new file mode 100644 index 0000000000..e109332a11 --- /dev/null +++ b/docs/source/development/commands_reference.md @@ -0,0 +1,554 @@ +# Kedro's command line interface + +Kedro's command line interface (CLI) is used to give commands to Kedro via a terminal shell (such as the terminal app on macOS, or cmd.exe or PowerShell on Windows). You need to use the CLI to set up a new Kedro project, and to run it. + +## Autocompletion (optional) + +If you are using macOS or Linux, you can set up your shell to autocomplete `kedro` commands. If you don't know the type of shell you are using, first type the following: + +```bash +echo $0 +``` + +
<details>
<summary>If you are using Bash (click to expand)</summary>

Add the following to your `~/.bashrc` (or just run it on the command line):

```bash
eval "$(_KEDRO_COMPLETE=source kedro)"
```

</details>

<details>
<summary>If you are using Z shell (ZSh) (click to expand)</summary>

Add the following to `~/.zshrc`:

```bash
eval "$(_KEDRO_COMPLETE=source_zsh kedro)"
```

</details>

<details>
<summary>If you are using Fish (click to expand)</summary>

Add the following to `~/.config/fish/completions/foo-bar.fish`:

```bash
eval (env _KEDRO_COMPLETE=source_fish kedro)
```

</details>
    + +## Invoke Kedro CLI from Python (optional) +You can invoke the Kedro CLI as a Python module: + +```bash +python -m kedro +``` + +## Kedro commands +Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. Project-specific commands are called from within a project directory and apply to that particular project. Global commands can be run anywhere and don't apply to any particular project: + +* Global Kedro commands + * [`kedro --help`](#get-help-on-kedro-commands) + * [`kedro --version`](#confirm-the-kedro-version) + * [`kedro info`](#confirm-kedro-information) + * [`kedro new`](#create-a-new-kedro-project) + +* Project-specific Kedro commands + * [`kedro activate-nbstripout`](#strip-output-cells)(deprecated from version 0.19.0) + * [`kedro build-docs`](#build-the-project-documentation) (deprecated from version 0.19.0) + * [`kedro build-reqs`](#build-the-projects-dependency-tree) (deprecated from version 0.19.0) + * [`kedro catalog list`](#list-datasets-per-pipeline-per-type) + * [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file) + * [`kedro ipython`](#notebooks) + * [`kedro jupyter convert`](#copy-tagged-cells) (deprecated from version 0.19.0) + * [`kedro jupyter lab`](#notebooks) + * [`kedro jupyter notebook`](#notebooks) + * [`kedro lint`](#lint-your-project) (deprecated from version 0.19.0) + * [`kedro micropkg package `](#package-a-micro-package) + * [`kedro micropkg pull `](#pull-a-micro-package) + * [`kedro package`](#deploy-the-project) + * [`kedro pipeline create `](#create-a-new-modular-pipeline-in-your-project) + * [`kedro pipeline delete `](#delete-a-modular-pipeline) + * [`kedro registry describe `](#describe-a-registered-pipeline) + * [`kedro registry list`](#list-all-registered-pipelines-in-your-project) + * [`kedro run`](#run-the-project) + * [`kedro test`](#test-your-project) (deprecated from version 0.19.0) + +## Global Kedro commands + +The following are Kedro commands that apply globally and can be run from any directory location. + +```{note} +You only need to use one of those given below (e.g. specify `kedro -V` **OR** `kedro --version`). +``` + +### Get help on Kedro commands + +```bash +kedro +kedro -h +kedro --help +``` + +### Confirm the Kedro version + +```bash +kedro -V +kedro --version +``` + +### Confirm Kedro information + +```bash +kedro info +``` +Returns output similar to the following, depending on the version of Kedro used and plugins installed. + +``` + _ _ +| | _____ __| |_ __ ___ +| |/ / _ \/ _` | '__/ _ \ +| < __/ (_| | | | (_) | +|_|\_\___|\__,_|_| \___/ +v0.18.11 + +Kedro is a Python framework for +creating reproducible, maintainable +and modular data science code. + +Installed plugins: +kedro_viz: 4.4.0 (hooks:global,line_magic) + +``` + +### Create a new Kedro project + +```bash +kedro new +``` + +## Customise or Override Project-specific Kedro commands + +```{note} +All project related CLI commands should be run from the project’s root directory. +``` + +Kedro's command line interface (CLI) allows you to associate a set of commands and dependencies with a target, which you can then execute from inside the project directory. + +The commands a project supports are specified on the framework side. If you want to customise any of the Kedro commands you can do this either by adding a file called `cli.py` or by injecting commands into it via the [`plugin` framework](../extend_kedro/plugins.md). Find the template for the `cli.py` file below. + +
    +Click to expand + +``` +"""Command line tools for manipulating a Kedro project. +Intended to be invoked via `kedro`.""" +import click +from kedro.framework.cli.project import ( + ASYNC_ARG_HELP, + CONFIG_FILE_HELP, + CONF_SOURCE_HELP, + FROM_INPUTS_HELP, + FROM_NODES_HELP, + LOAD_VERSION_HELP, + NODE_ARG_HELP, + PARAMS_ARG_HELP, + PIPELINE_ARG_HELP, + RUNNER_ARG_HELP, + TAG_ARG_HELP, + TO_NODES_HELP, + TO_OUTPUTS_HELP, + project_group, +) +from kedro.framework.cli.utils import ( + CONTEXT_SETTINGS, + _config_file_callback, + _get_values_as_tuple, + _reformat_load_versions, + _split_params, + env_option, + split_string, + split_node_names, +) +from kedro.framework.session import KedroSession +from kedro.utils import load_obj + + +@click.group(context_settings=CONTEXT_SETTINGS, name=__file__) +def cli(): + """Command line tools for manipulating a Kedro project.""" + + +@project_group.command() +@click.option( + "--from-inputs", type=str, default="", help=FROM_INPUTS_HELP, callback=split_string +) +@click.option( + "--to-outputs", type=str, default="", help=TO_OUTPUTS_HELP, callback=split_string +) +@click.option( + "--from-nodes", type=str, default="", help=FROM_NODES_HELP, callback=split_node_names +) +@click.option( + "--to-nodes", type=str, default="", help=TO_NODES_HELP, callback=split_node_names +) +@click.option("--node", "-n", "node_names", type=str, multiple=True, help=NODE_ARG_HELP) +@click.option( + "--runner", "-r", type=str, default=None, multiple=False, help=RUNNER_ARG_HELP +) +@click.option("--async", "is_async", is_flag=True, multiple=False, help=ASYNC_ARG_HELP) +@env_option +@click.option("--tag", "-t", type=str, multiple=True, help=TAG_ARG_HELP) +@click.option( + "--load-version", + "-lv", + type=str, + multiple=True, + help=LOAD_VERSION_HELP, + callback=_reformat_load_versions, +) +@click.option("--pipeline", "-p", type=str, default=None, help=PIPELINE_ARG_HELP) +@click.option( + "--config", + "-c", + type=click.Path(exists=True, dir_okay=False, resolve_path=True), + help=CONFIG_FILE_HELP, + callback=_config_file_callback, +) +@click.option( + "--conf-source", + type=click.Path(exists=True, file_okay=False, resolve_path=True), + help=CONF_SOURCE_HELP, +) +@click.option( + "--params", + type=click.UNPROCESSED, + default="", + help=PARAMS_ARG_HELP, + callback=_split_params, +) +# pylint: disable=too-many-arguments,unused-argument +def run( + tag, + env, + runner, + is_async, + node_names, + to_nodes, + from_nodes, + from_inputs, + to_outputs, + load_version, + pipeline, + config, + conf_source, + params, +): + """Run the pipeline.""" + + ##### ADD YOUR CUSTOM RUN COMMAND CODE HERE ##### + runner = load_obj(runner or "SequentialRunner", "kedro.runner") + + tag = _get_values_as_tuple(tag) if tag else tag + node_names = _get_values_as_tuple(node_names) if node_names else node_names + + with KedroSession.create( + env=env, conf_source=conf_source, extra_params=params + ) as session: + session.run( + tags=tag, + runner=runner(is_async=is_async), + node_names=node_names, + from_nodes=from_nodes, + to_nodes=to_nodes, + from_inputs=from_inputs, + to_outputs=to_outputs, + load_versions=load_version, + pipeline_name=pipeline, + ) + + +``` +
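+
+As well as customising `run`, you can attach entirely new project commands to the same `cli` group in `cli.py`. The following is a minimal sketch of the pattern; the `hello` command and its `--name` option are hypothetical and not part of Kedro:
+
+```python
+import click
+from kedro.framework.cli.utils import CONTEXT_SETTINGS
+
+
+@click.group(context_settings=CONTEXT_SETTINGS, name=__file__)
+def cli():
+    """Command line tools for manipulating a Kedro project."""
+
+
+@cli.command(name="hello")
+@click.option("--name", default="world", help="Name to greet.")
+def hello(name):
+    """Hypothetical example command that lives alongside the customised `run`."""
+    click.echo(f"Hello, {name}!")
+```
+
+From inside the project directory, the command then becomes available as `kedro hello --name Kedro`.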
    + +### Project setup + +#### Build the project's dependency tree + +```{note} +_This command will be deprecated from Kedro version 0.19.0._ +``` +```bash +kedro build-reqs +``` + +This command runs [`pip-compile`](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) on the project's `src/requirements.txt` file and will create `src/requirements.lock` with the compiled requirements. + +`kedro build-reqs` has two optional arguments to specify which file to compile the requirements from and where to save the compiled requirements to. These arguments are `--input-file` and `--output-file` respectively. + +`kedro build-reqs` also accepts and passes through CLI options accepted by `pip-compile`. For example, `kedro build-reqs --generate-hashes` will call `pip-compile --output-file=src/requirements.lock --generate-hashes src/requirements.txt`. + +#### Install all package dependencies + +The following runs [`pip`](https://github.com/pypa/pip) to install all package dependencies specified in `src/requirements.txt`: + +```bash +pip install -r src/requirements.txt +``` + +For further information, see the [documentation on installing project-specific dependencies](../kedro_project_setup/dependencies.md#install-project-specific-dependencies). + + +### Run the project +Call the `run()` method of the `KedroSession` defined in `kedro.framework.session`. + +```bash +kedro run +``` + +`KedroContext` can be extended in `run.py` (`src//run.py`). In order to use the extended `KedroContext`, you need to set `context_path` in the `pyproject.toml` configuration file. + +#### Modifying a `kedro run` + +Kedro has options to modify pipeline runs. Below is a list of CLI arguments supported out of the box. Note that the names inside angular brackets (`<>`) are placeholders, and you should replace these values with the +the names of relevant nodes, datasets, envs, etc. in your project. + +| CLI command | Description | +|---------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `kedro run --from-inputs=,` | A list of dataset names which should be used as a starting point | +| `kedro run --to-outputs=,` | A list of dataset names which should be used as an end point | +| `kedro run --from-nodes=,` | A list of node names which should be used as a starting point | +| `kedro run --to-nodes=,` | A list of node names which should be used as an end point | +| [DEPRECATED] `kedro run --node=,` | Run only nodes with specified names.
Multiple instances allowed. NOTE: This flag will be deprecated in `Kedro 0.19.0`. Use the following flag `--nodes` instead. | +| `kedro run --nodes=<node_name1>,<node_name2>` | Run only nodes with specified names. | +| `kedro run --runner=<runner_name>` | Run the pipeline with a specific runner | +| `kedro run --async` | Load and save node inputs and outputs asynchronously with threads | +| `kedro run --env=<env_name>` | Run the pipeline in the `env_name` environment. Defaults to `local` if not provided | +| [DEPRECATED] `kedro run --tag=<tag_name1>,<tag_name2>` | Run only nodes which have any of these tags attached. Multiple instances allowed. NOTE: This flag will be deprecated in `Kedro 0.19.0`. Use the following flag `--tags` instead. | +| `kedro run --tags=<tag_name1>,<tag_name2>` | Run only nodes which have any of these tags attached. | +| [DEPRECATED] `kedro run --load-version=<dataset_name>:YYYY-MM-DDThh.mm.ss.sssZ` | Specify a particular dataset version (timestamp) for loading. Multiple instances allowed.
    NOTE: This flag will be deprecated in `Kedro 0.19.0`. Use the following flag `--load-versions` instead. | +| `kedro run --load-versions=:YYYY-MM-DDThh.mm.ss.sssZ` | Specify particular dataset versions (timestamp) for loading. | +| `kedro run --pipeline=` | Run the whole pipeline by its name | +| `kedro run --namespace=` | Run only nodes with the specified namespace | +| `kedro run --config=.yml` | Specify all command line options in a named YAML configuration file | +| `kedro run --conf-source=` | Specify a new source directory for configuration files | +| `kedro run --conf-source=` | Only possible when using the [``OmegaConfigLoader``](../configuration/advanced_configuration.md#omegaconfigloader). Specify a compressed config file in `zip` or `tar` format. | +| `kedro run --params=:,:` | Does a parametrised kedro run with `{"param_key1": "value1", "param_key2": 2}`. These will take precedence over parameters defined in the `conf` directory. Additionally, dot (`.`) syntax can be used to address nested keys like `parent.child:value` | + +You can also combine these options together, so the following command runs all the nodes from `split` to `predict` and `report`: + +```bash +kedro run --from-nodes=split --to-nodes=predict,report +``` + +This functionality is extended to the `kedro run --config=config.yml` command, which allows you to [specify run commands in a configuration file](../nodes_and_pipelines/run_a_pipeline.md#configure-kedro-run-arguments). + +A parameterised run is best used for dynamic parameters, i.e. running the same pipeline with different inputs, for static parameters that do not change we recommend following the [Kedro project setup methodology](../configuration/parameters.md). + +### Deploy the project + +The following packages your application as one `.whl` file within the `dist/` folder of your project. It packages the project configuration separately in a `tar.gz` file: + +```bash +kedro package +``` + +See [the Python documentation for further information about packaging](https://packaging.python.org/overview/). + +### Pull a micro-package +Since Kedro 0.17.7 you can pull a micro-package into your Kedro project as follows: + +```bash +kedro micropkg pull +``` + +The above command will take the bundled `.tar.gz` file and do the following: + +* Place source code in `src//pipelines/` +* Place parameters in `conf/base/parameters/.yml` +* Pull out tests and place in `src/tests/pipelines/` + +`kedro micropkg pull` works with PyPI, local and cloud storage: + +* PyPI: `kedro micropkg pull ` with `` being a package on PyPI +* Local storage: `kedro micropkg pull dist/-0.1.tar.gz` +* Cloud storage: `kedro micropkg pull s3:///-0.1.tar.gz` + +### Project quality + +#### Build the project documentation + +```{note} +_This command will be deprecated from Kedro version 0.19.0._ +``` + +```bash +kedro build-docs +``` + +The `build-docs` command builds [project documentation](../tutorial/package_a_project.md#add-documentation-to-a-kedro-project) using the [Sphinx](https://www.sphinx-doc.org) framework. To further customise your documentation, please refer to `docs/source/conf.py` and the [Sphinx documentation](http://www.sphinx-doc.org/en/master/usage/configuration.html). + + +#### Lint your project + +```{note} +_This command will be deprecated from Kedro version 0.19.0._. 
We still recommend that you lint your project; see the [code formatting and linting documentation](../development/linting.md) for more help. +``` + +```bash +kedro lint +``` + +Your project is linted with [`black`](https://github.com/psf/black), [`flake8`](https://github.com/PyCQA/flake8) and [`isort`](https://github.com/PyCQA/isort). + + +#### Test your project + +```{note} +_This command will be deprecated from Kedro version 0.19.0._ +``` + +The following runs all `pytest` unit tests found in `src/tests`, including coverage (see the file `.coveragerc`): + +```bash +kedro test +``` + +### Project development + +#### Modular pipelines + +##### Create a new [modular pipeline](../nodes_and_pipelines/modular_pipelines) in your project + +```bash +kedro pipeline create <pipeline_name> +``` + +##### Package a micro-package +The following command packages all the files related to a micro-package, e.g. a modular pipeline, into a [Python source distribution file](https://packaging.python.org/overview/#python-source-distributions): + +```bash +kedro micropkg package <micro_package_name> +``` + +Further information is available in the [micro-packaging documentation](../nodes_and_pipelines/micro_packaging.md). + +##### Pull a micro-package in your project +The following command pulls all the files related to a micro-package, e.g. a modular pipeline, from either [PyPI](https://pypi.org/) or a storage location of a [Python source distribution file](https://packaging.python.org/overview/#python-source-distributions). + +```bash +kedro micropkg pull <package_name> (or path to an sdist file) +``` + +Further information is available in [the micro-packaging documentation](../nodes_and_pipelines/micro_packaging.md). + +##### Delete a modular pipeline +The following command deletes all the files related to a modular pipeline in your Kedro project. + +```bash +kedro pipeline delete <pipeline_name> +``` + +Further information is available in [the micro-packaging documentation](../nodes_and_pipelines/micro_packaging.md). + +#### Registered pipelines + +##### Describe a registered pipeline + +```bash +kedro registry describe <pipeline_name> +``` +The output includes all the nodes in the pipeline. If no pipeline name is provided, this command returns all nodes in the `__default__` pipeline. + +##### List all registered pipelines in your project + +```bash +kedro registry list +``` + +#### Datasets + +##### List datasets per pipeline per type + +```bash +kedro catalog list +``` +The results include datasets that are/aren't used by a specific pipeline. + +The command also accepts an optional `--pipeline` argument that allows you to specify the pipeline name(s) (comma-separated values) in order to filter datasets used only by those named pipeline(s). For example: + +```bash +kedro catalog list --pipeline=ds,de +``` + +#### Data Catalog + +##### Create a Data Catalog YAML configuration file + +The following command creates a Data Catalog YAML configuration file with `MemoryDataSet` datasets for each dataset in a registered pipeline, if it is missing from the `DataCatalog`. + +```bash +kedro catalog create --pipeline=<pipeline_name> +``` + +The command also accepts an optional `--env` argument that allows you to specify a configuration environment (defaults to `base`).
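+
+For example, the following call (with an illustrative pipeline name and environment) writes catalog entries for any datasets in the `data_processing` pipeline that are missing from the catalog, using the `local` environment:
+
+```bash
+kedro catalog create --pipeline=data_processing --env=local
+```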
+ +The command creates the following file: `//catalog/.yml` + +#### Notebooks + +To start a Jupyter Notebook: + +```bash +kedro jupyter notebook +``` + +To start JupyterLab: + +```bash +kedro jupyter lab +``` + +To start an IPython shell: + +```bash +kedro ipython +``` + +The [Kedro IPython extension](../notebooks_and_ipython/kedro_and_notebooks.md#a-custom-kedro-kernel) makes the following variables available in your IPython or Jupyter session: + +* `catalog` (type `DataCatalog`): [Data Catalog](../data/data_catalog.md) instance that contains all defined datasets; this is a shortcut for `context.catalog` +* `context` (type `KedroContext`): Kedro project context that provides access to Kedro's library components +* `pipelines` (type `Dict[str, Pipeline]`): Pipelines defined in your [pipeline registry](../nodes_and_pipelines/run_a_pipeline.md#run-a-pipeline-by-name) +* `session` (type `KedroSession`): [Kedro session](../kedro_project_setup/session.md) that orchestrates a pipeline run + +To reload these variables (e.g. if you updated `catalog.yml`) use the `%reload_kedro` line magic, which can also be used to see the error message if any of the variables above are undefined. + +##### Copy tagged cells + +```{note} +_This command will be deprecated from Kedro version 0.19.0._ +``` + +To copy the code from [cells tagged](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#cell-tags) with a `node` tag into Python files under `src//nodes/` in a Kedro project: + +```bash +kedro jupyter convert --all +``` + +##### Strip output cells + +```{note} +_This command will be deprecated from Kedro version 0.19.0._ +``` + +Output cells of Jupyter Notebook should not be tracked by git, especially if they contain sensitive information. To strip them out: + +```bash +kedro activate-nbstripout +``` + +This command adds a `git hook` which clears all notebook output cells before committing anything to `git`. It needs to run only once per local repository. diff --git a/docs/source/09_development/04_debugging.md b/docs/source/development/debugging.md similarity index 79% rename from docs/source/09_development/04_debugging.md rename to docs/source/development/debugging.md index 4c888ebc5a..5f464b9fb2 100644 --- a/docs/source/09_development/04_debugging.md +++ b/docs/source/development/debugging.md @@ -1,19 +1,15 @@ # Debugging -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - ## Introduction If you're running your Kedro pipeline from the CLI or you can't/don't want to run Kedro from within your IDE debugging framework, it can be hard to debug your Kedro pipeline or nodes. This is particularly frustrating because: -* If you have long running nodes or pipelines, inserting `print` statements and running them multiple times quickly becomes a time-consuming procedure. +* If you have long running nodes or pipelines, inserting `print` statements and running them multiple times quickly becomes time-consuming. * Debugging nodes outside the `run` session isn't very helpful because getting access to the local scope within the `node` can be hard, especially if you're dealing with large data or memory datasets, where you need to chain a few nodes together or re-run your pipeline to produce the data for debugging purposes. 
-This guide provides examples on how to instantiate a [post-mortem](https://docs.python.org/3/library/pdb.html#pdb.post_mortem) debugging session with [`pdb`](https://docs.python.org/3/library/pdb.html) using [Hooks](../07_extend_kedro/02_hooks.md) when an uncaught error occurs during a pipeline run. Note that [ipdb](https://pypi.org/project/ipdb/) could be integrated in the same manner. +This guide provides examples on [how to instantiate a post-mortem debugging session](https://docs.python.org/3/library/pdb.html#pdb.post_mortem) with [`pdb`](https://docs.python.org/3/library/pdb.html) using [Kedro Hooks](../hooks/introduction.md) when an uncaught error occurs during a pipeline run. Note that [ipdb](https://pypi.org/project/ipdb/) could be integrated in the same manner. -If you are looking for guides on how to setup debugging with IDEs, please visit the guide for [VSCode](./01_set_up_vscode.md#debugging) and [PyCharm](./02_set_up_pycharm.md#debugging). +For guides on how to set up debugging with IDEs, please visit the [guide for debugging in VSCode](./set_up_vscode.md#debugging) and the [guide for debugging in PyCharm](./set_up_pycharm.md#debugging). ## Debugging Node diff --git a/docs/source/development/index.md b/docs/source/development/index.md new file mode 100644 index 0000000000..ccdeeecbfc --- /dev/null +++ b/docs/source/development/index.md @@ -0,0 +1,12 @@ +# Development + +```{toctree} +:maxdepth: 1 + +set_up_vscode +set_up_pycharm +commands_reference +debugging +automated_testing +linting +``` diff --git a/docs/source/development/linting.md b/docs/source/development/linting.md new file mode 100644 index 0000000000..d795086b51 --- /dev/null +++ b/docs/source/development/linting.md @@ -0,0 +1,124 @@ +# Code formatting and linting + +## Introduction + +Code formatting guidelines set a standard for the layout of your code, for stylistic elements such as use of line breaks and whitespace. Format doesn't have any impact on how the code works, but using a consistent style makes your code more readable, and makes it more likely to be reused. + +Linting tools check your code for errors such as a missing bracket or line indent. This can save time and frustration because you can catch errors in advance of running the code. + +As a project grows and goes through various stages of development it becomes important to maintain code quality. Using a consistent format and linting your code ensures that it is consistent, readable, and easy to debug and maintain. + +## Set up Python tools +There are a variety of Python tools available to use with your Kedro projects. This guide shows you how to use +[`black`](https://github.com/psf/black), [`ruff`](https://beta.ruff.rs). +- **`black`** is a [PEP 8](https://peps.python.org/pep-0008/) compliant opinionated Python code formatter. `black` can +check for styling inconsistencies and reformat your files in place. +[You can read more in the `black` documentation](https://black.readthedocs.io/en/stable/). +- **`ruff`** is a fast linter that replaces `flake8`, `pylint`, `pyupgrade`, `isort` and [more](https://beta.ruff.rs/docs/rules/). + - It helps to make your code compliant to [`pep8`](https://pypi.org/project/pep8/). + - It reformats code by sorting imports alphabetically and automatically separating them into sections by +type. [You can read more in the `isort` documentation](https://pycqa.github.io/isort/). 
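+
+For example, once both tools are installed (installation and configuration steps follow below), a typical invocation looks like this sketch; the `src/` path is illustrative and `ruff check --fix` assumes a reasonably recent `ruff` release:
+
+```bash
+black src/             # reformat code in place
+ruff check --fix src/  # lint the code and auto-fix what ruff can, such as import order
+```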
+ + +### Install the tools +Install `black` and `ruff` by adding the following lines to your project's `src/requirements.txt` +file: +```text +black # Used for formatting code +ruff # Used for linting, formatting and sorting module imports + +``` +To install all the project-specific dependencies, including the linting tools, navigate to the root directory of the +project and run: +```bash +pip install -r src/requirements.txt +``` +Alternatively, you can individually install the linting tools using the following shell commands: +```bash +pip install black ruff +``` +#### Configure `ruff` +`ruff` reads its configuration from `pyproject.toml` within your project root. You can enable different rule sets within the `[tool.ruff]` section. For example, the rule set `F` is equivalent to `Pyflakes`. + +To start with `ruff`, we recommend adding this section to enable a few basic rule sets: +```toml +[tool.ruff] +select = [ + "F", # Pyflakes + "E", # Pycodestyle + "W", # Pycodestyle + "UP", # pyupgrade + "I", # isort + "PL", # Pylint +] +ignore = ["E501"] # black takes care of line-too-long +``` + +```{note} +It is good practice to [split lines that are too long](https://beta.ruff.rs/docs/rules/line-too-long/) so that they can be read easily, even on a small screen. `ruff` treats this slightly differently from `black`; when using the two together, we recommend disabling this rule (`E501`) to avoid conflicts. +``` + +#### Configure `flake8` + +Store your `flake8` configuration in a file named `setup.cfg` within your project root. The Kedro starters use the [following configuration](https://github.com/kedro-org/kedro-starters/blob/main/pandas-iris/%7B%7B%20cookiecutter.repo_name%20%7D%7D/setup.cfg): + +```text +[flake8] +max-line-length=88 +extend-ignore=E203 +``` + +### Run the tools +Use the following commands to run lint checks: +```bash +black --check <project_root> +isort --profile black --check <project_root> +``` +You can also have `black` and `isort` automatically format your code by omitting the `--check` flag. Since `isort` and +`black` both format your imports, adding `--profile black` to the `isort` run helps avoid potential conflicts. + +Use the following to invoke `flake8`: +```bash +flake8 <project_root> +``` + +## Automated formatting and linting with `pre-commit` hooks + +You can automate the process of formatting and linting with [`pre-commit`](https://github.com/pre-commit/pre-commit) hooks. +These hooks are run before committing your code to your repositories to automatically point out formatting issues, +making code reviews easier and less time-consuming. + +### Install `pre-commit` +You can install `pre-commit` along with other dependencies by including it in the `src/requirements.txt` file of your +Kedro project by adding the following line: +```text +pre-commit +``` +You can also install `pre-commit` using the following command: +```bash +pip install pre-commit +``` +### Add `pre-commit` configuration file +Create a file named `.pre-commit-config.yaml` in your Kedro project root directory. You can add entries for the hooks +you want to run before each `commit`. +Below is a sample `YAML` file with entries for `ruff` and `black`: +```yaml +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version.
+ rev: v0.0.270 + hooks: + - id: ruff + + - repo: https://github.com/psf/black + rev: 22.8.0 + hooks: + - id: black + language_version: python3.9 +``` +### Install git hook scripts +Run the following command to complete installation: +```bash +pre-commit install +``` +This enables `pre-commit` hooks to run automatically every time you execute `git commit`. diff --git a/docs/source/09_development/02_set_up_pycharm.md b/docs/source/development/set_up_pycharm.md similarity index 79% rename from docs/source/09_development/02_set_up_pycharm.md rename to docs/source/development/set_up_pycharm.md index 87e9c6c16f..28d936bf22 100644 --- a/docs/source/09_development/02_set_up_pycharm.md +++ b/docs/source/development/set_up_pycharm.md @@ -1,9 +1,5 @@ # Set up PyCharm -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - This section will present a quick guide on how to configure [PyCharm](https://www.jetbrains.com/pycharm/) as a development environment for working on Kedro projects. Open a new project directory in PyCharm. You will need to add your **Project Interpreter**, so go to **PyCharm | Preferences** for macOS or **File | Settings** for Windows and Linux: @@ -42,9 +38,9 @@ Finally, in the **Project Explorer** right-click on `src` and then go to **Mark ## Set up Run configurations -[PyCharm Run configurations](https://www.jetbrains.com/help/pycharm/creating-and-editing-run-debug-configurations.html) allow you to execute preconfigured scripts rapidly in your IDE with a click of a button. This may be useful for testing, running and packaging your Kedro projects. +[PyCharm Run configurations](https://www.jetbrains.com/help/pycharm/creating-run-debug-configuration-for-tests.html) allow you to execute preconfigured scripts rapidly in your IDE with a click of a button. This may be useful for testing, running and packaging your Kedro projects. -Here we will walk you through an example of how to setup Run configuration for Kedro CLI `run` command, however it is also applicable to other Kedro commands: `test`, `install`, `package`, `build-docs`. +Here we will walk you through an example of how to set up Run configuration for the Kedro CLI `run` command. It is also applicable to other Kedro commands, such as `test` or `install`. Go to **Run | Edit Configurations**: @@ -70,6 +66,10 @@ Edit the new Run configuration as follows: Replace **Script path** with path obtained above and **Working directory** with the path of your project directory and then click **OK**. +```{note} +**Emulate terminal in output console** enables PyCharm to show [rich terminal output](../logging/index.md). +``` + To execute the Run configuration, select it from the **Run / Debug Configurations** dropdown in the toolbar (if that toolbar is not visible, you can enable it by going to **View > Toolbar**). Click the green triangle: ![](../meta/images/pycharm_conf_run_button.png) @@ -96,8 +96,8 @@ Then click the bug button in the toolbar (![](../meta/images/pycharm_debugger_bu ## Advanced: Remote SSH interpreter -```eval_rst -.. note:: This section uses features supported in PyCharm Professional Edition only. +```{note} +This section uses features supported in PyCharm Professional Edition only. ``` Firstly, add an SSH interpreter. Go to **Preferences | Project Interpreter** as above and proceed to add a new interpreter. 
Select **SSH Interpreter** and fill in details of the remote computer: @@ -120,13 +120,13 @@ Click **OK** and then select **Remote Run** from the toolbar and click **Run** t ![](../meta/images/pycharm_remote_run.png) -To remotely debug, click the debugger button as [described above](#debugging). +[To debug remotely, click the debugger button as described above](#debugging). ## Advanced: Docker interpreter -```eval_rst -.. note:: This section uses features supported by PyCharm Professional Edition only. +```{note} +This section uses features supported by PyCharm Professional Edition only. ``` First, add a Docker interpreter. Go to **Preferences | Project Interpreter** as above and proceed to add a new interpreter. Select **Docker Interpreter** and then choose the target Docker image: @@ -147,16 +147,32 @@ Click **OK** and then select your run configuration from the toolbar and click * [To debug, click the debugger button as described above](#debugging). +## Configure Python Console + +You can configure Pycharm's IPython to load Kedro's Extension. + +Click **PyCharm | Preferences** for macOS or **File | Settings**, inside **Build, Execution, Deployment** and **Console**, enter the **Python Console** configuration. + +You can append the configuration necessary to use Kedro IPython to the **Starting script**: + +``` +%load_ext kedro.ipython +``` + +With this configuration, when you create a Python Console you should be able to use context, session and catalog. + +![](../meta/images/pycharm_ipython_working_example.png) + ## Configuring the Kedro catalog validation schema You can enable the Kedro catalog validation schema in your PyCharm IDE to enable real-time validation, autocompletion and see information about the different fields in your `catalog` as you write it. To enable this, open a `catalog.yml` file and you should see "No JSON Schema" in the bottom right corner of your window. Click it and select "Edit Schema Mapping". ![](../meta/images/pycharm_edit_schema_mapping.png) -Add a new mapping using the "+" button in the top left of the window and select the name you want for it. Enter this URL `https://raw.githubusercontent.com/quantumblacklabs/kedro/develop/static/jsonschema/kedro-catalog-0.17.json` in the "Schema URL" field and select "JSON Schema Version 7" in the "Schema version" field. +Add a new mapping using the "+" button in the top left of the window and select the name you want for it. Enter this URL `https://raw.githubusercontent.com/kedro-org/kedro/develop/static/jsonschema/kedro-catalog-0.18.json` in the "Schema URL" field and select "JSON Schema Version 7" in the "Schema version" field. Add the following file path pattern to the mapping: `conf/**/*catalog*`. ![](../meta/images/pycharm_catalog_schema_mapping.png) -> Different schemas for different Kedro versions can be found [here](https://github.com/quantumblacklabs/kedro/tree/master/static/jsonschema). +> [Different schemas for different Kedro versions can be found in the Kedro repository](https://github.com/kedro-org/kedro/tree/main/static/jsonschema). diff --git a/docs/source/09_development/01_set_up_vscode.md b/docs/source/development/set_up_vscode.md similarity index 83% rename from docs/source/09_development/01_set_up_vscode.md rename to docs/source/development/set_up_vscode.md index f9f4daf869..4db2474bcc 100644 --- a/docs/source/09_development/01_set_up_vscode.md +++ b/docs/source/development/set_up_vscode.md @@ -1,10 +1,5 @@ # Set up Visual Studio Code - -```eval_rst -.. 
note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - Start by opening a new project directory in VS Code and installing the Python plugin under **Tools and languages**: ![](../meta/images/vscode_startup.png) @@ -15,7 +10,7 @@ At this stage, you should be able to see the `conda` environment that you have c ![](../meta/images/vscode_setup_interpreter.png) -### Advanced: For those using `venv` / `virtualenv` +## Advanced: For those using `venv` / `virtualenv` We're going to show you how to get your virtual environments to show up in your Python interpreter in VS Code. You do this by opening [`settings.json`](https://code.visualstudio.com/docs/getstarted/settings#_settings-file-locations) and adding the following: @@ -112,12 +107,12 @@ PYTHONPATH=/path/to/project/src:$PYTHONPATH PYTHONPATH=C:/path/to/project/src;%PYTHONPATH% ``` -You can find more information about setting up environmental variables [here](https://code.visualstudio.com/docs/python/environments#_environment-variable-definitions-file). +You can find [more information about setting up environmental variables in the VSCode documentation](https://code.visualstudio.com/docs/python/environments#_environment-variable-definitions-file). Go to **Debug > Add Configurations**. -```eval_rst -.. note:: If you encounter the following error: ``Cannot read property 'openConfigFile' of undefined``, you can manually create ``launch.json`` file in ``.vscode`` directory and paste the configuration from below. +```{note} +If you encounter the following error: `Cannot read property 'openConfigFile' of undefined`, you can manually create `launch.json` file in `.vscode` directory and paste the configuration from below. ``` Edit the `launch.json` that opens in the editor with: @@ -169,7 +164,7 @@ First install the `ptvsd` Python library on both the local and remote computer u python -m pip install --upgrade ptvsd ``` -Go to the Debugger Configurations as described [above](#debugging). Add the following to the `configurations` array in `launch.json`: +[Go to the Debugger Configurations as described in the debugging section above](#debugging). Add the following to the `configurations` array in `launch.json`: ``` { @@ -204,7 +199,7 @@ ptvsd.wait_for_attach() Ensure both computers (the computer you are working on and the remote computer executing your code) have the same source code. For example, you can use `scp` to sync your code: ```console -scp -r /path/to/ @:projects/ +scp -r @:projects/ ``` ❗The example above assumes there is a directory called `projects` in the home directory of the user account on the remote computer. This is where the project will be copied to. This can be set up as a deploy task as described above: @@ -214,15 +209,15 @@ scp -r /path/to/ @:projects/ { "label": "Deploy", "type": "shell", - "command": "scp -r /path/to/ @:projects/", + "command": "scp -r @:projects/", } ``` -```eval_rst -.. note:: `There is also a third-party plugin for VS Code that supports remote workspaces `_. 
+```{note} +[There is also a third-party plugin for VS Code that supports remote workspaces.](https://marketplace.visualstudio.com/items?itemName=Liveecommerce.vscode-remote-workspace) ``` Start executing the pipeline on your remote computer: @@ -247,7 +242,7 @@ Go to the **Debugging** section in VS Code and select the newly created remote d ![](../meta/images/vscode_remote_debugger.png) -You will need to set a breakpoint in VS Code as described [above](#debugging) and start the debugger by clicking the green play triangle: +You must [set a breakpoint in VS Code as described in the debugging section above](#debugging) and start the debugger by clicking the green play triangle: [Find more information on debugging in VS Code](https://code.visualstudio.com/docs/python/debugging). @@ -260,11 +255,11 @@ Enter the following in your `settings.json` file: ```json { "yaml.schemas": { - "https://raw.githubusercontent.com/quantumblacklabs/kedro/develop/static/jsonschema/kedro-catalog-0.17.json": "conf/**/*catalog*" + "https://raw.githubusercontent.com/kedro-org/kedro/develop/static/jsonschema/kedro-catalog-0.18.json": "conf/**/*catalog*" } } ``` and start editing your `catalog` files. -> Different schemas for different Kedro versions can be found [here](https://github.com/quantumblacklabs/kedro/tree/master/static/jsonschema). +> [Different schemas for different Kedro versions can be found in the Kedro repository](https://github.com/kedro-org/kedro/tree/main/static/jsonschema). diff --git a/docs/source/experiment_tracking/index.md b/docs/source/experiment_tracking/index.md new file mode 100644 index 0000000000..a8e94dd05b --- /dev/null +++ b/docs/source/experiment_tracking/index.md @@ -0,0 +1,354 @@ +# Experiment tracking in Kedro-Viz + + +Experiment tracking is the process of saving all the metadata related to an experiment each time you run it. It enables you to compare different runs of a machine-learning model as part of the experimentation process. + +The metadata you store may include: + +* Scripts used for running the experiment +* Environment configuration files +* Versions of the data used for training and evaluation +* Evaluation metrics +* Model weights +* Plots and other visualisations + +You can use Kedro-Viz experiment tracking to store and access results, and to share them with others for comparison. Storage can be local or remote, such as cloud storage on AWS S3. + +Kedro's [experiment tracking demo](https://demo.kedro.org/experiment-tracking) enables you to explore the experiment tracking capabilities of Kedro-Viz. + +![](../meta/images/experiment-tracking_demo.gif) + +## Kedro versions supporting experiment tracking +Kedro has always supported parameter versioning (as part of your codebase with a version control system like `git`) and Kedro’s dataset versioning capabilities enabled you to [snapshot models, datasets and plots](../data/data_catalog.md#version-datasets-and-ml-models). + +Kedro-Viz version 4.1.1 introduced metadata capture, visualisation, discovery and comparison, enabling you to access, edit and [compare your experiments](#access-run-data-and-compare-runs) and additionally [track how your metrics change over time](#view-and-compare-metrics-data). + +Kedro-Viz version 5.0 also supports the [display and comparison of plots, such as Plotly and Matplotlib](../visualisation/visualise_charts_with_plotly.md). Support for metric plots (timeseries and parellel coords) was added to Kedro-Viz version 5.2.1. 
+ +Kedro-Viz version 6.2 includes support for collaborative experiment tracking using a cloud storage solution. This means that multiple users can store their experiment data in a centralized remote storage, such as AWS S3, and access it through Kedro-Viz. + +## When should I use experiment tracking in Kedro? + +The choice of experiment tracking tool depends on your use case and choice of complementary tools, such as MLflow and Neptune: + +- **Kedro** - If you need experiment tracking, are looking for improved metrics visualisation and want a lightweight tool to work alongside existing functionality in Kedro. Kedro does not support a model registry. +- **MLflow** - You can combine MLFlow with Kedro by using [`kedro-mlflow`](https://kedro-mlflow.readthedocs.io/en/stable/) if you require experiment tracking, model registry and/or model serving capabilities or have access to Managed MLflow within the Databricks ecosystem. +- **Neptune** - If you require experiment tracking and model registry functionality, improved visualisation of metrics and support for collaborative data science, you may consider [`kedro-neptune`](https://docs.neptune.ai/integrations/kedro/) for your workflow. + +[We support a growing list of integrations](../extend_kedro/plugins.md). + +## Set up a project + +This section describes the steps necessary to set up experiment tracking and access logged metrics, using the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) with a version of Kedro equal to or higher than 0.18.4, and a version of Kedro-Viz equal to or higher than 5.2. + +There are three steps to enable experiment tracking features with Kedro-Viz. We illustrate how to: + +- [Set up a session store to capture experiment metadata](#set-up-the-session-store) +- [Set up experiment tracking datasets to list the metrics to track](#set-up-experiment-tracking-datasets) +- [Modify your nodes and pipelines to output those metrics](#modify-your-nodes-and-pipelines-to-log-metrics) + +### Install Kedro and Kedro-Viz +To use this tutorial code, you must already have [installed Kedro](../get_started/install.md) and [Kedro-Viz](../visualisation/kedro-viz_visualisation.md). You can confirm the versions you have installed by running `kedro info` + +```{note} +The example code uses a version of Kedro-Viz `>6.2.0`. +``` + +Create a new project using the spaceflights starter. From the terminal run: + +```bash +kedro new --starter=spaceflights +``` + +Feel free to name your project as you like, but this guide assumes the project is named `Spaceflights`. + +### Install the dependencies for the project + +Once you have created the project, to run project-specific Kedro commands, you must navigate to the directory in which it has been created: + +```bash +cd spaceflights +``` +Install the project's dependencies: + +```bash +pip install -r src/requirements.txt +``` + +## Set up the session store + +In the domain of experiment tracking, each pipeline run is considered a session. A session store records all related metadata for each pipeline run, from logged metrics to other run-related data such as timestamp, `git` username and branch. The session store is a [SQLite](https://www.sqlite.org/index.html) database that is generated during your first pipeline run after it has been set up in your project. 
+ +### Local storage +To set up the session store locally, go to the `src/spaceflights/settings.py` file and add the following: + +```python +from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore +from pathlib import Path + +SESSION_STORE_CLASS = SQLiteStore +SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")} +``` + +This specifies the creation of the `SQLiteStore` under the `data` subfolder, using the `SQLiteStore` setup from your installed Kedro-Viz plugin + +This step is crucial to enable experiment tracking features on Kedro-Viz, as it is the database used to serve all run data to the Kedro-Viz front-end. Once this step is complete, you can either proceed to [set up the tracking datasets](#set-up-experiment-tracking-datasets) or [set up your nodes and pipelines to log metrics](#modify-your-nodes-and-pipelines-to-log-metrics); these two activities are interchangeable, but both should be completed to get a working experiment tracking setup. + + +## Collaborative experiment tracking + +```{note} +To use collaborative experiment tracking, ensure that your installed version of Kedro-Viz is `>=6.2.0`. +``` + +For collaborative experiment tracking, Kedro-Viz saves your experiments as SQLite database files on a central cloud storage. To ensure that all users have a unique filename, set up your `KEDRO_SQLITE_STORE_USERNAME` in the environment variables. By default, Kedro-Viz will take your computer user name if this is not specified. + +> Note: In Kedro-Viz version 6.2, the only way to set up credentials for accessing your cloud storage is through environment variables. + +```bash +export KEDRO_SQLITE_STORE_USERNAME="your_unique__username" + +``` + +Now specify a remote path in the `SESSION_STORE_ARGS` variable, which links to your cloud storage. + + +```python +from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore +from pathlib import Path + +SESSION_STORE_CLASS = SQLiteStore +SESSION_STORE_ARGS = { + "path": str(Path(__file__).parents[2] / "data"), + "remote_path": "s3://my-bucket-name/path/to/experiments", +} +``` + +Finally, ensure you have the necessary credentials set up as shown below: + +```bash +export AWS_ACCESS_KEY_ID="your_access_key_id" +export AWS_SECRET_ACCESS_KEY="your_secret_access_key" +export AWS_REGION="your_aws_region" + +``` + +## Set up experiment tracking datasets + +There are two types of tracking datasets: [`tracking.MetricsDataSet`](/kedro.extras.datasets.tracking.MetricsDataSet) and [`tracking.JSONDataSet`](/kedro.extras.datasets.tracking.JSONDataSet). The `tracking.MetricsDataSet` should be used for tracking numerical metrics, and the `tracking.JSONDataSet` can be used for tracking any other JSON-compatible data like boolean or text-based data. + +Set up two datasets to log the columns used in the companies dataset (`companies_columns`) and experiment metrics for the data science pipeline (`metrics`) like the coefficient of determination (`r2 score`), max error (`me`) and mean absolute error (`mae`) by adding the following in the `conf/base/catalog.yml` file: + +```yaml +metrics: + type: tracking.MetricsDataSet + filepath: data/09_tracking/metrics.json + +companies_columns: + type: tracking.JSONDataSet + filepath: data/09_tracking/companies_columns.json +``` + +## Modify your nodes and pipelines to log metrics + +Now that you have set up the tracking datasets to log experiment tracking data, next ensure that the data is returned from your nodes. 
+ +Set up the data to be logged for the metrics dataset - under `nodes.py` of your `data_science` pipeline (`src/spaceflights/pipelines/data_science/nodes.py`), add three different metrics to your `evaluate_model` function to log `r2_score`, `mae` and `me` and return these 3 metrics as key-value pairs. + +The new `evaluate_model` function should look like this: + +```python +from sklearn.metrics import mean_absolute_error, max_error + + +def evaluate_model( + regressor: LinearRegression, X_test: pd.DataFrame, y_test: pd.Series +) -> Dict[str, float]: + """Calculates and logs the coefficient of determination. + + Args: + regressor: Trained model. + X_test: Testing data of independent features. + y_test: Testing data for price. + """ + y_pred = regressor.predict(X_test) + score = r2_score(y_test, y_pred) + mae = mean_absolute_error(y_test, y_pred) + me = max_error(y_test, y_pred) + logger = logging.getLogger(__name__) + logger.info("Model has a coefficient R^2 of %.3f on test data.", score) + return {"r2_score": score, "mae": mae, "max_error": me} +``` + +Next, ensure that the dataset is also specified as an output of your `evaluate_model` node. In the `src/spaceflights/pipelines/data_science/pipeline.py` file, specify the `output` of your `evaluate_model` to be the `metrics` dataset. Note that the output dataset must exactly match the name of the tracking dataset specified in the catalog file. + +The node of the `evaluate_model` on the pipeline should look like this: + +```python +node( + func=evaluate_model, + inputs=["regressor", "X_test", "y_test"], + name="evaluate_model_node", + outputs="metrics", +) +``` + +Repeat the same steps to set up the `companies_column` dataset. For this dataset, log the column that contains the list of companies as outlined in the `companies.csv` file under the `data/01_raw` directory. Modify the `preprocess_companies` node under the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/nodes.py`) to return the data under a key-value pair, as shown below: + +```python +from typing import Tuple, Dict + + +def preprocess_companies(companies: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]: + """Preprocesses the data for companies. + + Args: + companies: Raw data. + Returns: + Preprocessed data, with `company_rating` converted to a float and + `iata_approved` converted to boolean. + """ + companies["iata_approved"] = _is_true(companies["iata_approved"]) + companies["company_rating"] = _parse_percentage(companies["company_rating"]) + return companies, {"columns": companies.columns.tolist(), "data_type": "companies"} +``` + +Again, you must ensure that the dataset is also specified as an output on the `pipeline.py` file under the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/pipeline.py`), as follows: + +```python +node( + func=preprocess_companies, + inputs="companies", + outputs=["preprocessed_companies", "companies_columns"], + name="preprocess_companies_node", +) +``` + +Having set up both datasets, you can now generate your first set of experiment tracking data! + +## Generate the run data + +The beauty of native experiment tracking in Kedro is that all tracked data is generated and stored each time you do a Kedro run. Hence, to generate the data, you need only execute: + +```bash +kedro run +``` + +After the run completes, under `data/09_tracking`, you can now see two folders, `companies_column.json` and `metrics.json`. 
On performing a pipeline run after setting up the tracking datasets, Kedro generates a folder with the dataset name for each tracked dataset. Each folder of the tracked dataset contains folders named by the timestamp of each pipeline run to store the saved metrics of the dataset, and each future pipeline run generates a new timestamp folder with the JSON file of the saved metrics under the folder of its subsequent tracked dataset. + +You can also see the `session_store.db` generated from your first pipeline run after enabling experiment tracking, which is used to store all the generated run metadata, alongside the tracking dataset, to be used for exposing experiment tracking to Kedro-Viz. + +![](../meta/images/experiment-tracking-folder.png) + +Execute `kedro run` a few times in a row to generate a larger set of experiment data. You can also play around with setting up different tracking datasets, and check the logged data via the generated JSON data files. + +## Access run data and compare runs + +Here comes the fun part of accessing your run data on Kedro-Viz. Having generated some run data, execute the following command: + +```bash +kedro viz +``` + +When you open the Kedro-Viz web app, you see an experiment tracking icon on the left-hand side of the screen. + +![](../meta/images/experiment-tracking-icon.png) + +Click the icon to go to the experiment tracking page (you can also access the page from your browser at `http://127.0.0.1:4141/experiment-tracking`), where you can see the sets of experiment data generated from all previous runs: + +![](../meta/images/experiment-tracking-runs-list.png) + +You can now access, compare and pin your runs by toggling the `Compare runs` button: + +![](../meta/images/experiment-tracking-compare-runs.png) + +## View and compare plots + +In this section, we illustrate how to compare Matplotlib plots across experimental runs (functionality available since Kedro-Viz version 5.0). 
+ +### Update the dependencies + +Update the `src/requirements.txt` file in your Kedro project by adding the following dataset to enable Matplotlib for your project: + +```text +kedro-datasets[matplotlib.MatplotlibWriter]~=1.1 +seaborn~=0.12.1 +``` + +And install the requirements with: + +```bash +pip install -r src/requirements.txt +``` + +### Add a plotting node + +Add a new node to the `data_processing` nodes (`src/spaceflights/pipelines/data_processing/nodes.py`): + +```python +import matplotlib.pyplot as plt +import seaborn as sn + + +def create_confusion_matrix(companies: pd.DataFrame): + actuals = [0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1] + predicted = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1] + data = {"y_Actual": actuals, "y_Predicted": predicted} + df = pd.DataFrame(data, columns=["y_Actual", "y_Predicted"]) + confusion_matrix = pd.crosstab( + df["y_Actual"], df["y_Predicted"], rownames=["Actual"], colnames=["Predicted"] + ) + sn.heatmap(confusion_matrix, annot=True) + return plt +``` + +And now add this node to the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/pipeline.py`): + +```python +from .nodes import create_confusion_matrix + +node( + func=create_confusion_matrix, + inputs="companies", + outputs="confusion_matrix", +), +``` + +In the catalog (`conf/base/catalog.yml`) add the `confusion_matrix` data definition, making sure to set the versioned flag to `true` within the project catalog to include the plot in experiment tracking: + +```yaml +confusion_matrix: + type: matplotlib.MatplotlibWriter + filepath: data/09_tracking/confusion_matrix.png + versioned: true +``` + +After running the pipeline with `kedro run`, the plot is saved and you can see it in the experiment tracking panel when you execute `kedro viz`. Clicking on a plot expands it. When in comparison view, expanding a plot shows all the plots in that view for side-by-side comparison. + +![](../meta/images/experiment-tracking-plots-comparison.png) + +![](../meta/images/experiment-tracking-plots-comparison-expanded.png) + +## View and compare metrics data + +From Kedro-Viz `>=5.2.1` experiment tracking also supports the display and comparison of metrics data through two chart types: time series and parallel coordinates. + +Time series displays one metric per graph, showing how the metric value has changed over time. + +Parallel coordinates displays all metrics on a single graph, with each vertical line representing one metric with its own scale. The metric values are positioned along those vertical lines and connected across each axis. + +When in comparison view, comparing runs highlights your selections on the respective chart types, improving readability even in the event there is a multitude of data points. + +```{note} +The following graphic is taken from the [Kedro-Viz experiment tracking demo](https://demo.kedro.org/experiment-tracking) (it is not a visualisation from the example code you created above). +``` + +![](../meta/images/experiment-tracking-metrics-comparison.gif) + +Additionally, you can monitor the changes to metrics over time from the pipeline visualisation tab which you can access by following the icon on the left-hand side of the screen. 
+ +![](../meta/images/pipeline_visualisation_icon.png) + +Clicking on any `MetricsDataset` node opens a side panel displaying how the metric value has changed over time: + +![](../meta/images/pipeline_show_metrics.gif) diff --git a/docs/source/extend_kedro/common_use_cases.md b/docs/source/extend_kedro/common_use_cases.md new file mode 100644 index 0000000000..04b36d6ca5 --- /dev/null +++ b/docs/source/extend_kedro/common_use_cases.md @@ -0,0 +1,42 @@ +# Common use cases + +Kedro has a few built-in mechanisms for you to extend its behaviour. This document explains how to select which mechanism to employ for the most common use cases. + +## Use Case 1: How to add extra behaviour to Kedro's execution timeline + +The execution timeline of a Kedro pipeline can be thought of as a sequence of actions performed by various Kedro library components, such as the [DataSets](/kedro_datasets), [DataCatalog](/kedro.io.DataCatalog), [Pipeline](/kedro.pipeline.Pipeline), [Node](/kedro.pipeline.node.Node) and [KedroContext](/kedro.framework.context.KedroContext). + +At different points in the lifecycle of these components, you might want to add extra behaviour: for example, you could add extra computation for profiling purposes _before_ and _after_ a node runs, or _before_ and _after_ the I/O actions of a dataset, namely the `load` and `save` actions. + +This can now achieved by using [Hooks](../hooks/introduction.md), to define the extra behaviour and when in the execution timeline it should be introduced. + +## Use Case 2: How to integrate Kedro with additional data sources + +You can use [DataSets](/kedro_datasets) to interface with various different data sources. If the data source you plan to use is not supported out of the box by Kedro, you can [create a custom dataset](custom_datasets.md). + +## Use Case 3: How to add or modify CLI commands + +If you want to customise a built-in Kedro command, such as `kedro run`, for a specific project, add a `cli.py` file that defines a custom `run()` function. You should add the `cli.py` file at the same level as `settings.py`, which is usually the `src/PROJECT_NAME` directory. See the [template for the `cli.py` file](../development/commands_reference.md#customise-or-override-project-specific-kedro-commands). + + +If you want to customise a Kedro command from a command group, such as `kedro pipeline` or `kedro jupyter`, you need to import the corresponding click command group from the Kedro framework `cli`. For `kedro pipeline` commands this would be `from kedro.framework.cli.pipeline import pipeline`, and for `kedro jupyter` commands `from kedro.framework.cli.jupyter import jupyter`. Note that you must still add the `cli` click group from the snippet above, even if you don't modify it. + +You can then add or overwrite any command by adding it to the click group, as in the snippet below: +``` +@jupyter.command("notebook") +@env_option( + help="Open a notebook" +) +def notebook_run(...): + == ADD YOUR CUSTOM NOTEBOOK COMMAND CODE HERE == +``` + +To inject additional CLI commands intended to be reused across projects, please refer to [our plugin system](./plugins.md). An example of one such command is the `kedro viz` command introduced by the [Kedro-Viz plugin](https://github.com/kedro-org/kedro-viz). This command is intended to work on every Kedro project and therefore must be a standalone plugin. + +```{note} +Your plugin's implementation can take advantage of other extension mechanisms such as Hooks. 
+``` + +## Use Case 4: How to customise the initial boilerplate of your project + +Sometimes you might want to tailor the starting boilerplate of a Kedro project to your specific needs. For example, your organisation might have a standard CI script that you want to include in every new Kedro project. To this end, please visit the [guide for creating Kedro starters](../kedro_project_setup/starters.md#how-to-create-a-kedro-starter) to solve this extension requirement. diff --git a/docs/source/07_extend_kedro/03_custom_datasets.md b/docs/source/extend_kedro/custom_datasets.md similarity index 69% rename from docs/source/07_extend_kedro/03_custom_datasets.md rename to docs/source/extend_kedro/custom_datasets.md index 02a2ce9390..9e4b0713eb 100644 --- a/docs/source/07_extend_kedro/03_custom_datasets.md +++ b/docs/source/extend_kedro/custom_datasets.md @@ -1,6 +1,6 @@ # Custom datasets -Kedro supports many [datasets](/kedro.extras.datasets) out of the box, but you may find that you need to create a custom dataset. For example, you may need to handle a proprietary data format or filesystem in your pipeline, or perhaps you have found a particular use case for a dataset that Kedro does not support. This tutorial explains how to create a custom dataset to read and save image data. +[Kedro supports many datasets](/kedro_datasets) out of the box, but you may find that you need to create a custom dataset. For example, you may need to handle a proprietary data format or filesystem in your pipeline, or perhaps you have found a particular use case for a dataset that Kedro does not support. This tutorial explains how to create a custom dataset to read and save image data. ## Scenario @@ -8,7 +8,7 @@ In this example, we use a [Kaggle dataset of Pokémon images and types](https:// ## Project setup -We assume that you have already [installed Kedro](../02_get_started/02_install.md). Now [create a project](../02_get_started/04_new_project.md) (feel free to name your project as you like, but here we will assume the project's repository name is `kedro-pokemon`). +We assume that you have already [installed Kedro](../get_started/install.md). Now [create a project](../get_started/new_project.md) (feel free to name your project as you like, but here we will assume the project's repository name is `kedro-pokemon`). Log into your Kaggle account to [download the Pokémon dataset](https://www.kaggle.com/vishalsubbiah/pokemon-images-and-types) and unzip it into `data/01_raw`, within a subfolder named `pokemon-images-and-types`. The data comprises a single `pokemon.csv` file plus a subfolder of images. @@ -30,20 +30,23 @@ At the minimum, a valid Kedro dataset needs to subclass the base [AbstractDataSe * `_save` * `_describe` +`AbstractDataSet` is generically typed with an input data type for saving data, and an output data type for loading data. +This typing is optional however, and defaults to `Any` type. + Here is an example skeleton for `ImageDataSet`:
    Click to expand ```python -from typing import Any, Dict, List +from typing import Any, Dict import numpy as np -from kedro.io import AbstractVersionedDataSet +from kedro.io import AbstractDataSet -class ImageDataSet(AbstractVersionedDataSet): +class ImageDataSet(AbstractDataSet[np.ndarray, np.ndarray]): """``ImageDataSet`` loads / save image data from a given filepath as `numpy` array using Pillow. Example: @@ -90,7 +93,7 @@ src/kedro_pokemon/extras ## Implement the `_load` method with `fsspec` -Many of the built-in Kedro datasets rely on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) as a consistent interface to different data sources, as described earlier in the section about the [Data Catalog](../05_data/01_data_catalog.md#specifying-the-location-of-the-dataset). In this example, it's particularly convenient to use `fsspec` in conjunction with `Pillow` to read image data, since it allows the dataset to work flexibly with different image locations and formats. +Many of the built-in Kedro datasets rely on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) as a consistent interface to different data sources, as described earlier in the section about the [Data Catalog](../data/data_catalog.md#specify-the-location-of-the-dataset). In this example, it's particularly convenient to use `fsspec` in conjunction with `Pillow` to read image data, since it allows the dataset to work flexibly with different image locations and formats. Here is the implementation of the `_load` method using `fsspec` and `Pillow` to read the data of a single image into a `numpy` array: @@ -99,21 +102,17 @@ Here is the implementation of the `_load` method using `fsspec` and `Pillow` to ```python from pathlib import PurePosixPath - -from kedro.io.core import ( - AbstractVersionedDataSet, - get_filepath_str, - get_protocol_and_path, -) +from typing import Any, Dict import fsspec import numpy as np - -# PIL is the package from Pillow from PIL import Image +from kedro.io import AbstractDataSet +from kedro.io.core import get_filepath_str, get_protocol_and_path -class ImageDataSet(AbstractVersionedDataSet): + +class ImageDataSet(AbstractDataSet[np.ndarray, np.ndarray]): def __init__(self, filepath: str): """Creates a new instance of ImageDataSet to load / save image data for given filepath. @@ -133,10 +132,12 @@ class ImageDataSet(AbstractVersionedDataSet): Data from the image file as a numpy array """ # using get_filepath_str ensures that the protocol and path are appended correctly for different filesystems - load_path = get_filepath_str(self._get_load_path(), self._protocol) + load_path = get_filepath_str(self._filepath, self._protocol) with self._fs.open(load_path) as f: image = Image.open(f).convert("RGBA") return np.asarray(image) + + ... ```
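+
+Before wiring the dataset into the catalog, a quick way to sanity-check the `_load` implementation is to instantiate the class directly and call `load()`, which delegates to `_load()`. The snippet below is a minimal sketch; the import path and the `pikachu.png` file name are assumptions based on the project layout described above.
+
+```python
+# Assumed module location: src/kedro_pokemon/extras/datasets/image_dataset.py
+from kedro_pokemon.extras.datasets.image_dataset import ImageDataSet
+
+dataset = ImageDataSet(
+    filepath="data/01_raw/pokemon-images-and-types/images/images/pikachu.png"
+)
+image = dataset.load()  # AbstractDataSet.load() calls the _load() implemented above
+print(image.shape)  # e.g. (height, width, 4) after the RGBA conversion
+```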
    @@ -168,16 +169,11 @@ Similarly, we can implement the `_save` method as follows: ```python -import numpy as np -from PIL import Image -from kedro.io.core import AbstractVersionedDataSet, get_filepath_str - - -class ImageDataSet(AbstractVersionedDataSet): +class ImageDataSet(AbstractDataSet[np.ndarray, np.ndarray]): def _save(self, data: np.ndarray) -> None: """Saves image data to the specified filepath.""" # using get_filepath_str ensures that the protocol and path are appended correctly for different filesystems - save_path = get_filepath_str(self._get_save_path(), self._protocol) + save_path = get_filepath_str(self._filepath, self._protocol) with self._fs.open(save_path, "wb") as f: image = Image.fromarray(data) image.save(f) @@ -197,10 +193,7 @@ You can open the file to verify that the data was written back correctly. The `_describe` method is used for printing purposes. The convention in Kedro is for the method to return a dictionary describing the attributes of the dataset. ```python -from kedro.io import AbstractVersionedDataSet - - -class ImageDataSet(AbstractVersionedDataSet): +class ImageDataSet(AbstractDataSet[np.ndarray, np.ndarray]): def _describe(self) -> Dict[str, Any]: """Returns a dict that describes the attributes of the dataset.""" return dict(filepath=self._filepath, protocol=self._protocol) @@ -217,18 +210,15 @@ Here is the full implementation of our basic `ImageDataSet`: from pathlib import PurePosixPath from typing import Any, Dict -from kedro.io.core import ( - AbstractVersionedDataSet, - get_filepath_str, - get_protocol_and_path, -) - import fsspec import numpy as np from PIL import Image +from kedro.io import AbstractDataSet +from kedro.io.core import get_filepath_str, get_protocol_and_path -class ImageDataSet(AbstractVersionedDataSet): + +class ImageDataSet(AbstractDataSet[np.ndarray, np.ndarray]): """``ImageDataSet`` loads / save image data from a given filepath as `numpy` array using Pillow. Example: @@ -243,7 +233,6 @@ class ImageDataSet(AbstractVersionedDataSet): Args: filepath: The location of the image file to load / save data. """ - # parse the path and protocol (e.g. file, http, s3, etc.) protocol, path = get_protocol_and_path(filepath) self._protocol = protocol self._filepath = PurePosixPath(path) @@ -255,16 +244,14 @@ class ImageDataSet(AbstractVersionedDataSet): Returns: Data from the image file as a numpy array """ - # using get_filepath_str ensures that the protocol and path are appended correctly for different filesystems - load_path = get_filepath_str(self._get_load_path(), self._protocol) + load_path = get_filepath_str(self._filepath, self._protocol) with self._fs.open(load_path, mode="r") as f: image = Image.open(f).convert("RGBA") return np.asarray(image) def _save(self, data: np.ndarray) -> None: """Saves image data to the specified filepath.""" - # using get_filepath_str ensures that the protocol and path are appended correctly for different filesystems - save_path = get_filepath_str(self._get_save_path(), self._protocol) + save_path = get_filepath_str(self._filepath, self._protocol) with self._fs.open(save_path, mode="wb") as f: image = Image.fromarray(data) image.save(f) @@ -279,7 +266,7 @@ class ImageDataSet(AbstractVersionedDataSet): Currently, the `ImageDataSet` only works with a single image, but this example needs to load all Pokemon images from the raw data directory for further processing. 
-Kedro's [`PartitionedDataSet`](../05_data/02_kedro_io.md#partitioned-dataset) is a convenient way to load multiple separate data files of the same underlying dataset type into a directory. +Kedro's [`PartitionedDataSet`](../data/kedro_io.md#partitioned-dataset) is a convenient way to load multiple separate data files of the same underlying dataset type into a directory. To use `PartitionedDataSet` with `ImageDataSet` to load all Pokemon PNG images, add this to the data catalog YAML so that `PartitionedDataSet` loads all PNG files from the data directory using `ImageDataSet`: @@ -310,10 +297,10 @@ $ ls -la data/01_raw/pokemon-images-and-types/images/images/*.png | wc -l ## Versioning -```eval_rst -.. note:: Versioning doesn't work with `PartitionedDataSet`. You can't use both of them at the same time. +```{note} +Versioning doesn't work with `PartitionedDataSet`. You can't use both of them at the same time. ``` -To add [Versioning](../05_data/02_kedro_io.md#versioning) support to the new dataset we need to extend the +To add [Versioning](../data/kedro_io.md#versioning) support to the new dataset we need to extend the [AbstractVersionedDataSet](/kedro.io.AbstractVersionedDataSet) to: * Accept a `version` keyword argument as part of the constructor @@ -329,15 +316,15 @@ The following amends the full implementation of our basic `ImageDataSet`. It now from pathlib import PurePosixPath from typing import Any, Dict -from kedro.io import AbstractVersionedDataSet, Version -from kedro.io.core import get_protocol_and_path - import fsspec import numpy as np from PIL import Image +from kedro.io import AbstractVersionedDataSet +from kedro.io.core import get_filepath_str, get_protocol_and_path, Version -class ImageDataSet(AbstractVersionedDataSet): + +class ImageDataSet(AbstractVersionedDataSet[np.ndarray, np.ndarray]): """``ImageDataSet`` loads / save image data from a given filepath as `numpy` array using Pillow. Example: @@ -370,14 +357,14 @@ class ImageDataSet(AbstractVersionedDataSet): Returns: Data from the image file as a numpy array """ - load_path = self._get_load_path() + load_path = get_filepath_str(self._get_load_path(), self._protocol) with self._fs.open(load_path, mode="r") as f: image = Image.open(f).convert("RGBA") return np.asarray(image) def _save(self, data: np.ndarray) -> None: """Saves image data to the specified filepath.""" - save_path = self._get_save_path() + save_path = get_filepath_str(self._get_save_path(), self._protocol) with self._fs.open(save_path, mode="wb") as f: image = Image.fromarray(data) image.save(f) @@ -390,9 +377,84 @@ class ImageDataSet(AbstractVersionedDataSet): ``` -The graphic shows the differences between the original `ImageDataSet` and the versioned `ImageDataSet`: +The difference between the original `ImageDataSet` and the versioned `ImageDataSet` is as follows: -![](../meta/images/diffs-graphic.png) + +
    +Click to expand + +```diff + from pathlib import PurePosixPath + from typing import Any, Dict + + import fsspec + import numpy as np + from PIL import Image + +-from kedro.io import AbstractDataSet +-from kedro.io.core import get_filepath_str, get_protocol_and_path ++from kedro.io import AbstractVersionedDataSet ++from kedro.io.core import get_filepath_str, get_protocol_and_path, Version + + +-class ImageDataSet(AbstractDataSet[np.ndarray, np.ndarray]): ++class ImageDataSet(AbstractVersionedDataSet[np.ndarray, np.ndarray]): + """``ImageDataSet`` loads / save image data from a given filepath as `numpy` array using Pillow. + + Example: + :: + + >>> ImageDataSet(filepath='/img/file/path.png') + """ + +- def __init__(self, filepath: str): ++ def __init__(self, filepath: str, version: Version = None): + """Creates a new instance of ImageDataSet to load / save image data for given filepath. + + Args: + filepath: The location of the image file to load / save data. ++ version: The version of the dataset being saved and loaded. + """ + protocol, path = get_protocol_and_path(filepath) + self._protocol = protocol +- self._filepath = PurePosixPath(path) + self._fs = fsspec.filesystem(self._protocol) + ++ super().__init__( ++ filepath=PurePosixPath(path), ++ version=version, ++ exists_function=self._fs.exists, ++ glob_function=self._fs.glob, ++ ) ++ + def _load(self) -> np.ndarray: + """Loads data from the image file. + + Returns: + Data from the image file as a numpy array + """ +- load_path = get_filepath_str(self._filepath, self._protocol) ++ load_path = get_filepath_str(self._get_load_path(), self._protocol) + with self._fs.open(load_path, mode="r") as f: + image = Image.open(f).convert("RGBA") + return np.asarray(image) + + def _save(self, data: np.ndarray) -> None: + """Saves image data to the specified filepath.""" +- save_path = get_filepath_str(self._filepath, self._protocol) ++ save_path = get_filepath_str(self._get_save_path(), self._protocol) + with self._fs.open(save_path, mode="wb") as f: + image = Image.fromarray(data) + image.save(f) + + def _describe(self) -> Dict[str, Any]: + """Returns a dict that describes the attributes of the dataset.""" +- return dict(filepath=self._filepath, protocol=self._protocol) ++ return dict( ++ filepath=self._filepath, version=self._version, protocol=self._protocol ++ ) +``` +
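+
+As a quick illustration of the versioning behaviour added above, the sketch below instantiates the versioned dataset directly and pins an explicit load version while leaving the save version to be generated automatically. The timestamp and file path are assumptions for this example.
+
+```python
+from kedro.io.core import Version
+
+dataset = ImageDataSet(
+    filepath="data/01_raw/pokemon-images-and-types/images/images/pikachu.png",
+    version=Version(load="2020-02-22T00.00.00.000Z", save=None),  # save=None generates a new timestamp on save
+)
+image = dataset.load()  # resolves to <filepath>/2020-02-22T00.00.00.000Z/pikachu.png
+```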
    To test the code, you need to enable versioning support in the data catalog: @@ -405,8 +467,8 @@ pikachu: versioned: true ``` -```eval_rst -.. note:: Using an HTTP(S)-based ``filepath`` with ``versioned: true`` is NOT supported. +```{note} +Using an HTTP(S)-based `filepath` with `versioned: true` is NOT supported. ``` Create an initial version of the data by creating an example first version (e.g. `2020-02-22T00.00.00.000Z`): @@ -436,13 +498,13 @@ In [2]: context.catalog.save('pikachu', data=img) Inspect the content of the data directory to find a new version of the data, written by `save`. -You may also want to consult the [in-depth documentation about the Versioning API](../05_data/02_kedro_io.md#versioning). +You may also want to consult the [in-depth documentation about the Versioning API](../data/kedro_io.md#versioning). ## Thread-safety Kedro datasets should work with the [SequentialRunner](/kedro.runner.SequentialRunner) and the [ParallelRunner](/kedro.runner.ParallelRunner), so they must be fully serialisable by the [Python multiprocessing package](https://docs.python.org/3/library/multiprocessing.html). This means that your datasets should not make use of lambda functions, nested functions, closures etc. If you are using custom decorators, you need to ensure that they are using [`functools.wraps()`](https://docs.python.org/3/library/functools.html#functools.wraps). -There is one dataset that is an exception: [SparkDataSet](/kedro.extras.datasets.spark.SparkDataSet). The explanation for this exception is that [Apache Spark](https://spark.apache.org/) uses its own parallelism and therefore doesn't work with Kedro [ParallelRunner](/kedro.runner.ParallelRunner). For parallelism within a Kedro project that leverages Spark please consider the alternative [ThreadRunner](/kedro.runner.ThreadRunner). +There is one dataset that is an exception: [SparkDataSet](/kedro_datasets.spark.SparkDataSet). The explanation for this exception is that [Apache Spark](https://spark.apache.org/) uses its own parallelism and therefore doesn't work with Kedro [ParallelRunner](/kedro.runner.ParallelRunner). For parallelism within a Kedro project that leverages Spark please consider the alternative [ThreadRunner](/kedro.runner.ThreadRunner). To verify whether your dataset is serialisable by `multiprocessing`, use the console or an iPython session to try dumping it using `multiprocessing.reduction.ForkingPickler`: @@ -500,27 +562,37 @@ class ImageDataSet(AbstractVersionedDataSet): ... ``` -We provide additional examples of [how to use parameters through the data catalog's YAML API](../05_data/01_data_catalog.md#using-the-data-catalog-with-the-yaml-api). For an example of how to use these parameters in your dataset's constructor, please see the [SparkDataSet](/kedro.extras.datasets.spark.SparkDataSet)'s implementation. +We provide additional examples of [how to use parameters through the data catalog's YAML API](../data/data_catalog.md#use-the-data-catalog-with-the-yaml-api). For an example of how to use these parameters in your dataset's constructor, please see the [SparkDataSet](/kedro_datasets.spark.SparkDataSet)'s implementation. ## How to contribute a custom dataset implementation -One of the easiest ways to contribute back to Kedro is to share a custom dataset. Kedro has a :code:`kedro.extras.datasets` sub-package where you can add a new custom dataset implementation to share it with others. 
You can find out more in the [Kedro contribution guide](https://github.com/quantumblacklabs/kedro/blob/master/CONTRIBUTING.md) on Github. +One of the easiest ways to contribute back to Kedro is to share a custom dataset. Kedro has a `kedro-datasets` package in +[`kedro-plugins` repository](https://github.com/kedro-org/kedro-plugins) where you can add a new custom dataset +implementation to share it with others. You can find out more in the [Kedro contribution guide on GitHub](https://github.com/kedro-org/kedro/blob/main/CONTRIBUTING.md). To contribute your custom dataset: -1. Add your dataset package to `kedro/extras/datasets/`. +1. Add your dataset package to `kedro-plugins/kedro-datasets/kedro_datasets/`. For example, in our `ImageDataSet` example, the directory structure should be: ``` -kedro/extras/datasets/image +kedro-plugins/kedro-datasets/kedro_datasets/image ├── __init__.py └── image_dataset.py ``` 2. If the dataset is complex, create a `README.md` file to explain how it works and document its API. -3. The dataset should be accompanied by full test coverage in `tests/extras/datasets`. +3. The dataset should be accompanied by full test coverage in `kedro-plugins/kedro-datasets/tests/`. + +4. Make a pull request against the `main` branch of [Kedro's plugin repository](https://github.com/kedro-org/kedro-plugins). + +```{note} +There are two special considerations when contributing a dataset: -4. Make a pull request against the `master` branch of [Kedro's Github repository](https://github.com/quantumblacklabs/kedro). + 1. Add the dataset to `kedro_datasets.rst` so it shows up in the API documentation. + 2. Add the dataset to `static/jsonschema/kedro-catalog-X.json` for IDE validation. + +``` diff --git a/docs/source/extend_kedro/index.md b/docs/source/extend_kedro/index.md new file mode 100644 index 0000000000..f368ac9a73 --- /dev/null +++ b/docs/source/extend_kedro/index.md @@ -0,0 +1,9 @@ +# Extend Kedro + +```{toctree} +:maxdepth: 1 + +common_use_cases +custom_datasets +plugins +``` diff --git a/docs/source/extend_kedro/plugins.md b/docs/source/extend_kedro/plugins.md new file mode 100644 index 0000000000..51cb3b1946 --- /dev/null +++ b/docs/source/extend_kedro/plugins.md @@ -0,0 +1,231 @@ +# Kedro plugins + +Kedro plugins allow you to create new features for Kedro and inject additional commands into the CLI. Plugins are developed as separate Python packages that exist outside of any Kedro project. + +## Overview + +Kedro's extension mechanism is built on [`pluggy`](https://pluggy.readthedocs.io/), a solid plugin management library that was created for the [pytest](https://docs.pytest.org/) ecosystem. `pluggy` relies on [entry points](https://packaging.python.org/en/latest/specifications/entry-points/), a Python mechanism for packages to provide components that can be discovered by other packages using [`importlib.metadata`](https://docs.python.org/3/library/importlib.metadata.html#entry-points). 
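+
+To see the discovery mechanism in action, the short sketch below lists whatever objects are registered under one of Kedro's entry point groups. It is purely illustrative and assumes Python 3.10+ for the `group=` selection keyword.
+
+```python
+from importlib.metadata import entry_points
+
+# Iterate over plugin command groups registered under Kedro's "project commands" group.
+# On Python 3.8/3.9, entry_points() returns a dict keyed by group name instead.
+for entry_point in entry_points(group="kedro.project_commands"):
+    print(entry_point.name, "->", entry_point.value)
+    commands = entry_point.load()  # imports the click group the plugin exposes
+```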
+ +## Example of a simple plugin + +Here is a simple example of a plugin that prints the pipeline as JSON: + +`kedrojson/plugin.py` + +```python +import click +from kedro.framework.project import pipelines + + +@click.group(name="JSON") +def commands(): + pass + + +@commands.command() +@click.pass_obj +def to_json(metadata): + """Display the pipeline in JSON format""" + pipeline = pipelines["__default__"] + print(pipeline.to_json()) +``` + +The plugin provides the following `entry_points` config in `setup.py`: + +```python +setup( + entry_points={"kedro.project_commands": ["kedrojson = kedrojson.plugin:commands"]} +) +``` + +Once the plugin is installed, you can run it as follows: +```bash +kedro to_json +``` + +## Extend starter aliases +It is possible to extend the list of starter aliases built into Kedro. This means that a [custom Kedro starter](../kedro_project_setup/starters.md#how-to-create-a-kedro-starter) can be used directly through the `starter` argument in `kedro new` rather than needing to explicitly provide the `template` and `directory` arguments. A custom starter alias behaves in the same way as an official Kedro starter alias and is also picked up by `kedro starter list`. + +You need to extend the starters by providing a list of `KedroStarterSpec`; in this example, it is defined in a file called `plugin.py`. + +Example for a non-git repository starter: +```python +# plugin.py +from kedro.framework.cli.starters import KedroStarterSpec + +starters = [ + KedroStarterSpec( + alias="test_plugin_starter", + template_path="your_local_directory/starter_folder", + ) +] +``` + +Example for a git repository starter: +```python +# plugin.py +starters = [ + KedroStarterSpec( + alias="test_plugin_starter", + template_path="https://github.com/kedro-org/kedro-starters/", + directory="pandas-iris", + ) +] +``` + +The `directory` argument is optional and should be used when you have multiple templates in one repository as for the [official kedro-starters](https://github.com/kedro-org/kedro-starters). If you only have one template, your top-level directory will be treated as the template. For an example, see the [pandas-iris starter](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris). + +In your `setup.py`, you need to register the specifications to `kedro.starters`. + +```python +setup( + entry_points={"kedro.starters": ["starter = plugin:starters"]}, +) +``` + +After that you can use this starter with `kedro new --starter=test_plugin_starter`. + +```{note} +If your starter lives on a git repository, by default Kedro attempts to use a tag or branch labelled with your version of Kedro, e.g. `0.18.11`. This means that you can host different versions of your starter template on the same repository, and the correct one will automatically be used. If you do not wish to follow this structure, you should override it with the `checkout` flag, e.g. `kedro new --starter=test_plugin_starter --checkout=main`. +``` + +## Working with `click` + +Commands must be provided as [`click` `Groups`](https://click.palletsprojects.com/en/7.x/api/#click.Group). + +The `click Group` will be merged into the main CLI Group. In the process, the options on the group are lost, as is any processing that was done as part of its callback function.
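+
+The sketch below is a hypothetical illustration of that caveat: the option and callback body attached to the group itself are discarded when the group is merged, whereas each subcommand keeps its own options and callback.
+
+```python
+import click
+
+
+@click.group(name="myplugin")
+@click.option("--verbose", is_flag=True)  # group-level option: lost after merging
+def commands(verbose):
+    """Any processing here runs only when the group is invoked standalone."""
+
+
+@commands.command()
+@click.option("--limit", default=10)  # subcommand options are preserved
+def report(limit):
+    """Subcommands keep their own options and callbacks."""
+    click.echo(f"Reporting {limit} items")
+```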
+ + +## Project context + +When they run, plugins may request information about the current project by creating a session and loading its context: + +```python +from pathlib import Path + +from kedro.framework.session import KedroSession + + +project_path = Path.cwd() +session = KedroSession.create(project_path=project_path) +context = session.load_context() +``` + +## Initialisation + +If the plugin initialisation needs to occur prior to Kedro starting, it can declare the `entry_point` key `kedro.init`. This entry point must refer to a function that currently has no arguments, but for future proofing you should declare it with `**kwargs`. + +## `global` and `project` commands + +Plugins may also add commands to the Kedro CLI, which supports two types of commands: + +* _global_ - available both inside and outside a Kedro project. Global commands use the `entry_point` key `kedro.global_commands`. +* _project_ - available only when a Kedro project is detected in the current directory. Project commands use the `entry_point` key `kedro.project_commands`. + +## Suggested command convention + +We use the following command convention: `kedro <plugin> <command>`, with `kedro <plugin>` acting as a top-level command group. This is our suggested way of structuring your plugin, but it is not necessary for your plugin to work. + +## Hooks + +You can develop hook implementations and have them automatically registered to the project context when the plugin is installed. To enable this for your custom plugin, simply add the following entry in your `setup.py`: + +```python +setup(entry_points={"kedro.hooks": ["plugin_name = plugin_name.plugin:hooks"]}) +``` + +where `plugin.py` is the module where you declare hook implementations: + +```python +import logging + +from kedro.framework.hooks import hook_impl + + +class MyHooks: + @hook_impl + def after_catalog_created(self, catalog): # pylint: disable=unused-argument + logging.info("Reached after_catalog_created hook") + + +hooks = MyHooks() +``` + +```{note} +`hooks` should be an instance of the class defining the Hooks. +``` + +## CLI Hooks + +You can also develop hook implementations to extend Kedro's CLI behaviour in your plugin. To find available CLI hooks, please visit [kedro.framework.cli.hooks](/kedro.framework.cli.hooks). To register CLI hooks developed in your plugin with Kedro, add the following entry in your project's `setup.py`: + +```python +setup(entry_points={"kedro.cli_hooks": ["plugin_name = plugin_name.plugin:cli_hooks"]}) +``` + +where `plugin.py` is the module where you declare hook implementations: + +```python +import logging + +from kedro.framework.cli.hooks import cli_hook_impl + + +class MyCLIHooks: + @cli_hook_impl + def before_command_run(self, project_metadata, command_args): + logging.info( + "Command %s will be run for project %s", command_args, project_metadata + ) + + +cli_hooks = MyCLIHooks() +``` + +## Contributing process + +When you are ready to submit your code: + +1. Create a separate repository using our naming convention for `plugin`s (`kedro-<plugin-name>`) +2. Choose a command approach: `global` and / or `project` commands: + - All `global` commands should be provided as a single `click` group + - All `project` commands should be provided as another `click` group + - The `click` groups are declared through the [entry points mechanism](https://setuptools.pypa.io/en/latest/userguide/entry_point.html) +3.
Include a `README.md` describing your plugin's functionality and all dependencies that should be included +4. Use GitHub tagging to tag your plugin as a `kedro-plugin` so that we can find it + +## Supported Kedro plugins + +- [Kedro-Datasets](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets), a collection of all of Kedro's data connectors. These data +connectors are implementations of the `AbstractDataSet` +- [Kedro-Docker](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker), a tool for packaging and shipping Kedro projects within containers +- [Kedro-Airflow](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow), a tool for converting your Kedro project into an Airflow project +- [Kedro-Viz](https://github.com/kedro-org/kedro-viz), a tool for visualising your Kedro pipelines + + +## Community-developed plugins + +See the full list of plugins using the GitHub tag [kedro-plugin](https://github.com/topics/kedro-plugin). + + +```{note} +Your plugin needs to have an [Apache 2.0 compatible license](https://www.apache.org/legal/resolved.html#category-a) to be considered for this list. +``` + +- [Kedro-Pandas-Profiling](https://github.com/BrickFrog/kedro-pandas-profiling), by [Justin Malloy](https://github.com/BrickFrog), uses [Pandas Profiling](https://github.com/pandas-profiling/pandas-profiling) to profile datasets in the Kedro catalog +- [find-kedro](https://github.com/WaylonWalker/find-kedro), by [Waylon Walker](https://github.com/WaylonWalker), automatically constructs pipelines using `pytest`-style pattern matching +- [kedro-static-viz](https://github.com/WaylonWalker/kedro-static-viz), by [Waylon Walker](https://github.com/WaylonWalker), generates a static [Kedro-Viz](https://github.com/kedro-org/kedro-viz) site (HTML, CSS, JS) +- [steel-toes](https://github.com/WaylonWalker/steel-toes), by [Waylon Walker](https://github.com/WaylonWalker), prevents stepping on toes by automatically branching data paths +- [kedro-wings](https://github.com/tamsanh/kedro-wings), by [Tam-Sanh Nguyen](https://github.com/tamsanh), simplifies and speeds up pipeline creation by auto-generating catalog datasets +- [kedro-great](https://github.com/tamsanh/kedro-great), by [Tam-Sanh Nguyen](https://github.com/tamsanh), integrates Kedro with [Great Expectations](https://greatexpectations.io), enabling catalog-based expectation generation and data validation on pipeline run +- [Kedro-Accelerator](https://github.com/deepyaman/kedro-accelerator), by [Deepyaman Datta](https://github.com/deepyaman), speeds up pipelines by parallelizing I/O in the background +- [kedro-dataframe-dropin](https://github.com/mzjp2/kedro-dataframe-dropin), by [Zain Patel](https://github.com/mzjp2), lets you swap out pandas datasets for modin or RAPIDs equivalents for specialised use to speed up your workflows (e.g on GPUs) +- [kedro-mlflow](https://github.com/Galileo-Galilei/kedro-mlflow), by [Yolan Honoré-Rougé](https://github.com/galileo-galilei) and [Takieddine Kadiri](https://github.com/takikadiri), facilitates [MLflow](https://www.mlflow.org/) integration within a Kedro project. Its main features are modular configuration, automatic parameters tracking, datasets versioning, Kedro pipelines packaging and serving and automatic synchronization between training and inference pipelines for high reproducibility of machine learning experiments and ease of deployment. A tutorial is provided in the [kedro-mlflow-tutorial repo](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial). 
You can find more information in the [kedro-mlflow documentation](https://kedro-mlflow.readthedocs.io/en/stable/). +- [Kedro-Neptune](https://github.com/neptune-ai/kedro-neptune), by [Jakub Czakon](https://github.com/jakubczakon) and [Rafał Jankowski](https://github.com/Raalsky), lets you have all the benefits of a nicely organized Kedro pipeline with Neptune: a powerful user interface built for ML metadata management. It lets you browse and filter pipeline executions, compare nodes and pipelines on metrics and parameters, and visualize pipeline metadata like learning curves, node outputs, and charts. For more information, tutorials and videos, go to the [Kedro-Neptune documentation](https://docs.neptune.ai/integrations-and-supported-tools/automation-pipelines/kedro). +- [kedro-dolt](https://www.dolthub.com/blog/2021-06-16-kedro-dolt-plugin/), by [Max Hoffman](https://github.com/max-hoffman) and [Oscar Batori](https://github.com/oscarbatori), allows you to expand the data versioning abilities of data scientists and engineers +- [kedro-kubeflow](https://github.com/getindata/kedro-kubeflow), by [GetInData](https://github.com/getindata), lets you run and schedule pipelines on Kubernetes clusters using [Kubeflow Pipelines](https://www.kubeflow.org/docs/components/pipelines/overview/) +- [kedro-airflow-k8s](https://github.com/getindata/kedro-airflow-k8s), by [GetInData](https://github.com/getindata), enables running a Kedro pipeline with Airflow on a Kubernetes cluster +- [kedro-vertexai](https://github.com/getindata/kedro-vertexai), by [GetInData](https://github.com/getindata), enables running a Kedro pipeline with Vertex AI Pipelines service +- [kedro-azureml](https://github.com/getindata/kedro-azureml), by [GetInData](https://github.com/getindata), enables running a Kedro pipeline with Azure ML Pipelines service +- [kedro-sagemaker](https://github.com/getindata/kedro-sagemaker), by [GetInData](https://github.com/getindata), enables running a Kedro pipeline with Amazon SageMaker service +- [kedro-partitioned](https://github.com/ProjetaAi/kedro-partitioned), by [Gabriel Daiha Alves](https://github.com/gabrieldaiha) and [Nickolas da Rocha Machado](https://github.com/nickolasrm), extends the functionality on processing partitioned data. +- [kedro-auto-catalog](https://github.com/WaylonWalker/kedro-auto-catalog), by [Waylon Walker](https://github.com/WaylonWalker) A configurable replacement for `kedro catalog create` that allows you to create default dataset types other than MemoryDataset. diff --git a/docs/source/faq/faq.md b/docs/source/faq/faq.md new file mode 100644 index 0000000000..7847e1991a --- /dev/null +++ b/docs/source/faq/faq.md @@ -0,0 +1,48 @@ +# Frequently asked questions + +## Visualisation + +* [Can I annotate a Kedro-Viz visualisation to show different data layers](../visualisation/kedro-viz_visualisation.md#visualise-layers)? + +## Working with Jupyter + +* [How can I convert functions from Jupyter Notebooks into Kedro nodes](../notebooks_and_ipython/kedro_and_notebooks.md#convert-functions-from-jupyter-notebooks-into-kedro-nodes)? + +* [How do I connect a Kedro project kernel to other Jupyter clients like JupyterLab](../notebooks_and_ipython/kedro_and_notebooks.md#ipython-jupyterlab-and-other-jupyter-clients)? + +## Kedro project development + +* [How do I write my own Kedro starter projects](../kedro_project_setup/starters.md#how-to-create-a-kedro-starter)? 
+ +## Configuration + +* [How do I change the setting for a configuration source folder](../configuration/configuration_basics.md#how-to-change-the-setting-for-a-configuration-source-folder)? +* [How do I change the configuration source folder at run time](../configuration/configuration_basics.md#how-to-change-the-configuration-source-folder-at-runtime)? +* [How do I specify parameters at run time](../configuration/parameters.md#how-to-specify-parameters-at-runtime)? +* [How do I read configuration from a compressed file](../configuration/configuration_basics.md#how-to-read-configuration-from-a-compressed-file)? +* [How do I access configuration in code](../configuration/configuration_basics.md#how-to-access-configuration-in-code)? +* [How do I load credentials in code](../configuration/credentials.md#how-to-load-credentials-in-code)? +* [How do I load parameters in code](../configuration/parameters.md#how-to-load-parameters-in-code)? +* [How do I specify additional configuration environments](../configuration/configuration_basics.md#how-to-specify-additional-configuration-environments)? +* [How do I change the default overriding configuration environment](../configuration/configuration_basics.md#how-to-change-the-default-overriding-environment)? +* [How do I use only one configuration environment](../configuration/configuration_basics.md#how-to-use-only-one-configuration-environment)? + +### Advanced topics + +* [How do I change which configuration files are loaded](../configuration/advanced_configuration.md#how-to-change-which-configuration-files-are-loaded)? +* [How do I ensure non default configuration files get loaded](../configuration/advanced_configuration.md#how-to-ensure-non-default-configuration-files-get-loaded)? +* [How do I bypass the configuration loading rules](../configuration/advanced_configuration.md#how-to-bypass-the-configuration-loading-rules)? +* [How do I use Jinja2 syntax in configuration](../configuration/advanced_configuration.md#how-to-use-jinja2-syntax-in-configuration)? +* [How do I do templating with the `OmegaConfigLoader`](../configuration/advanced_configuration.md#how-to-do-templating-with-the-omegaconfigloader)? +* [How do I use custom resolvers in the `OmegaConfigLoader`](../configuration/advanced_configuration.md#how-to-use-custom-resolvers-in-the-omegaconfigloader)? +* [How do I load credentials through environment variables](../configuration/advanced_configuration.md#how-to-load-credentials-through-environment-variables)? + +## Datasets and the Data Catalog + +* [Can I read the same data file using two different dataset implementations](../data/data_catalog.md#transcode-datasets)? + +## Nodes and pipelines + +* [How do I create a modular pipeline](../nodes_and_pipelines/modular_pipelines.md#how-do-i-create-a-modular-pipeline)? + +* [Can I use generator functions in a node](../nodes_and_pipelines/nodes.md#how-to-use-generator-functions-in-a-node)? 
diff --git a/docs/source/get_started/index.md b/docs/source/get_started/index.md new file mode 100644 index 0000000000..59e5ae38e5 --- /dev/null +++ b/docs/source/get_started/index.md @@ -0,0 +1,11 @@ +# First steps + +This section explains the first steps to set up and explore Kedro: + +```{toctree} +:maxdepth: 1 + +install +new_project +kedro_concepts +``` diff --git a/docs/source/get_started/install.md b/docs/source/get_started/install.md new file mode 100644 index 0000000000..0ce17301c5 --- /dev/null +++ b/docs/source/get_started/install.md @@ -0,0 +1,190 @@ +# Set up Kedro + +## Installation prerequisites +* **Python**: Kedro supports macOS, Linux, and Windows and is built for Python 3.7+. You'll select a version of Python when you create a virtual environment for your Kedro project. + +* **Virtual environment**: You should create a new virtual environment for *each* new Kedro project you work on to isolate its Python dependencies from those of other projects. + +* **git**: You must install `git` onto your machine if you do not already have it. Type `git -v` into your terminal window to confirm it is installed; it will return the version of `git` available or an error message. [You can download `git` from the official website](https://git-scm.com/). + +## Create a virtual environment for your Kedro project + +We strongly recommend [installing `conda` as your virtual environment manager](https://docs.conda.io/projects/conda/en/latest/user-guide/install/) if you don't already use it. + +```{tip} +[Read more about virtual environments for Python projects](https://realpython.com/python-virtual-environments-a-primer/) or [watch an explainer video about them](https://youtu.be/YKfAwIItO7M). +``` + +### How to create a new virtual environment using `conda` + +The recommended approach. From your terminal: + +```bash +conda create --name kedro-environment python=3.10 -y +``` + +The example above uses Python 3.10, and creates a virtual environment called `kedro-environment`. You can opt for a different version of Python (any version >= 3.7 and <3.11) for your project, and you can name it anything you choose. + +The `conda` virtual environment is not dependent on your current working directory and can be activated from any directory: + +```bash +conda activate kedro-environment +``` + +To confirm that a valid version of Python is installed in your virtual environment, type the following in your terminal (macOS and Linux): + +```bash +python3 --version +``` + +On Windows: + +```bash +python --version +``` + +To exit `kedro-environment`: + +```bash +conda deactivate +``` + +### How to create a new virtual environment without using `conda` + +Depending on your preferred Python installation, you can create virtual environments to work with Kedro using `venv` or `pipenv` instead of `conda`. + +
    +Click to expand instructions for venv + +If you use Python 3, you should already have the `venv` module installed with the standard library. Create a directory for working with your project and navigate to it. For example: + +```bash +mkdir kedro-environment && cd kedro-environment +``` + +Next, create a new virtual environment in this directory with `venv`: + +```bash +python -m venv .venv +``` + +Activate this virtual environment: + +```bash +source .venv/bin/activate # macOS / Linux +.\.venv\Scripts\activate # Windows +``` + +To exit the environment: + +```bash +deactivate +``` +
    + +
    +Click to expand instructions for pipenv + +Install `pipenv` as follows: + +```bash +pip install pipenv +``` + +Create a directory for working with your project and navigate to it. For example: + +```bash +mkdir kedro-environment && cd kedro-environment +``` + +To start a session with the correct virtual environment activated: + +```bash +pipenv shell +``` + +To exit the shell session: + +```bash +exit +``` + +
+ + +## How to install Kedro using `pip` + +To install Kedro from the Python Package Index (PyPI): + +```bash +pip install kedro +``` + +You can also install Kedro using `conda install -c conda-forge kedro`. + +## How to verify your Kedro installation + +To check that Kedro is installed: + +```bash +kedro info +``` + +You should see an ASCII art graphic and the Kedro version number. For example: + +![](../meta/images/kedro_graphic.png) + +If you do not see the graphic displayed, or have any issues with your installation, check out the [searchable archive of Slack discussions](https://www.linen.dev/s/kedro), or post a new query on the [Slack organisation](https://slack.kedro.org). + + +## How to upgrade Kedro + +The best way to safely upgrade is to check our [release notes](https://github.com/kedro-org/kedro/blob/main/RELEASE.md) for any notable breaking changes. Follow the steps in the migration guide included for that specific release. + +Once Kedro is installed, you can check your version as follows: + +```bash +kedro --version +``` + +To later upgrade Kedro to a different version, simply run: + +```bash +pip install kedro -U +``` + +When migrating an existing project to a newer Kedro version, make sure you also update the `kedro_init_version`: + +* For projects generated with versions of Kedro > 0.17.0, you'll do this in the `pyproject.toml` file from the project root directory. +* If your project was generated with a version of Kedro <0.17.0, you will instead need to update the `ProjectContext`, which is found in `src/<package_name>/run.py`. + +## How to install a development version of Kedro + +This section explains how to try out a development version of Kedro directly from the [Kedro GitHub repository](https://github.com/kedro-org/kedro). + +```{important} +The development version of Kedro is not guaranteed to be bug-free and/or compatible with any of the [stable versions](https://pypi.org/project/kedro/#history). We do not recommend that you use a development version of Kedro in any production systems. Please install and use with caution. +``` + +To try out the latest, unreleased functionality from the `develop` branch of the Kedro GitHub repository, run the following installation command: + +```bash +pip install git+https://github.com/kedro-org/kedro.git@develop +``` + +This will install Kedro from the `develop` branch of the GitHub repository, which is always the most up to date. This command will install Kedro from source, unlike `pip install kedro` which installs Kedro from PyPI. + +If you want to roll back to a stable version of Kedro, execute the following in your environment: + +```bash +pip uninstall kedro -y +pip install kedro +``` + +## Summary + +* Kedro can be used on Windows, macOS or Linux. +* Installation prerequisites include a virtual environment manager like `conda`, Python 3.7+, and `git`. +* You should install Kedro using `pip install kedro`. + +If you encounter any problems as you set up Kedro, ask for help on Kedro's [Slack organisation](https://slack.kedro.org) or review the [searchable archive of Slack discussions](https://www.linen.dev/s/kedro). diff --git a/docs/source/get_started/kedro_concepts.md b/docs/source/get_started/kedro_concepts.md new file mode 100644 index 0000000000..4a6d771da0 --- /dev/null +++ b/docs/source/get_started/kedro_concepts.md @@ -0,0 +1,107 @@ +# Kedro concepts + +This page introduces the most basic elements of Kedro. You can find further information about these and more advanced Kedro concepts in the [Kedro glossary](../resources/glossary.md).
+ +You may prefer to skip to the next section to [create a Kedro project for hands-on Kedro experience](./new_project.md). + +## Summary + +* Kedro nodes are the building blocks of pipelines. A node is a wrapper for a Python function that names the inputs and outputs of that function. +* A pipeline organises the dependencies and execution order of a collection of nodes. +* Kedro has a registry of all data sources the project can use called the Data Catalog. There is inbuilt support for various file types and file systems. +* Kedro projects follow a default template that uses specific folders to store datasets, notebooks, configuration and source code. + + +## Node + +In Kedro, a node is a wrapper for a [pure Python function](../resources/glossary.md#node) that names the inputs and outputs of that function. Nodes are the building block of a pipeline, and the output of one node can be the input of another. + +Here are two simple nodes as an example: + +```python +from kedro.pipeline import node + +# First node +def return_greeting(): + return "Hello" + + +return_greeting_node = node(func=return_greeting, inputs=None, outputs="my_salutation") + +# Second node +def join_statements(greeting): + return f"{greeting} Kedro!" + + +join_statements_node = node( + join_statements, inputs="my_salutation", outputs="my_message" +) +``` + +## Pipeline + +A pipeline organises the dependencies and execution order of a collection of nodes and connects inputs and outputs while keeping your code modular. The pipeline determines the **node execution order** by resolving dependencies and does *not* necessarily run the nodes in the order in which they are passed in. + +Here is a pipeline comprised of the nodes shown above: + +```python +from kedro.pipeline import pipeline + +# Assemble nodes into a pipeline +greeting_pipeline = pipeline([return_greeting_node, join_statements_node]) +``` + +## Data Catalog + +The Kedro Data Catalog is the registry of all data sources that the project can use to manage loading and saving data. It maps the names of node inputs and outputs as keys in a `DataCatalog`, a Kedro class that can be specialised for different types of data storage. + +[Kedro provides different built-in datasets](/kedro_datasets) for numerous file types and file systems, so you don’t have to write the logic for reading/writing data. + +## Kedro project directory structure + +One of the main advantages of working with Kedro projects is that they follow a default template that makes collaboration straightforward. Kedro uses semantic naming to set up a default project with specific folders to store datasets, notebooks, configuration and source code. We advise you to retain the default Kedro project structure to make it easy to share your projects with other Kedro users, although you can adapt the folder structure if you need to. 
+ +The default Kedro project structure is as follows: + +``` +project-dir # Parent directory of the template +├── .gitignore # Hidden file that prevents staging of unnecessary files to `git` +├── conf # Project configuration files +├── data # Local project data (not committed to version control) +├── docs # Project documentation +├── notebooks # Project-related Jupyter notebooks (can be used for experimental code before moving the code to src) +├── pyproject.toml # Identifies the project root and contains configuration information +├── README.md # Project README +├── setup.cfg # Configuration options for `pytest` when doing `kedro test` and for the `isort` utility when doing `kedro lint` +└── src # Project source code +``` + +### `conf` + +The `conf` folder contains two subfolders for storing configuration information: `base` and `local`. + +#### `conf/base` + +Use the `base` subfolder for project-specific settings to share across different installations (for example, with other users). + +The folder contains three files for the example, but you can add others as you require: + +- `catalog.yml` - [Configures the Data Catalog](../data/data_catalog.md#use-the-data-catalog-within-kedro-configuration) with the file paths and load/save configuration needed for different datasets +- `logging.yml` - Uses Python's default [`logging`](https://docs.python.org/3/library/logging.html) library to set up logging +- `parameters.yml` - Allows you to define parameters for machine learning experiments, for example, train/test split and the number of iterations + +#### `conf/local` + +The `local` subfolder is specific to each user and installation and its contents is ignored by `git` (through inclusion in `.gitignore`). + +Use the `local` subfolder for **settings that should not be shared**, such as access credentials, custom editor configuration, personal IDE configuration and other sensitive or personal content. + +By default, Kedro creates one file, `credentials.yml`, in `conf/local`. + +### `data` + +The `data` folder contains multiple subfolders to store project data. We recommend you put raw data into `raw` and move processed data to other subfolders according to the [commonly accepted data engineering convention](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71). + +### `src` + +This subfolder contains the project's source code in one subfolder and another folder that you can use to add unit tests for your project. Projects are preconfigured to run tests using `pytest` when you call `kedro test` from the project's root directory. diff --git a/docs/source/get_started/new_project.md b/docs/source/get_started/new_project.md new file mode 100644 index 0000000000..1048c49e17 --- /dev/null +++ b/docs/source/get_started/new_project.md @@ -0,0 +1,156 @@ +# Create a new Kedro project + +## Summary + +There are a few ways to create a new project once you have [set up Kedro](install.md): + +* You can use `kedro new` to [create a basic Kedro project](#create-a-new-empty-project) containing project directories and basic code, but empty to extend as you need. +* You can use `kedro new` and [pass in a configuration file](#create-a-new-project-from-a-configuration-file) to manually control project details such as the name, folder and package name. +* You can [create a Kedro project populated with template code](#create-a-new-project-containing-example-code) that acts as a starter example. 
This guide illustrates the process with the `pandas-iris` starter, and there is a [range of Kedro starter projects](../kedro_project_setup/starters.md#list-of-official-starters). + + +Once you've created a project: + +* You need to **navigate to its project folder** and **install its dependencies**: `pip install -r src/requirements.txt` +* **To run the project**: `kedro run` +* **To visualise the project**: `kedro viz` + +## Create a new empty project + +The simplest way to create a default Kedro project is to navigate to your preferred directory and type: + +```bash +kedro new +``` + +Enter a name for the project, which can be human-readable and may contain alphanumeric symbols, spaces, underscores and hyphens. It must be at least two characters long. + +It's best to keep the name simple because the choice is set as the value of `project_name` and is also used to generate the folder and package names for the project automatically. + +So, if you enter "Get Started", the folder for the project (`repo_name`) is automatically set to be `get-started`, and the Python package name (`python_package`) for the project is set to be `get_started`. + +| Description | Setting | Example | +| --------------------------------------------------------------- | ---------------- | ------------- | +| A human-readable name for the new project | `project_name` | `Get Started` | +| Local directory to store the project | `repo_name` | `get-started` | +| The Python package name for the project (short, all-lowercase) | `python_package` | `get_started` | + + +The output of `kedro new` is a directory containing all the project files and subdirectories required for a basic Kedro project, ready to extend with the code. + +## Create a new project from a configuration file + +To customise a new project's directory and package name, use a configuration file to specify those values. The configuration file must contain: + +- `output_dir` The path in which to create the project directory +- `project_name` +- `repo_name` +- `python_package` + +The `output_dir` can be customised: for example, `~` for the home directory or `.` for the current working directory. Here is an example `config.yml`, which assumes that a directory named `~/code` already exists: + +```yaml +output_dir: ~/code +project_name: My First Kedro Project +repo_name: testing-kedro +python_package: test_kedro +``` + +To create this new project: + +```bash +kedro new --config=<path>/config.yml +``` + +## Create a new project containing example code + +Use a [Kedro starter](../kedro_project_setup/starters.md) to create a project containing template code, to run as-is or to adapt and extend. + +The following illustrates a project created with example code based on the familiar [Iris dataset](https://www.kaggle.com/uciml/iris). + +The first step is to create the Kedro project using a starter to add the example code and data. + +```bash +kedro new --starter=pandas-iris +``` + +## Run the project + +However you create a Kedro project, once `kedro new` has completed, the next step is to navigate to the project folder (`cd <project_name>`) and install dependencies with `pip` as follows: + +```bash +pip install -r src/requirements.txt +``` + +Now run the project: + +```bash +kedro run +``` + +```{note} +The first time you type a `kedro` command in a new project, you will be asked whether you wish to opt into [usage analytics](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry).
Your decision is recorded in the `.telemetry` file so that subsequent calls to `kedro` in this project do not ask this question again. +``` + +## Visualise a Kedro project + +This section swiftly introduces project visualisation using Kedro-Viz. See the [visualisation documentation](../visualisation/kedro-viz_visualisation) for more detail. + +The Kedro-Viz package needs to be installed into your virtual environment separately as it is not part of the standard Kedro installation: + +```bash +pip install kedro-viz +``` + +To start Kedro-Viz, enter the following in your terminal: + +```bash +kedro viz +``` + +This command automatically opens a browser tab to serve the visualisation at `http://127.0.0.1:4141/`. + +To exit the visualisation, close the browser tab. To regain control of the terminal, enter `^+c` on Mac or `Ctrl+c` on Windows or Linux machines. + +## Where next? +You have completed the section on Kedro project creation for new users. Now choose how to learn more: + +* Understand more about Kedro: The following page explains the [fundamental Kedro concepts](./kedro_concepts.md). + +* Learn hands-on: If you prefer to learn hands-on, move on to the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md). The tutorial illustrates how to set up a working project, add dependencies, create nodes, register pipelines, set up the Data Catalog, add documentation, and package the project. + +* How-to guides: The documentation sections following the tutorial explain [how to visualise a Kedro project](../visualisation/kedro-viz_visualisation.md) and [how to combine Kedro with a Jupyter notebook](../notebooks_and_ipython/kedro_and_notebooks.md). + +If you've worked through the documentation listed and are unsure where to go next, review the [Kedro repositories on GitHub](https://github.com/kedro-org) and [Kedro's Slack channels](https://slack.kedro.org). + + +## More information about the `pandas-iris` example project + +If you used the `pandas-iris` starter to create an example project, the rest of this page gives further information. + +
    +Expand for more details. + +### Background information +The Iris dataset was generated in 1936 by the British statistician and biologist Ronald Fisher. The dataset contains 150 samples, comprising 50 each of 3 different species of Iris plant (*Iris Setosa*, *Iris Versicolour* and *Iris Virginica*). For each sample, the flower measurements are recorded for the sepal length, sepal width, petal length and petal width. + +![](../meta/images/iris_measurements.png) + +A machine learning model can use the Iris dataset to illustrate classification (a method used to determine the type of an object by comparison with similar objects that have previously been categorised). Once trained on known data, the machine learning model can make a predictive classification by comparing a test object to the output of its training data. + +The Kedro starter contains a single [pipeline](../resources/glossary.md#pipeline) comprising three [nodes](../resources/glossary.md#node) responsible for splitting the data into training and testing samples, running a 1-nearest neighbour classifier algorithm to make predictions and accuracy-reporting. + +The nodes are stored in `src/get_started/nodes.py`: + +| Node | Description | +| --------------- | ----------------------------------------------------------------------------------- | +| `split_data` | Splits the example Iris dataset into train and test samples | +| `make_predictions`| Makes class predictions (using 1-nearest neighbour classifier and train-test set) | +| `report_accuracy` | Reports the accuracy of the predictions performed by the previous node. | + +### Iris example: visualisation + +If you [visualise your project with Kedro-Viz](#visualise-a-kedro-project) you should see the following: + +![](../meta/images/pipeline_visualisation.png) +
    diff --git a/docs/source/hooks/common_use_cases.md b/docs/source/hooks/common_use_cases.md new file mode 100644 index 0000000000..085bcf7136 --- /dev/null +++ b/docs/source/hooks/common_use_cases.md @@ -0,0 +1,202 @@ +# Common use cases + +## Use Hooks to extend a node's behaviour + +You can use the [`before_node_run` and `after_node_run` Hooks](/kedro.framework.hooks.specs.NodeSpecs) to add extra behavior before and after a node's execution. Furthermore, you can apply extra behavior to not only an individual node or an entire Kedro pipeline, but also to a _subset_ of nodes, based on their tags or namespaces: for example, suppose we want to add the following extra behavior to a node: + +```python +from kedro.pipeline.node import Node + + +def say_hello(node: Node): + """An extra behaviour for a node to say hello before running.""" + print(f"Hello from {node.name}") +``` + +Then you can either add it to a single node based on the node's name: + +```python +# src//hooks.py + +from kedro.framework.hooks import hook_impl +from kedro.pipeline.node import Node + + +class ProjectHooks: + @hook_impl + def before_node_run(self, node: Node): + # adding extra behaviour to a single node + if node.name == "hello": + say_hello(node) +``` + +Or add it to a group of nodes based on their tags: + + +```python +# src//hooks.py + +from kedro.framework.hooks import hook_impl +from kedro.pipeline.node import Node + + +class ProjectHooks: + @hook_impl + def before_node_run(self, node: Node): + if "hello" in node.tags: + say_hello(node) +``` + +Or add it to all nodes in the entire pipeline: + +```python +# src//hooks.py + +from kedro.framework.hooks import hook_impl +from kedro.pipeline.node import Node + + +class ProjectHooks: + @hook_impl + def before_node_run(self, node: Node): + # adding extra behaviour to all nodes in the pipeline + say_hello(node) +``` + +If your use case takes advantage of a decorator, for example to retry a node's execution using a library such as [tenacity](https://tenacity.readthedocs.io/en/latest/), you can still decorate the node's function directly: + +```python +from tenacity import retry + + +@retry +def my_flaky_node_function(): + ... +``` + +Or applying it in the `before_node_run` Hook as follows: + +```python +# src//hooks.py +from tenacity import retry + +from kedro.framework.hooks import hook_impl +from kedro.pipeline.node import Node + + +class ProjectHooks: + @hook_impl + def before_node_run(self, node: Node): + # adding retrying behaviour to nodes tagged as flaky + if "flaky" in node.tags: + node.func = retry(node.func) +``` +## Use Hooks to customise the dataset load and save methods +We recommend using the `before_dataset_loaded`/`after_dataset_loaded` and `before_dataset_saved`/`after_dataset_saved` Hooks to customise the dataset `load` and `save` methods where appropriate. 
+ +For example, you can add logging about the dataset load runtime as follows: + +```python +import logging +import time +from typing import Any + +from kedro.framework.hooks import hook_impl +from kedro.pipeline.node import Node + + +class LoggingHook: + """A hook that logs how many time it takes to load each dataset.""" + + def __init__(self): + self._timers = {} + + @property + def _logger(self): + return logging.getLogger(__name__) + + @hook_impl + def before_dataset_loaded(self, dataset_name: str, node: Node) -> None: + start = time.time() + self._timers[dataset_name] = start + + @hook_impl + def after_dataset_loaded(self, dataset_name: str, data: Any, node: Node) -> None: + start = self._timers[dataset_name] + end = time.time() + self._logger.info( + "Loading dataset %s before node '%s' takes %0.2f seconds", + dataset_name, + node.name, + end - start, + ) +``` + +## Use Hooks to load external credentials +We recommend using the `after_context_created` Hook to add credentials to the session's config loader instance from any external credentials manager. In this example we show how to load credentials from [Azure KeyVault](https://learn.microsoft.com/en-us/azure/key-vault/general/). + +Here is the example KeyVault instance, note the KeyVault and secret names: + +![](../meta/images/example_azure_keyvault.png) + +These credentials will be used to access these datasets in the data catalog: + +```yaml +weather: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/weather* + file_format: csv + credentials: s3_creds + +cars: + type: pandas.CSVDataSet + filepath: https://your_data_store.blob.core.windows.net/data/01_raw/cars.csv + file_format: csv + credentials: abs_creds +``` + +We can then use the following hook implementation to fetch and inject these credentials: + +```python +# hooks.py + +from kedro.framework.hooks import hook_impl +from azure.keyvault.secrets import SecretClient +from azure.identity import DefaultAzureCredential + + +class AzureSecretsHook: + @hook_impl + def after_context_created(self, context) -> None: + keyVaultName = "keyvault-0542abb" # or os.environ["KEY_VAULT_NAME"] if you would like to provide it through environment variables + KVUri = f"https://{keyVaultName}.vault.azure.net" + + my_credential = DefaultAzureCredential() + client = SecretClient(vault_url=KVUri, credential=my_credential) + + secrets = { + "abs_creds": "azure-blob-store", + "s3_creds": "s3-bucket-creds", + } + azure_creds = { + cred_name: client.get_secret(secret_name).value + for cred_name, secret_name in secrets.items() + } + + context.config_loader["credentials"] = { + **context.config_loader["credentials"], + **azure_creds, + } +``` + +Finally, [register the hook](./introduction.md#registering-your-hook-implementations-with-kedro) in your `settings.py` file: + +```python +from my_project.hooks import AzureSecretsHook + +HOOKS = (AzureSecretsHook(),) +``` + +```{note} +Note: `DefaultAzureCredential()` is Azure's recommended approach to authorise access to data in your storage accounts. For more information, consult the [documentation about how to authenticate to Azure and authorize access to blob data](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python). 
+``` diff --git a/docs/source/hooks/examples.md b/docs/source/hooks/examples.md new file mode 100644 index 0000000000..f556879319 --- /dev/null +++ b/docs/source/hooks/examples.md @@ -0,0 +1,412 @@ +# Hooks examples + +## Add memory consumption tracking + +This example illustrates how to track memory consumption using `memory_profiler`. + +* Install dependencies: + +```console +pip install memory_profiler +``` + +* Implement `before_dataset_loaded` and `after_dataset_loaded` + +```python +# src//hooks.py +import logging + +from kedro.framework.hooks import hook_impl +from memory_profiler import memory_usage + + +def _normalise_mem_usage(mem_usage): + # memory_profiler < 0.56.0 returns list instead of float + return mem_usage[0] if isinstance(mem_usage, (list, tuple)) else mem_usage + + +class MemoryProfilingHooks: + def __init__(self): + self._mem_usage = {} + + @hook_impl + def before_dataset_loaded(self, dataset_name: str) -> None: + before_mem_usage = memory_usage( + -1, + interval=0.1, + max_usage=True, + retval=True, + include_children=True, + ) + before_mem_usage = _normalise_mem_usage(before_mem_usage) + self._mem_usage[dataset_name] = before_mem_usage + + @hook_impl + def after_dataset_loaded(self, dataset_name: str) -> None: + after_mem_usage = memory_usage( + -1, + interval=0.1, + max_usage=True, + retval=True, + include_children=True, + ) + # memory_profiler < 0.56.0 returns list instead of float + after_mem_usage = _normalise_mem_usage(after_mem_usage) + + logging.getLogger(__name__).info( + "Loading %s consumed %2.2fMiB memory", + dataset_name, + after_mem_usage - self._mem_usage[dataset_name], + ) +``` + +* Register Hooks implementation by updating the `HOOKS` variable in `settings.py` as follows: + +```python +HOOKS = (MemoryProfilingHooks(),) +``` + +Then re-run the pipeline: + +```console +$ kedro run +``` + +The output should look similar to the following: + +``` +... +[01/25/23 21:38:23] INFO Loading data from 'example_iris_data' (CSVDataSet)... data_catalog.py:343 + INFO Loading example_iris_data consumed 0.99MiB memory hooks.py:67 + INFO Loading data from 'parameters' (MemoryDataSet)... data_catalog.py:343 + INFO Loading parameters consumed 0.48MiB memory hooks.py:67 + INFO Running node: split: split_data([example_iris_data,parameters]) -> [X_train,X_test,y_train,y_test] node.py:327 + INFO Saving data to 'X_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'X_test' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'y_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'y_test' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 1 out of 3 tasks sequential_runner.py:85 + INFO Loading data from 'X_train' (MemoryDataSet)... data_catalog.py:343 + INFO Loading X_train consumed 0.49MiB memory hooks.py:67 + INFO Loading data from 'X_test' (MemoryDataSet)... +... +``` + +## Add data validation + +This example adds data validation to node inputs and outputs using [Great Expectations](https://docs.greatexpectations.io/en/latest/). 
+ +* Install dependencies: + +```console +pip install great-expectations +``` + +* Implement `before_node_run` and `after_node_run` Hooks to validate inputs and outputs data respectively leveraging `Great Expectations`: + +### V2 API +```python +# src//hooks.py +from typing import Any, Dict + +from kedro.framework.hooks import hook_impl +from kedro.io import DataCatalog + +import great_expectations as ge + + +class DataValidationHooks: + + # Map expectation to dataset + DATASET_EXPECTATION_MAPPING = { + "companies": "raw_companies_dataset_expectation", + "preprocessed_companies": "preprocessed_companies_dataset_expectation", + } + + @hook_impl + def before_node_run( + self, catalog: DataCatalog, inputs: Dict[str, Any], session_id: str + ) -> None: + """Validate inputs data to a node based on using great expectation + if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``. + """ + self._run_validation(catalog, inputs, session_id) + + @hook_impl + def after_node_run( + self, catalog: DataCatalog, outputs: Dict[str, Any], session_id: str + ) -> None: + """Validate outputs data from a node based on using great expectation + if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``. + """ + self._run_validation(catalog, outputs, session_id) + + def _run_validation( + self, catalog: DataCatalog, data: Dict[str, Any], session_id: str + ): + for dataset_name, dataset_value in data.items(): + if dataset_name not in self.DATASET_EXPECTATION_MAPPING: + continue + + dataset = catalog._get_dataset(dataset_name) + dataset_path = str(dataset._filepath) + expectation_suite = self.DATASET_EXPECTATION_MAPPING[dataset_name] + + expectation_context = ge.data_context.DataContext() + batch = expectation_context.get_batch( + {"path": dataset_path, "datasource": "files_datasource"}, + expectation_suite, + ) + expectation_context.run_validation_operator( + "action_list_operator", + assets_to_validate=[batch], + session_id=session_id, + ) +``` + +* Register Hooks implementation, as described in the [hooks documentation](introduction.md#registering-your-hook-implementations-with-kedro) and run Kedro. + +`Great Expectations` example report: + +![](../meta/images/data_validation.png) + +### V3 API +* Create new checkpoint: + +```bash +great_expectations checkpoint new raw_companies_dataset_checkpoint +``` + +* Remove `data_connector_query` from the `batch_request` in the checkpoint config file: + +```python +yaml_config = f""" +name: {my_checkpoint_name} +config_version: 1.0 +class_name: SimpleCheckpoint +run_name_template: "%Y%m%d-%H%M%S-my-run-name-template" +validations: + - batch_request: + datasource_name: {my_datasource_name} + data_connector_name: default_runtime_data_connector_name + data_asset_name: my_runtime_asset_name + data_connector_query: + index: -1 + expectation_suite_name: {my_expectation_suite_name} +""" +``` + +```python +# src//hooks.py +from typing import Any, Dict + +from kedro.framework.hooks import hook_impl +from kedro.io import DataCatalog + +import great_expectations as ge + + +class DataValidationHooks: + + # Map checkpoint to dataset + DATASET_CHECKPOINT_MAPPING = { + "companies": "raw_companies_dataset_checkpoint", + } + + @hook_impl + def before_node_run( + self, catalog: DataCatalog, inputs: Dict[str, Any], session_id: str + ) -> None: + """Validate inputs data to a node based on using great expectation + if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``. 
+ """ + self._run_validation(catalog, inputs, session_id) + + @hook_impl + def after_node_run( + self, catalog: DataCatalog, outputs: Dict[str, Any], session_id: str + ) -> None: + """Validate outputs data from a node based on using great expectation + if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``. + """ + self._run_validation(catalog, outputs, session_id) + + def _run_validation( + self, catalog: DataCatalog, data: Dict[str, Any], session_id: str + ): + for dataset_name, dataset_value in data.items(): + if dataset_name not in self.DATASET_CHECKPOINT_MAPPING: + continue + + data_context = ge.data_context.DataContext() + + data_context.run_checkpoint( + checkpoint_name=self.DATASET_CHECKPOINT_MAPPING[dataset_name], + batch_request={ + "runtime_parameters": { + "batch_data": dataset_value, + }, + "batch_identifiers": { + "runtime_batch_identifier_name": dataset_name + }, + }, + run_name=session_id, + ) +``` + +## Add observability to your pipeline + +This example adds observability to your pipeline using [statsd](https://statsd.readthedocs.io/en/v3.3/configure.html) and makes it possible to visualise dataset size and node execution time using [Grafana](https://grafana.com/). + +* Install dependencies: + +```console +pip install statsd +``` + +* Implement `before_node_run` and `after_node_run` Hooks to collect metrics (DataSet size and node execution time): + +```python +# src//hooks.py +import sys +from typing import Any, Dict + +import statsd +from kedro.framework.hooks import hook_impl +from kedro.pipeline.node import Node + + +class PipelineMonitoringHooks: + def __init__(self): + self._timers = {} + self._client = statsd.StatsClient(prefix="kedro") + + @hook_impl + def before_node_run(self, node: Node) -> None: + node_timer = self._client.timer(node.name) + node_timer.start() + self._timers[node.short_name] = node_timer + + @hook_impl + def after_node_run(self, node: Node, inputs: Dict[str, Any]) -> None: + self._timers[node.short_name].stop() + for dataset_name, dataset_value in inputs.items(): + self._client.gauge(dataset_name + "_size", sys.getsizeof(dataset_value)) + + @hook_impl + def after_pipeline_run(self): + self._client.incr("run") +``` + +* Register Hooks implementation, as described in the [hooks documentation](introduction.md#registering-your-hook-implementations-with-kedro) and run Kedro. + +`Grafana` example page: + +![](../meta/images/pipeline_observability.png) + +## Add metrics tracking to your model + +This examples adds metrics tracking using [MLflow](https://mlflow.org/). + +* Install dependencies: + +```console +pip install mlflow +``` + +* Implement `before_pipeline_run`, `after_pipeline_run` and `after_node_run` Hooks to collect metrics using `MLflow`: + +```python +# src//hooks.py +from typing import Any, Dict + +import mlflow +import mlflow.sklearn +from kedro.framework.hooks import hook_impl +from kedro.pipeline.node import Node + + +class ModelTrackingHooks: + """Namespace for grouping all model-tracking hooks with MLflow together.""" + + @hook_impl + def before_pipeline_run(self, run_params: Dict[str, Any]) -> None: + """Hook implementation to start an MLflow run + with the session_id of the Kedro pipeline run. + """ + mlflow.start_run(run_name=run_params["session_id"]) + mlflow.log_params(run_params) + + @hook_impl + def after_node_run( + self, node: Node, outputs: Dict[str, Any], inputs: Dict[str, Any] + ) -> None: + """Hook implementation to add model tracking after some node runs. 
+ In this example, we will: + * Log the parameters after the data splitting node runs. + * Log the model after the model training node runs. + * Log the model's metrics after the model evaluating node runs. + """ + if node._func_name == "split_data": + mlflow.log_params( + {"split_data_ratio": inputs["params:example_test_data_ratio"]} + ) + + elif node._func_name == "train_model": + model = outputs["example_model"] + mlflow.sklearn.log_model(model, "model") + mlflow.log_params(inputs["parameters"]) + + @hook_impl + def after_pipeline_run(self) -> None: + """Hook implementation to end the MLflow run + after the Kedro pipeline finishes. + """ + mlflow.end_run() +``` + +* Register Hooks implementation, as described in the [hooks documentation](introduction.md#registering-your-hook-implementations-with-kedro) and run Kedro. + +`MLflow` example page: + +![](../meta/images/mlflow.png) + +## Modify node inputs using `before_node_run` hook + +If the `before_node_run` hook is implemented _and_ returns a dictionary, that dictionary is used to update the corresponding node inputs. + +For example, if a pipeline contains a node named `my_node`, which takes 2 inputs: `first_input` and `second_input`, to overwrite the value of `first_input` that is passed to `my_node`, we can implement the following hook: + +```python +# src//hooks.py +from typing import Any, Dict, Optional + +from kedro.framework.hooks import hook_impl +from kedro.pipeline.node import Node +from kedro.io import DataCatalog + + +class NodeInputReplacementHook: + @hook_impl + def before_node_run( + self, node: Node, catalog: DataCatalog + ) -> Optional[Dict[str, Any]]: + """Replace `first_input` for `my_node`""" + if node.name == "my_node": + # return the string filepath to the `first_input` dataset + # instead of the underlying data + dataset_name = "first_input" + filepath = catalog._get_dataset(dataset_name)._filepath + return {"first_input": filepath} # `second_input` is not affected + return None +``` + +Node input overwrites implemented in `before_node_run` affect only a specific node and do not modify the corresponding datasets in the `DataCatalog`. + + +```{note} +In the example above, the `before_node_run` hook implementation must return datasets present in the `inputs` dictionary. If they are not in `inputs`, the node fails with the following error: `Node expected X input(s) , but got the following Y input(s) instead: `. +``` + + +To apply the changes once you have implemented a new hook, you must register it, as described in the [hooks documentation](introduction.md#registering-your-hook-implementations-with-kedro), and then run Kedro. diff --git a/docs/source/hooks/index.md b/docs/source/hooks/index.md new file mode 100644 index 0000000000..71bb1cf695 --- /dev/null +++ b/docs/source/hooks/index.md @@ -0,0 +1,15 @@ +# Hooks + +Hooks are a mechanism to add extra behaviour to Kedro's main execution in an easy and consistent manner. Some examples might include: + +* Adding a log statement after the data catalog is loaded. +* Adding data validation to the inputs before a node runs, and to the outputs after a node has run. This makes it possible to integrate with other tools like [Great-Expectations](https://docs.greatexpectations.io/en/latest/). +* Adding machine learning metrics tracking, e.g. using [MLflow](https://mlflow.org/), throughout a pipeline run. 
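As a minimal sketch of the first bullet point above: a Hook is an ordinary class whose methods are marked with `@hook_impl` and which is registered under the `HOOKS` key in the project's `settings.py`. The class name below is illustrative.

```python
# hooks.py (illustrative sketch)
import logging

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog


class CatalogLoggingHook:
    @hook_impl
    def after_catalog_created(self, catalog: DataCatalog) -> None:
        # log the registered dataset names once the catalog has been created
        logging.getLogger(__name__).info("Data catalog contains: %s", catalog.list())
```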
+ +```{toctree} +:maxdepth: 1 + +introduction +common_use_cases +examples +``` diff --git a/docs/source/hooks/introduction.md b/docs/source/hooks/introduction.md new file mode 100644 index 0000000000..297b8be44f --- /dev/null +++ b/docs/source/hooks/introduction.md @@ -0,0 +1,131 @@ +# Hooks + +## Concepts + +A Hook consists of a Hook specification, and Hook implementation. To add Hooks to your project, you must: + +* Create or modify the file `src//hooks.py` to define a Hook implementation for an existing Kedro-defined Hook specification +* Register your Hook implementation in the [`src//settings.py`](../kedro_project_setup/settings.md) file under the `HOOKS` key + +### Hook specification + +Kedro defines Hook specifications for particular execution points where users can inject additional behaviour. Currently, the following Hook specifications are provided in [kedro.framework.hooks](/kedro.framework.hooks): + +* `after_catalog_created` +* `before_node_run` +* `after_node_run` +* `on_node_error` +* `before_pipeline_run` +* `after_pipeline_run` +* `on_pipeline_error` +* `before_dataset_loaded` +* `after_dataset_loaded` +* `before_dataset_saved` +* `after_dataset_saved` +* `after_context_created` + +The naming convention for non-error Hooks is `__`, in which: + +* `` and `` refers to when the Hook executed, e.g. `before was run` or `after was created`. +* `` refers to the relevant component in the Kedro execution timeline for which this Hook adds extra behaviour, e.g. `catalog`, `node` and `pipeline`. + +The naming convention for error hooks is `on__error`, in which: + +* `` refers to the relevant component in the Kedro execution timeline that throws the error. + +[kedro.framework.hooks](/kedro.framework.hooks) lists the full specifications for which you can inject additional behaviours by providing an implementation. + + +#### CLI hooks + +Lastly, Kedro defines a small set of CLI hooks that inject additional behaviour around execution of a Kedro CLI command: + +* `before_command_run` +* `after_command_run` + +This is what the [`kedro-telemetry` plugin](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry) relies on under the hood in order to be able to collect CLI usage statistics. + +### Hook implementation + +You should provide an implementation for the specification that describes the point at which you want to inject additional behaviour. The Hook implementation should have the same name as the specification. The Hook must provide a concrete implementation with a subset of the corresponding specification's parameters (you do not need to use them all). + +To declare a Hook implementation, use the `@hook_impl` decorator. 
+ +For example, the full signature of the [`after_data_catalog_created`](/kedro.framework.hooks.specs.DataCatalogSpecs) Hook specification is: + +```python +@hook_spec +def after_catalog_created( + self, + catalog: DataCatalog, + conf_catalog: Dict[str, Any], + conf_creds: Dict[str, Any], + save_version: str, + load_versions: Dict[str, str], +) -> None: + pass +``` + +However, if you just want to use this Hook to list the contents of a data catalog after it is created, your Hook implementation can be as simple as: + +```python +# src//hooks.py +import logging + +from kedro.framework.hooks import hook_impl +from kedro.io import DataCatalog + + +class DataCatalogHooks: + @property + def _logger(self): + return logging.getLogger(self.__class__.__name__) + + @hook_impl + def after_catalog_created(self, catalog: DataCatalog) -> None: + self._logger.info(catalog.list()) +``` + +```{note} +The name of a module that contains Hooks implementation is arbitrary and is not restricted to `hooks.py`. +``` + +We recommend that you group related Hook implementations under a namespace, preferably a class, within a `hooks.py` file that you create in your project. + +#### Registering your Hook implementations with Kedro + +Hook implementations should be registered with Kedro using the [`src//settings.py`](../kedro_project_setup/settings.md) file under the `HOOKS` key. + +You can register more than one implementation for the same specification. They will be called in LIFO (last-in, first-out) order. + +The following example sets up a Hook so that the `after_data_catalog_created` implementation is called every time after a data catalog is created. + +```python +# src//settings.py +from .hooks import ProjectHooks, DataCatalogHooks + +HOOKS = (ProjectHooks(), DataCatalogHooks()) +``` + +Kedro also has auto-discovery enabled by default. This means that any installed plugins that declare a Hooks entry-point will be registered. To learn more about how to enable this for your custom plugin, see our [plugin development guide](../extend_kedro/plugins.md#hooks). + +```{note} +Auto-discovered Hooks will run *first*, followed by the ones specified in `settings.py`. +``` + + +#### Disable auto-registered plugins' Hooks + +Auto-registered plugins' Hooks can be disabled via `settings.py` as follows: + +```python +# src//settings.py + +DISABLE_HOOKS_FOR_PLUGINS = ("",) +``` + +where `` is the name of an installed plugin for which the auto-registered Hooks must be disabled. + +## Under the hood + +Under the hood, we use [pytest's pluggy](https://pluggy.readthedocs.io/en/latest/) to implement Kedro's Hook mechanism. We recommend reading their documentation if you have more questions about the underlying implementation. diff --git a/docs/source/index.rst b/docs/source/index.rst index 24dcf487ce..f9c78a2748 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,28 +4,28 @@ contain the root `toctree` directive. -.. image:: https://raw.githubusercontent.com/quantumblacklabs/kedro/develop/static/img/kedro_banner.png +.. image:: https://raw.githubusercontent.com/kedro-org/kedro/main/static/img/kedro_banner.png :alt: Kedro logo :class: kedro-logo Welcome to Kedro's documentation! ============================================= -.. image:: https://img.shields.io/circleci/build/github/quantumblacklabs/kedro/master?label=master - :target: https://circleci.com/gh/quantumblacklabs/kedro/tree/master - :alt: CircleCI - Master Branch +.. 
image:: https://img.shields.io/circleci/build/github/kedro-org/kedro/main?label=main + :target: https://circleci.com/gh/kedro-org/kedro/tree/main + :alt: CircleCI - Main Branch -.. image:: https://img.shields.io/circleci/build/github/quantumblacklabs/kedro/develop?label=develop - :target: https://circleci.com/gh/quantumblacklabs/kedro/tree/develop +.. image:: https://img.shields.io/circleci/build/github/kedro-org/kedro/develop?label=develop + :target: https://circleci.com/gh/kedro-org/kedro/tree/develop :alt: CircleCI - Develop Branch .. image:: https://img.shields.io/badge/license-Apache%202.0-blue.svg - :target: https://opensource.org/licenses/Apache-2.0 + :target: https://opensource.org/license/apache2-0-php/ :alt: License is Apache 2.0 -.. image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue.svg +.. image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue.svg :target: https://pypi.org/project/kedro/ - :alt: Python version 3.7, 3.8, 3.9 + :alt: Python version 3.7, 3.8, 3.9, 3.10 .. image:: https://badge.fury.io/py/kedro.svg :target: https://pypi.org/project/kedro/ @@ -36,155 +36,116 @@ Welcome to Kedro's documentation! :alt: Conda package version .. image:: https://readthedocs.org/projects/kedro/badge/?version=stable - :target: https://kedro.readthedocs.io/ + :target: https://docs.kedro.org/ :alt: Docs build status -.. image:: https://img.shields.io/discord/778216384475693066.svg?color=7289da&label=Kedro%20Discord&logo=discord&style=flat-square - :target: https://discord.gg/akJDeVaxnB - :alt: Discord Server +.. image:: https://img.shields.io/badge/slack-chat-blueviolet.svg?label=Kedro%20Slack&logo=slack + :target: https://slack.kedro.org + :alt: Kedro's Slack organisation + +.. image:: https://img.shields.io/badge/slack-archive-blue.svg?label=Kedro%20Slack%20 + :target: https://www.linen.dev/s/kedro + :alt: Kedro's Slack archive .. image:: https://img.shields.io/badge/code%20style-black-black.svg :target: https://github.com/psf/black :alt: Code style is Black -.. image:: https://zenodo.org/badge/182067506.svg - :target: https://zenodo.org/badge/latestdoi/182067506 - :alt: Citation Reference +.. image:: https://bestpractices.coreinfrastructure.org/projects/6711/badge + :target: https://bestpractices.coreinfrastructure.org/projects/6711 + :alt: OpenSSF Best Practices Badge Program .. toctree:: :maxdepth: 2 - :caption: Introduction + :caption: Learn about Kedro + + introduction/index.md - 01_introduction/01_introduction +.. toctree:: + :maxdepth: 2 + get_started/index.md .. toctree:: :maxdepth: 2 - :caption: Get started + :caption: Tutorial and basic Kedro usage - 02_get_started/01_prerequisites - 02_get_started/02_install - 02_get_started/03_hello_kedro + tutorial/spaceflights_tutorial.md .. toctree:: :maxdepth: 2 - :caption: Make a project - 02_get_started/04_new_project - 02_get_started/05_example_project - 02_get_started/06_starters - 02_get_started/07_standalone_use_of_datacatalog + visualisation/index.md .. toctree:: :maxdepth: 2 - :caption: Tutorial - 03_tutorial/01_spaceflights_tutorial - 03_tutorial/02_tutorial_template - 03_tutorial/03_set_up_data - 03_tutorial/04_create_pipelines - 03_tutorial/05_package_a_project - 03_tutorial/06_visualise_pipeline + experiment_tracking/index.md .. toctree:: :maxdepth: 2 - :caption: Kedro project setup - 04_kedro_project_setup/01_dependencies - 04_kedro_project_setup/02_configuration - 04_kedro_project_setup/03_session + notebooks_and_ipython/index.md + resources/index.md .. 
toctree:: :maxdepth: 2 - :caption: Data Catalog + :caption: Kedro projects - 05_data/01_data_catalog - 05_data/02_kedro_io + kedro_project_setup/index.md .. toctree:: :maxdepth: 2 - :caption: Nodes and pipelines - 06_nodes_and_pipelines/01_nodes - 06_nodes_and_pipelines/02_pipeline_introduction - 06_nodes_and_pipelines/03_modular_pipelines - 06_nodes_and_pipelines/04_run_a_pipeline - 06_nodes_and_pipelines/05_slice_a_pipeline + configuration/index.md .. toctree:: :maxdepth: 2 - :caption: Extend Kedro - 07_extend_kedro/01_common_use_cases - 07_extend_kedro/02_hooks - 07_extend_kedro/03_custom_datasets - 07_extend_kedro/04_plugins - 07_extend_kedro/05_create_kedro_starters - 07_extend_kedro/06_transformers - 07_extend_kedro/07_decorators + data/index.md +.. toctree:: + :maxdepth: 2 + + nodes_and_pipelines/index.md .. toctree:: :maxdepth: 2 - :caption: Logging + :caption: Advanced usage - 08_logging/01_logging + extend_kedro/index.md .. toctree:: :maxdepth: 2 - :caption: Development - 09_development/01_set_up_vscode - 09_development/02_set_up_pycharm - 09_development/03_commands_reference - 09_development/04_debugging + hooks/index.md + .. toctree:: :maxdepth: 2 - :caption: Deployment - 10_deployment/01_deployment_guide - 10_deployment/02_single_machine - 10_deployment/03_distributed - 10_deployment/04_argo - 10_deployment/05_prefect - 10_deployment/06_kubeflow - 10_deployment/07_aws_batch - 10_deployment/08_databricks - 10_deployment/09_aws_sagemaker - 10_deployment/10_aws_step_functions - 10_deployment/11_airflow_astronomer + logging/index.md .. toctree:: :maxdepth: 2 - :caption: Tools integration - 11_tools_integration/01_pyspark - 11_tools_integration/02_ipython + integrations/pyspark_integration.md .. toctree:: :maxdepth: 2 - :caption: FAQs - 12_faq/01_faq - 12_faq/02_architecture_overview - 12_faq/03_kedro_principles + development/index.md .. toctree:: :maxdepth: 2 - :caption: Resources - 13_resources/01_logos - 13_resources/02_glossary + deployment/index.md .. toctree:: :maxdepth: 2 :caption: Contribute to Kedro - 14_contribution/01_contribute_to_kedro - 14_contribution/02_developer_contributor_guidelines - 14_contribution/03_backwards_compatibility - 14_contribution/04_documentation_contributor_guidelines + contribution/index.md API documentation ================= @@ -196,6 +157,7 @@ API documentation :recursive: kedro + kedro_datasets Indices and tables ================== diff --git a/docs/source/11_tools_integration/01_pyspark.md b/docs/source/integrations/pyspark_integration.md similarity index 56% rename from docs/source/11_tools_integration/01_pyspark.md rename to docs/source/integrations/pyspark_integration.md index 7f8a3dcd6f..3afaf084c7 100644 --- a/docs/source/11_tools_integration/01_pyspark.md +++ b/docs/source/integrations/pyspark_integration.md @@ -1,8 +1,4 @@ -# Build a Kedro pipeline with PySpark - -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` +# PySpark integration This page outlines some best practices when building a Kedro pipeline with [`PySpark`](https://spark.apache.org/docs/latest/api/python/index.html). It assumes a basic understanding of both Kedro and `PySpark`. @@ -15,47 +11,36 @@ spark.driver.maxResultSize: 3g spark.scheduler.mode: FAIR ``` -```eval_rst -.. note:: Optimal configuration for Spark depends on the setup of your Spark cluster. +```{note} +Optimal configuration for Spark depends on the setup of your Spark cluster. 
``` -## Initialise a `SparkSession` in custom project context class +## Initialise a `SparkSession` using a hook -Before any `PySpark` operations are performed, you should initialise your [`SparkSession`](https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession) in your custom project context class, which is the entrypoint for your Kedro project. This ensures that a `SparkSession` has been initialised before the Kedro pipeline is run. +Before any `PySpark` operations are performed, you should initialise your [`SparkSession`](https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession) using an `after_context_created` [hook](../hooks/introduction). This ensures that a `SparkSession` has been initialised before the Kedro pipeline is run. -Below is an example implementation to initialise the `SparkSession` in `/src//.py` by reading configuration from the `spark.yml` configuration file created in the previous section: +Below is an example implementation to initialise the `SparkSession` in `src//hooks.py` by reading configuration from the `spark.yml` configuration file created in the previous section: ```python -from typing import Any, Dict, Union -from pathlib import Path - +from kedro.framework.hooks import hook_impl from pyspark import SparkConf from pyspark.sql import SparkSession -from kedro.framework.context import KedroContext - -class CustomContext(KedroContext): - def __init__( - self, - package_name: str, - project_path: Union[Path, str], - env: str = None, - extra_params: Dict[str, Any] = None, - ): - super().__init__(package_name, project_path, env, extra_params) - self.init_spark_session() - - def init_spark_session(self) -> None: - """Initialises a SparkSession using the config defined in project's conf folder.""" +class SparkHooks: + @hook_impl + def after_context_created(self, context) -> None: + """Initialises a SparkSession using the config + defined in project's conf folder. + """ # Load the spark configuration in spark.yaml using the config loader - parameters = self.config_loader.get("spark*", "spark*/**") + parameters = context.config_loader.get("spark*", "spark*/**") spark_conf = SparkConf().setAll(parameters.items()) # Initialise the spark session spark_session_conf = ( - SparkSession.builder.appName(self.package_name) + SparkSession.builder.appName(context.project_path.name) .enableHiveSupport() .config(conf=spark_conf) ) @@ -69,23 +54,24 @@ Call `SparkSession.builder.getOrCreate()` to obtain the `SparkSession` anywhere We don't recommend storing Spark session on the context object, as it cannot be serialised and therefore prevents the context from being initialised for some plugins. -Now, you need to configure Kedro to use `CustomContext`. All you need to do is just set `CONTEXT_CLASS` in `/src//settings.py` as follow: +You will also need to register `SparkHooks` by updating the `HOOKS` variable in `src//settings.py` as follows: ```python -from . import CustomContext +from .hooks import SparkHooks -CONTEXT_CLASS = CustomContext +HOOKS = (SparkHooks(),) ``` ## Use Kedro's built-in Spark datasets to load and save raw data -We recommend using Kedro's built-in Spark datasets to load raw data into Spark's [DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis), as well as to write them back to storage. 
Some of our built-in Spark datasets include: +We recommend using Kedro's built-in Spark datasets to load raw data into Spark's [DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html), as well as to write them back to storage. Some of our built-in Spark datasets include: -* [spark.SparkDataSet](/kedro.extras.datasets.spark.SparkDataSet) -* [spark.SparkJDBCDataSet](/kedro.extras.datasets.spark.SparkJDBCDataSet) -* [spark.SparkHiveDataSet](/kedro.extras.datasets.spark.SparkHiveDataSet) +* [spark.DeltaTableDataSet](/kedro_datasets.spark.DeltaTableDataSet) +* [spark.SparkDataSet](/kedro_datasets.spark.SparkDataSet) +* [spark.SparkJDBCDataSet](/kedro_datasets.spark.SparkJDBCDataSet) +* [spark.SparkHiveDataSet](/kedro_datasets.spark.SparkHiveDataSet) -The example below illustrates how to use `spark.SparkDataSet` to read a CSV file located in S3 into a `DataFrame` in `/conf/base/catalog.yml`: +The example below illustrates how to use `spark.SparkDataSet` to read a CSV file located in S3 into a `DataFrame` in `conf/base/catalog.yml`: ```yaml weather: @@ -105,7 +91,7 @@ Or using the Python API: ```python import pyspark.sql from kedro.io import DataCatalog -from kedro.extras.datasets.spark import SparkDataSet +from kedro_datasets.spark import SparkDataSet spark_ds = SparkDataSet( filepath="s3a://your_bucket/data/01_raw/weather*", @@ -119,6 +105,89 @@ df = catalog.load("weather") assert isinstance(df, pyspark.sql.DataFrame) ``` +## Spark and Delta Lake interaction + +[Delta Lake](https://delta.io/) is an open-source project that enables building a Lakehouse architecture on top of data lakes. It provides ACID transactions and unifies streaming and batch data processing on top of existing data lakes, such as S3, ADLS, GCS, and HDFS. +To setup PySpark with Delta Lake, have a look at [the recommendations in Delta Lake's documentation](https://docs.delta.io/latest/quick-start.html#python). + +We recommend the following workflow, which makes use of the [transcoding feature in Kedro](../data/data_catalog.md): + +* To create a Delta table, use a `SparkDataSet` with `file_format="delta"`. You can also use this type of dataset to read from a Delta table and/or overwrite it. +* To perform [Delta table deletes, updates, and merges](https://docs.delta.io/latest/delta-update.html#language-python), load the data using a `DeltaTableDataSet` and perform the write operations within the node function. + +As a result, we end up with a catalog that looks like this: + +```yaml +temperature: + type: spark.SparkDataSet + filepath: data/01_raw/data.csv + file_format: "csv" + load_args: + header: True + inferSchema: True + save_args: + sep: '|' + header: True + +weather@spark: + type: spark.SparkDataSet + filepath: s3a://my_bucket/03_primary/weather + file_format: "delta" + save_args: + mode: "overwrite" + versionAsOf: 0 + +weather@delta: + type: spark.DeltaTableDataSet + filepath: s3a://my_bucket/03_primary/weather +``` + +The `DeltaTableDataSet` does not support `save()` operation, as the updates happen in place inside the node function, i.e. through `DeltaTable.update()`, `DeltaTable.delete()`, `DeltaTable.merge()`. + + +```{note} +If you have defined an implementation for the Kedro `before_dataset_saved`/`after_dataset_saved` hook, the hook will not be triggered. This is because the save operation happens within the `node` itself, via the DeltaTable API. 
+``` + +```python +pipeline( + [ + node( + func=process_barometer_data, inputs="temperature", outputs="weather@spark" + ), + node( + func=update_meterological_state, + inputs="weather@delta", + outputs="first_operation_complete", + ), + node( + func=estimate_weather_trend, + inputs=["first_operation_complete", "weather@delta"], + outputs="second_operation_complete", + ), + ] +) +``` + +`first_operation_complete` is a `MemoryDataSet` and it signals that any Delta operations which occur "outside" the Kedro DAG are complete. This can be used as input to a downstream node, to preserve the shape of the DAG. Otherwise, if no downstream nodes need to run after this, the node can simply not return anything: + +```python +pipeline( + [ + node(func=..., inputs="temperature", outputs="weather@spark"), + node(func=..., inputs="weather@delta", outputs=None), + ] +) +``` + +The following diagram is the visual representation of the workflow explained above: + +![Spark and Delta Lake workflow](../meta/images/spark_delta_workflow.png) + +```{note} +This pattern of creating "dummy" datasets to preserve the data flow also applies to other "out of DAG" execution operations such as SQL operations within a node. +``` + ## Use `MemoryDataSet` for intermediary `DataFrame` For nodes operating on `DataFrame` that doesn't need to perform Spark actions such as writing the `DataFrame` to storage, we recommend using the default `MemoryDataSet` to hold the `DataFrame`. In other words, there is no need to specify it in the `DataCatalog` or `catalog.yml`. This allows you to take advantage of Spark's optimiser and lazy evaluation. @@ -130,7 +199,7 @@ Sometimes, you might want to use Spark objects that aren't `DataFrame` as inputs ```python from typing import Any, Dict -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node, pipeline from pyspark.ml.classification import RandomForestClassifier from pyspark.sql import DataFrame @@ -147,8 +216,8 @@ def predict(model: RandomForestClassifier, testing_data: DataFrame) -> DataFrame return predictions -def create_pipeline(**kwargs): - return Pipeline( +def create_pipeline(**kwargs) -> Pipeline: + return pipeline( [ node(train_model, inputs=["training_data"], outputs="example_classifier"), node( @@ -178,7 +247,7 @@ Under the hood, every Kedro node that performs a Spark action (e.g. `save`, `col kedro run --runner=ThreadRunner ``` -To further increase the concurrency level, if you are using Spark >= 0.8, you can also give each node a roughly equal share of the Spark cluster by turning on fair sharing and therefore giving them a roughly equal chance of being executed concurrently. By default, they are executed in a FIFO manner, which means if a job takes up too much resources, it could hold up the execution of other jobs. In order to turn on fair sharing, put the following in your `conf/base/spark.yml` file, which was created in the [Initialise a `SparkSession`](#initialise-a-sparksession-in-projectcontext) section: +To further increase the concurrency level, if you are using Spark >= 0.8, you can also give each node a roughly equal share of the Spark cluster by turning on fair sharing and therefore giving them a roughly equal chance of being executed concurrently. By default, they are executed in a FIFO manner, which means if a job takes up too much resources, it could hold up the execution of other jobs. 
In order to turn on fair sharing, put the following in your `conf/base/spark.yml` file, which was created in the [Initialise a `SparkSession`](#initialise-a-sparksession-using-a-hook) section: ```yaml spark.scheduler.mode: FAIR diff --git a/docs/source/introduction/index.md b/docs/source/introduction/index.md new file mode 100644 index 0000000000..dee8eaeee3 --- /dev/null +++ b/docs/source/introduction/index.md @@ -0,0 +1,10 @@ +# Introduction to Kedro + +```{toctree} +:hidden: +introduction +``` + +Kedro is an open-source Python framework to create reproducible, maintainable, and modular data science code. It uses software engineering best practices to help you build production-ready data science pipelines. + +Kedro is hosted by the [LF AI & Data Foundation](https://lfaidata.foundation/), and you can find the [Kedro source code on GitHub](https://github.com/kedro-org/kedro). diff --git a/docs/source/introduction/introduction.md b/docs/source/introduction/introduction.md new file mode 100644 index 0000000000..45fea38bd2 --- /dev/null +++ b/docs/source/introduction/introduction.md @@ -0,0 +1,16 @@ +# Learn how to use Kedro + +In the following chapters, you will learn [how to set up Kedro](../get_started/install.md) and discover the [key Kedro concepts](../get_started/kedro_concepts.md). You can then review the [spaceflights tutorial](../tutorial/tutorial_template.md) to get hands-on experience with a Kedro project. + +For new and intermediate Kedro users, there's a comprehensive section on [visualising Kedro projects using Kedro-Viz](../visualisation/kedro-viz_visualisation.md) and [working with Kedro and Jupyter notebooks](../notebooks_and_ipython/kedro_and_notebooks). + +Use the left-hand table of contents to explore the documentation available for more advanced Kedro usage and deployment. We also recommend the [glossary](../resources/glossary.md) and the [API reference documentation](/kedro). + +## Assumptions + +We have designed the preliminary documentation and the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) for anyone new to Kedro. The more knowledge of Python you have, the easier you will find the learning curve. + +```{note} +There are many excellent online resources for learning Python; you should choose those that reference Python 3, as Kedro is built for Python 3.7+. There are curated lists of online resources, such as the [official Python programming language website](https://www.python.org/) and this list of [free programming books and tutorials](https://github.com/EbookFoundation/free-programming-books/blob/master/books/free-programming-books-langs.md#python). + +``` diff --git a/docs/source/15_api_docs/kedro.config.rst b/docs/source/kedro.config.rst similarity index 91% rename from docs/source/15_api_docs/kedro.config.rst rename to docs/source/kedro.config.rst index cb5719480e..a8105745cd 100644 --- a/docs/source/15_api_docs/kedro.config.rst +++ b/docs/source/kedro.config.rst @@ -13,6 +13,7 @@ kedro.config kedro.config.ConfigLoader kedro.config.TemplatedConfigLoader + kedro.config.OmegaConfigLoader .. 
rubric:: Exceptions diff --git a/docs/source/15_api_docs/kedro.extras.datasets.rst b/docs/source/kedro.extras.datasets.rst similarity index 71% rename from docs/source/15_api_docs/kedro.extras.datasets.rst rename to docs/source/kedro.extras.datasets.rst index 210dc4ecb3..6c5deee72c 100644 --- a/docs/source/15_api_docs/kedro.extras.datasets.rst +++ b/docs/source/kedro.extras.datasets.rst @@ -11,7 +11,6 @@ kedro.extras.datasets :toctree: :template: autosummary/class.rst - kedro.extras.datasets.api.APIDataSet kedro.extras.datasets.biosequence.BioSequenceDataSet kedro.extras.datasets.dask.ParquetDataSet kedro.extras.datasets.email.EmailMessageDataSet @@ -19,22 +18,33 @@ kedro.extras.datasets kedro.extras.datasets.holoviews.HoloviewsWriter kedro.extras.datasets.json.JSONDataSet kedro.extras.datasets.matplotlib.MatplotlibWriter - kedro.extras.datasets.networkx.NetworkXDataSet + kedro.extras.datasets.networkx.GMLDataSet + kedro.extras.datasets.networkx.GraphMLDataSet + kedro.extras.datasets.networkx.JSONDataSet kedro.extras.datasets.pandas.CSVDataSet kedro.extras.datasets.pandas.ExcelDataSet kedro.extras.datasets.pandas.FeatherDataSet + kedro.extras.datasets.pandas.GBQQueryDataSet kedro.extras.datasets.pandas.GBQTableDataSet + kedro.extras.datasets.pandas.GenericDataSet kedro.extras.datasets.pandas.HDFDataSet kedro.extras.datasets.pandas.JSONDataSet kedro.extras.datasets.pandas.ParquetDataSet kedro.extras.datasets.pandas.SQLQueryDataSet kedro.extras.datasets.pandas.SQLTableDataSet + kedro.extras.datasets.pandas.XMLDataSet kedro.extras.datasets.pickle.PickleDataSet kedro.extras.datasets.pillow.ImageDataSet + kedro.extras.datasets.plotly.JSONDataSet kedro.extras.datasets.plotly.PlotlyDataSet + kedro.extras.datasets.redis.PickleDataSet + kedro.extras.datasets.spark.DeltaTableDataSet kedro.extras.datasets.spark.SparkDataSet kedro.extras.datasets.spark.SparkHiveDataSet kedro.extras.datasets.spark.SparkJDBCDataSet + kedro.extras.datasets.svmlight.SVMLightDataSet kedro.extras.datasets.tensorflow.TensorFlowModelDataset kedro.extras.datasets.text.TextDataSet + kedro.extras.datasets.tracking.JSONDataSet + kedro.extras.datasets.tracking.MetricsDataSet kedro.extras.datasets.yaml.YAMLDataSet diff --git a/docs/source/kedro.extras.rst b/docs/source/kedro.extras.rst new file mode 100644 index 0000000000..054606dc3d --- /dev/null +++ b/docs/source/kedro.extras.rst @@ -0,0 +1,13 @@ +kedro.extras +============ + +.. rubric:: Description + +.. automodule:: kedro.extras + +.. rubric:: Modules + +.. 
toctree:: + :hidden: + + kedro.extras.datasets diff --git a/docs/source/15_api_docs/kedro.framework.cli.cli.KedroCLI.rst b/docs/source/kedro.framework.cli.cli.KedroCLI.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.cli.cli.KedroCLI.rst rename to docs/source/kedro.framework.cli.cli.KedroCLI.rst diff --git a/docs/source/15_api_docs/kedro.framework.cli.hooks.manager.CLIHooksManager.rst b/docs/source/kedro.framework.cli.hooks.manager.CLIHooksManager.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.cli.hooks.manager.CLIHooksManager.rst rename to docs/source/kedro.framework.cli.hooks.manager.CLIHooksManager.rst diff --git a/docs/source/kedro.framework.cli.jupyter.JupyterCommandGroup.rst b/docs/source/kedro.framework.cli.jupyter.JupyterCommandGroup.rst new file mode 100644 index 0000000000..3562a7950f --- /dev/null +++ b/docs/source/kedro.framework.cli.jupyter.JupyterCommandGroup.rst @@ -0,0 +1,10 @@ +kedro.framework.cli.jupyter.JupyterCommandGroup +=============================================== + +.. currentmodule:: kedro.framework.cli.jupyter + +.. autoclass:: JupyterCommandGroup + :members: + +.. Removed all methods and properties, +.. see https://github.com/kedro-org/kedro/issues/2453 diff --git a/docs/source/15_api_docs/kedro.framework.cli.utils.CommandCollection.rst b/docs/source/kedro.framework.cli.utils.CommandCollection.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.cli.utils.CommandCollection.rst rename to docs/source/kedro.framework.cli.utils.CommandCollection.rst diff --git a/docs/source/15_api_docs/kedro.framework.cli.utils.rst b/docs/source/kedro.framework.cli.utils.rst similarity index 96% rename from docs/source/15_api_docs/kedro.framework.cli.utils.rst rename to docs/source/kedro.framework.cli.utils.rst index a70cf40fd0..24529f1b0d 100644 --- a/docs/source/15_api_docs/kedro.framework.cli.utils.rst +++ b/docs/source/kedro.framework.cli.utils.rst @@ -17,7 +17,6 @@ kedro.framework.cli.utils find_stylesheets forward_command get_pkg_version - ipython_message python_call split_string diff --git a/docs/source/15_api_docs/kedro.framework.context.rst b/docs/source/kedro.framework.context.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.context.rst rename to docs/source/kedro.framework.context.rst diff --git a/docs/source/kedro.framework.session.shelvestore.ShelveStore.rst b/docs/source/kedro.framework.session.shelvestore.ShelveStore.rst new file mode 100644 index 0000000000..bb1b278487 --- /dev/null +++ b/docs/source/kedro.framework.session.shelvestore.ShelveStore.rst @@ -0,0 +1,6 @@ +kedro.framework.session.shelvestore.ShelveStore +================================================ + +.. currentmodule:: kedro.framework.session.shelvestore + +.. 
autoclass:: ShelveStore diff --git a/docs/source/15_api_docs/kedro.framework.session.store.BaseSessionStore.rst b/docs/source/kedro.framework.session.store.BaseSessionStore.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.session.store.BaseSessionStore.rst rename to docs/source/kedro.framework.session.store.BaseSessionStore.rst diff --git a/docs/source/15_api_docs/kedro.io.rst b/docs/source/kedro.io.rst similarity index 71% rename from docs/source/15_api_docs/kedro.io.rst rename to docs/source/kedro.io.rst index 64a4552bb7..f86bb0558d 100644 --- a/docs/source/15_api_docs/kedro.io.rst +++ b/docs/source/kedro.io.rst @@ -13,14 +13,17 @@ kedro.io kedro.io.AbstractDataSet kedro.io.AbstractVersionedDataSet - kedro.io.AbstractTransformer + kedro.io.CachedDataSet + kedro.io.CachedDataset kedro.io.DataCatalog + kedro.io.IncrementalDataSet + kedro.io.IncrementalDataset kedro.io.LambdaDataSet + kedro.io.LambdaDataset kedro.io.MemoryDataSet + kedro.io.MemoryDataset kedro.io.PartitionedDataSet - kedro.io.IncrementalDataSet - kedro.io.CachedDataSet - kedro.io.DataCatalogWithDefault + kedro.io.PartitionedDataset kedro.io.Version .. rubric:: Exceptions @@ -32,3 +35,6 @@ kedro.io kedro.io.DataSetAlreadyExistsError kedro.io.DataSetError kedro.io.DataSetNotFoundError + kedro.io.DatasetAlreadyExistsError + kedro.io.DatasetError + kedro.io.DatasetNotFoundError diff --git a/docs/source/kedro.logging.RichHandler.rst b/docs/source/kedro.logging.RichHandler.rst new file mode 100644 index 0000000000..b14b9e91aa --- /dev/null +++ b/docs/source/kedro.logging.RichHandler.rst @@ -0,0 +1,6 @@ +kedro.logging.RichHandler +========================= + +.. currentmodule:: kedro.logging + +.. autoclass:: RichHandler diff --git a/docs/source/kedro.logging.rst b/docs/source/kedro.logging.rst new file mode 100644 index 0000000000..9476656b50 --- /dev/null +++ b/docs/source/kedro.logging.rst @@ -0,0 +1,20 @@ +kedro.logging +============= + +.. rubric:: Description + +.. automodule:: kedro.logging + + + + + + + + .. rubric:: Classes + + .. autosummary:: + :toctree: + :template: autosummary/class.rst + + RichHandler diff --git a/docs/source/15_api_docs/kedro.pipeline.rst b/docs/source/kedro.pipeline.rst similarity index 87% rename from docs/source/15_api_docs/kedro.pipeline.rst rename to docs/source/kedro.pipeline.rst index b8c67323b4..c99f493c9e 100644 --- a/docs/source/15_api_docs/kedro.pipeline.rst +++ b/docs/source/kedro.pipeline.rst @@ -25,12 +25,6 @@ kedro.pipeline .. rubric:: Modules -.. autosummary:: - :toctree: - :recursive: - - kedro.pipeline.decorators - .. rubric:: Exceptions .. autosummary:: diff --git a/docs/source/15_api_docs/kedro.runner.rst b/docs/source/kedro.runner.rst similarity index 100% rename from docs/source/15_api_docs/kedro.runner.rst rename to docs/source/kedro.runner.rst diff --git a/docs/source/kedro_datasets.rst b/docs/source/kedro_datasets.rst new file mode 100644 index 0000000000..b3d3ab328b --- /dev/null +++ b/docs/source/kedro_datasets.rst @@ -0,0 +1,56 @@ +kedro_datasets +============== + +.. rubric:: Description + +.. automodule:: kedro_datasets + +.. rubric:: Classes + +.. 
autosummary:: + :toctree: + :template: autosummary/class.rst + + kedro_datasets.api.APIDataSet + kedro_datasets.biosequence.BioSequenceDataSet + kedro_datasets.dask.ParquetDataSet + kedro_datasets.databricks.ManagedTableDataSet + kedro_datasets.email.EmailMessageDataSet + kedro_datasets.geopandas.GeoJSONDataSet + kedro_datasets.holoviews.HoloviewsWriter + kedro_datasets.json.JSONDataSet + kedro_datasets.matplotlib.MatplotlibWriter + kedro_datasets.networkx.GMLDataSet + kedro_datasets.networkx.GraphMLDataSet + kedro_datasets.networkx.JSONDataSet + kedro_datasets.pandas.CSVDataSet + kedro_datasets.pandas.ExcelDataSet + kedro_datasets.pandas.FeatherDataSet + kedro_datasets.pandas.GBQQueryDataSet + kedro_datasets.pandas.GBQTableDataSet + kedro_datasets.pandas.GenericDataSet + kedro_datasets.pandas.HDFDataSet + kedro_datasets.pandas.JSONDataSet + kedro_datasets.pandas.ParquetDataSet + kedro_datasets.pandas.SQLQueryDataSet + kedro_datasets.pandas.SQLTableDataSet + kedro_datasets.pandas.XMLDataSet + kedro_datasets.pickle.PickleDataSet + kedro_datasets.pillow.ImageDataSet + kedro_datasets.plotly.JSONDataSet + kedro_datasets.plotly.PlotlyDataSet + kedro_datasets.polars.CSVDataSet + kedro_datasets.redis.PickleDataSet + kedro_datasets.snowflake.SnowparkTableDataSet + kedro_datasets.spark.DeltaTableDataSet + kedro_datasets.spark.SparkDataSet + kedro_datasets.spark.SparkHiveDataSet + kedro_datasets.spark.SparkJDBCDataSet + kedro_datasets.spark.SparkStreamingDataSet + kedro_datasets.svmlight.SVMLightDataSet + kedro_datasets.tensorflow.TensorFlowModelDataSet + kedro_datasets.text.TextDataSet + kedro_datasets.tracking.JSONDataSet + kedro_datasets.tracking.MetricsDataSet + kedro_datasets.video.VideoDataSet + kedro_datasets.yaml.YAMLDataSet diff --git a/docs/source/kedro_logo.svg b/docs/source/kedro_logo.svg new file mode 100644 index 0000000000..e3c139a523 --- /dev/null +++ b/docs/source/kedro_logo.svg @@ -0,0 +1,3 @@ + + + diff --git a/docs/source/kedro_project_setup/dependencies.md b/docs/source/kedro_project_setup/dependencies.md new file mode 100644 index 0000000000..862ee4e49d --- /dev/null +++ b/docs/source/kedro_project_setup/dependencies.md @@ -0,0 +1,72 @@ +# Dependencies + +Both `pip install kedro` and `conda install -c conda-forge kedro` install the core Kedro module, which includes the CLI tool, project template, pipeline abstraction, framework, and support for configuration. + +When you create a project, you then introduce additional dependencies for the tasks it performs. + +## Project-specific dependencies +You can specify a project's exact dependencies in the `src/requirements.txt` file to make it easier for you and others to run your project in the future, +and to avoid version conflicts downstream. This can be achieved with the help of [`pip-tools`](https://pypi.org/project/pip-tools/). +To install `pip-tools` in your virtual environment, run the following command: +```bash +pip install pip-tools +``` + +To add or remove dependencies to a project, edit the `src/requirements.txt` file, then run the following: + +```bash +pip-compile --output-file=/src/requirements.txt --input-file=/src/requirements.txt +``` + +This will [pip compile](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) the requirements listed in +the `src/requirements.txt` file into a `src/requirements.lock` that specifies a list of pinned project dependencies +(those with a strict version). 
You can also use this command with additional CLI arguments such as `--generate-hashes` +to use `pip`'s Hash Checking Mode or `--upgrade-package` to update specific packages to the latest or specific versions. +[Check out the `pip-tools` documentation](https://pypi.org/project/pip-tools/) for more information. + +```{note} +The `src/requirements.txt` file contains "source" requirements, while `src/requirements.lock` contains the compiled version of those and requires no manual updates. +``` + +To further update the project requirements, modify the `src/requirements.txt` file (not `src/requirements.lock`) and re-run the `pip-compile` command above. + + +## Install project-specific dependencies + +To install the project-specific dependencies, navigate to the root directory of the project and run: + +```bash +pip install -r src/requirements.txt +``` + +## Workflow dependencies + +To install all the dependencies recorded in Kedro's [`setup.py`](https://github.com/kedro-org/kedro/blob/develop/setup.py), run: + +```bash +pip install "kedro[all]" +``` + +### Install dependencies related to the Data Catalog + +The [Data Catalog](../data/data_catalog.md) is your way of interacting with different data types in Kedro. The modular dependencies in this category include `pandas`, `numpy`, `pyspark`, `matplotlib`, `pillow`, `dask`, and more. + +#### Install dependencies at a group-level + +Data types are broken into groups e.g. `pandas`, `spark` and `pickle`. Each group has a collection of data types e.g.`pandas.CSVDataSet`, `pandas.ParquetDataSet` and more. You can install dependencies for an entire group of dependencies as follows: + +```bash +pip install "kedro-datasets[]" +``` + +This installs Kedro and dependencies related to the data type group. An example of this could be a workflow that depends on the data types in `pandas`. Run `pip install "kedro-datasets[pandas]"` to install Kedro and the dependencies for the data types in the [`pandas` group](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets/kedro_datasets/pandas). + +#### Install dependencies at a type-level + +To limit installation to dependencies specific to a data type: + +```bash +pip install "kedro-datasets[.]" +``` + +For example, your workflow might require use of the `pandas.ExcelDataSet`, so to install its dependencies, run `pip install "kedro-datasets[pandas.ExcelDataSet]"`. diff --git a/docs/source/kedro_project_setup/index.md b/docs/source/kedro_project_setup/index.md new file mode 100644 index 0000000000..2b3f882950 --- /dev/null +++ b/docs/source/kedro_project_setup/index.md @@ -0,0 +1,10 @@ +# Kedro project setup + +```{toctree} +:maxdepth: 1 + +starters +dependencies +session +settings +``` diff --git a/docs/source/04_kedro_project_setup/03_session.md b/docs/source/kedro_project_setup/session.md similarity index 65% rename from docs/source/04_kedro_project_setup/03_session.md rename to docs/source/kedro_project_setup/session.md index 699f516d21..5c498dde61 100644 --- a/docs/source/04_kedro_project_setup/03_session.md +++ b/docs/source/kedro_project_setup/session.md @@ -1,10 +1,6 @@ # Lifecycle management with `KedroSession` -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. 
-``` - -### Overview +## Overview A `KedroSession` allows you to: * Manage the lifecycle of a Kedro run @@ -18,33 +14,26 @@ The main methods and properties of `KedroSession` are: - `create()`: Create a new instance of ``KedroSession`` with session data - `load_context()`: Instantiate `KedroContext` object - `close()`: Close the current session — although we recommend that you [use the session object as a context manager](#create-a-session), which will call `close()` automatically, as opposed to calling the method explicitly -- `run()`: Run the pipeline with the arguments provided; see [Running pipelines](../06_nodes_and_pipelines/04_run_a_pipeline) for details +- `run()`: Run the pipeline with the arguments provided; see [Running pipelines](../nodes_and_pipelines/run_a_pipeline) for details -### Create a session +## Create a session The following code creates a `KedroSession` object as a context manager and runs a pipeline inside the context, with session data provided. The session automatically closes after exit: ```python from kedro.framework.session import KedroSession +from kedro.framework.startup import bootstrap_project +from pathlib import Path -with KedroSession.create("") as session: +bootstrap_project(Path.cwd()) +with KedroSession.create() as session: session.run() ``` -You need to tell `KedroSession` the package name of your Kedro project so it can load your settings, nodes and pipelines. Additionally, you can provide the following optional arguments in `KedroSession.create()`: +You can provide the following optional arguments in `KedroSession.create()`: - `project_path`: Path to the project root directory - `save_on_close`: A boolean value to indicate whether or not to save the session to disk when it's closed - `env`: Environment for the `KedroContext` - `extra_params`: Optional dictionary containing extra project parameters for the underlying `KedroContext`; if specified, this will update (and therefore take precedence over) parameters retrieved from the project configuration - -When you want to access to the most recent session object, use the helper function `get_current_session()` as follows: - -```python -from kedro.framework.session import get_current_session - -session = get_current_session() -context = session.load_context() -context.catalog.load("my_data").head() -``` diff --git a/docs/source/kedro_project_setup/settings.md b/docs/source/kedro_project_setup/settings.md new file mode 100644 index 0000000000..965ef07126 --- /dev/null +++ b/docs/source/kedro_project_setup/settings.md @@ -0,0 +1,42 @@ +# Project settings + +## Application settings + +A Kedro project's `settings.py` file contains the application settings for the project, including registration of Hooks and library components. This page explains how settings work, and which settings are available. + +```{note} +Application settings is distinct from [run time configuration](../configuration/configuration_basics.md), which is stored in the `conf` folder and can vary by configuration environment, and [pyproject.toml](#project-metadata) , which provides project metadata and build configuration. +``` + +By default, all code in `settings.py` is commented out. When settings are not supplied, Kedro chooses sensible default values. You only need to edit `settings.py` if you wish to change to values other than the defaults. 
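As an illustrative sketch, uncommenting and setting a couple of these values might look like the following; `my_project.hooks.ProjectHooks` is an assumption standing in for your own package and Hook class, and the table below lists the settings you can override:

```python
# settings.py: only the names you define here override Kedro's defaults.
from my_project.hooks import ProjectHooks  # illustrative: your own package and Hook class

# Register project Hooks so they are injected into the run timeline.
HOOKS = (ProjectHooks(),)

# Keep configuration in the default "conf" directory (shown here for completeness).
CONF_SOURCE = "conf"
```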
+ +| Setting | Default value | Use | +| --------------------------- | ------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | +| `HOOKS` | `tuple()` | Inject additional behaviour into the execution timeline with [project Hooks](../hooks/introduction.md). | +| `DISABLE_HOOKS_FOR_PLUGINS` | `tuple()` | Disable [auto-registration of Hooks from plugins](../hooks/introduction.md#disable-auto-registered-plugins-hooks). | +| `SESSION_STORE_CLASS` | `kedro.framework.session.session.BaseSessionStore`| Customise how [session data](session.md) is stored. | +| `SESSION_STORE_ARGS` | `dict()` | Keyword arguments for the `SESSION_STORE_CLASS` constructor. | +| `CONTEXT_CLASS` | `kedro.framework.context.KedroContext` | Customise how Kedro library components are managed. | +| `CONF_SOURCE` | `"conf"` | Directory that holds [configuration](../configuration/configuration_basics.md). | +| `CONFIG_LOADER_CLASS` | `kedro.config.ConfigLoader` | Customise how project configuration is handled. | +| `CONFIG_LOADER_ARGS` | `dict()` | Keyword arguments for the `CONFIG_LOADER_CLASS` constructor. | +| `DATA_CATALOG_CLASS` | `kedro.io.DataCatalog` | Customise how the [Data Catalog](../data/data_catalog.md) is handled. | + +## Project metadata +The `pyproject.toml` file is the standard way to store build metadata and tool settings for Python projects. +Every Kedro project comes with a default pre-populated `pyproject.toml` file in your project root directory with the following keys specified under the `[tool.kedro]` section: + +```toml +[tool.kedro] +package_name = package_name +project_name = project_name +kedro_init_version = kedro_version +``` + +The `package_name` should be a [valid Python package name](https://peps.python.org/pep-0423/) and the `project_name` should be a human-readable name. They are both mandatory keys for your project. +`kedro_init_version` specifies the version of Kedro the project was created with. When you upgrade to a newer Kedro version, +this value should also be updated. + +You can also use `pyproject.toml` to specify settings for functionalities such as [micro-packaging](../nodes_and_pipelines/micro_packaging.md). +You can also store the settings for the other tools you've used in your project, such as [`pytest` for automated testing](../development/automated_testing.md). +Consult the respective documentation for the tools you have used to check how you can configure the settings with the `pyproject.toml` file for your project. diff --git a/docs/source/kedro_project_setup/starters.md b/docs/source/kedro_project_setup/starters.md new file mode 100644 index 0000000000..4ad60cb0c2 --- /dev/null +++ b/docs/source/kedro_project_setup/starters.md @@ -0,0 +1,173 @@ +# Kedro starters + +A Kedro starter contains code in the form of a [Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/) template for a Kedro project. Metaphorically, a starter is similar to using a pre-defined layout when creating a presentation or document. 
+ +Kedro starters provide pre-defined example code and configuration that can be reused, for example: + +* As template code for a typical Kedro project +* To add a `docker-compose` setup to launch Kedro next to a monitoring stack +* To add deployment scripts and CI/CD setup for your targeted infrastructure + +You can create your own starters for reuse within a project or team, as described in the documentation about [how to create a Kedro starter](../kedro_project_setup/starters.md#how-to-create-a-kedro-starter). + +## How to use Kedro starters + +To create a Kedro project using a starter, apply the `--starter` flag to `kedro new`: + +```bash +kedro new --starter= +``` + +```{note} +`path-to-starter` could be a local directory or a VCS repository, as long as [Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/usage.html) supports it. +``` + +To create a project using the `PySpark` starter: + +```bash +kedro new --starter=pyspark +``` + +## Starter aliases + +We provide aliases for common starters maintained by the Kedro team so that users don't have to specify the full path. For example, to use the `PySpark` starter to create a project: + +```bash +kedro new --starter=pyspark +``` + +To list all the aliases we support: + +```bash +kedro starter list +``` + +## List of official starters + +The Kedro team maintains the following starters for a range of Kedro projects: + +* [`astro-airflow-iris`](https://github.com/kedro-org/kedro-starters/tree/main/astro-airflow-iris): The [Kedro Iris dataset example project](../get_started/new_project.md) with a minimal setup for deploying the pipeline on Airflow with [Astronomer](https://www.astronomer.io/). +* [`standalone-datacatalog`](https://github.com/kedro-org/kedro-starters/tree/main/standalone-datacatalog): A minimum setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../data/data_catalog.md), which is a core component of Kedro. This starter is of use in the exploratory phase of a project. For more information, read the guide to [standalone use of the `DataCatalog`](../notebooks_and_ipython/kedro_and_notebooks.md). This starter was formerly known as `mini-kedro`. +* [`pandas-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris): The [Kedro Iris dataset example project](../get_started/new_project.md) +* [`pyspark-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark-iris): An alternative Kedro Iris dataset example, using [PySpark](../integrations/pyspark_integration.md) +* [`pyspark`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark): The configuration and initialisation code for a [Kedro pipeline using PySpark](../integrations/pyspark_integration.md) +* [`spaceflights`](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): The [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) example code + +## Starter versioning + +By default, Kedro will use the latest version available in the repository, but if you want to use a specific version of a starter, you can pass a `--checkout` argument to the command: + +```bash +kedro new --starter=pyspark --checkout=0.1.0 +``` + +The `--checkout` value points to a branch, tag or commit in the starter repository. + +Under the hood, the value will be passed to the [`--checkout` flag in Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/usage.html#works-directly-with-git-and-hg-mercurial-repos-too). 
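For example, the same flag combination works for a starter hosted in a Git repository; the URL below is a placeholder for your own starter repository:

```bash
# Create a project from a custom starter in a remote Git repository,
# pinned to that repository's 0.1.0 tag
kedro new --starter=https://github.com/your-org/your-kedro-starter.git --checkout=0.1.0
```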
+ + +## Use a starter with a configuration file + +By default, when you create a new project using a starter, `kedro new` asks you to enter the `project_name`, which it uses to set the `repo_name` and `python_package` name. This is the same behavior as when you [create a new empty project](../get_started/new_project.md#create-a-new-empty-project) + +However, Kedro also allows you to [specify a configuration file](../get_started/new_project.md#create-a-new-project-from-a-configuration-file) when you create a project using a Kedro starter. Use the `--config` flag alongside the starter: + +```bash +kedro new --config=my_kedro_pyspark_project.yml --starter=pyspark +``` + +This option is useful when the starter requires more configuration than the default mode requires. + +## How to create a Kedro starter + +Kedro starters are used to create projects that contain code to run as-is, or to adapt and extend. A good example is the Iris dataset example of basic Kedro project layout, configuration and initialisation code. A team may find it useful to build Kedro starters to create reusable projects that bootstrap a common base and can be extended. + +A Kedro starter is a [Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/) template that contains the boilerplate code for a Kedro project. + +To create a Kedro starter, you need a base project to convert to a `cookiecutter` template, which forms the boilerplate for all projects that use the Kedro starter. + +Install `cookiecutter` as follows: + +```bash +pip install cookiecutter +``` + +You then need to decide which are: + +* the common, boilerplate parts of the project +* the configurable elements, which need to be replaced by `cookiecutter` strings + +### Configuration variables + +By default, when you create a new project using a Kedro starter, `kedro new` launches in interactive mode. The user is then prompted for the variables that have been set in `prompts.yml`. + +The most basic and empty starter triggered by `kedro new` is set up with the following variable: + +* `project_name` - A human readable name for the new project + +Kedro will then automatically generate the following two variables from the entered `project_name`: + +* `repo_name` - A name for the directory that holds the project repository +* `python_package` - A Python package name for the project package (see [Python package naming conventions](https://www.python.org/dev/peps/pep-0008/#package-and-module-names)) + +See the configuration for this basic configuration in [the default starter setup](https://github.com/kedro-org/kedro/blob/main/kedro/templates/project/prompts.yml). + +As the creator of the Kedro starter you can customise the prompts triggered by `kedro new` by adding your own prompts in `prompts.yml`. This is an example of a custom prompt: + +```yaml +custom_prompt: + title: "Prompt title" + text: | + Prompt description that explains to the user what + information they should provide. +``` + +At the very least, the prompt `title` must be defined for the prompt to be valid. After Kedro gets the user's input for each prompt, we pass the value to [`cookiecutter`](https://cookiecutter.readthedocs.io/en/1.7.2/), so every key in your `prompts.yml` must have a corresponding key in [`cookiecutter.json`](https://cookiecutter.readthedocs.io/en/1.7.2/tutorial1.html#cookiecutter-json). + +If the input to the prompts needs to be **validated**, for example to make sure it only has alphanumeric characters, you can add regex validation rules via the `regex_validator` key. 
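Building on the `custom_prompt` example above, a sketch that restricts the user's input to alphanumeric characters might look like this; the regular expression is only an illustration:

```yaml
custom_prompt:
  title: "Prompt title"
  text: |
    Prompt description that explains to the user what
    information they should provide.
  regex_validator: "^[A-Za-z0-9]+$"
```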
For more complex validation, have a look at [cookiecutter pre/post-generate hooks](https://cookiecutter.readthedocs.io/en/1.7.2/advanced/hooks.html#using-pre-post-generate-hooks-0-7-0). + +If you want `cookiecutter` to provide sensible **defaults** in case a user doesn't provide any input, you can add those to `cookiecutter.json`. See [the default starter `cookiecutter.json`](https://github.com/kedro-org/kedro/blob/main/kedro/templates/project/cookiecutter.json) as an example. + +### Example Kedro starter + +To review an example Kedro starter, check out the [`pandas-iris` starter on GitHub](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris). + +When you create an Iris dataset example project by calling `kedro new`, you supply configuration variables as the documentation in [Create a new project](../get_started/new_project.md) describes. When you go through the interactive flow you must supply the `project_name` variable, which is then used to generate the `repo_name` and `python_package` variables. If you use a configuration file, you must supply all three variables in the file. You can see how these variables are used by inspecting the template: + +**project_name** + +The human-readable `project_name` variable is used in the [README.md](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris/README.md) for the new project. + +**repo_name** + +The project structure contains a folder labelled [`{{ cookiecutter.repo_name }}`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris/%7B%7B%20cookiecutter.repo_name%20%7D%7D), which forms the top-level folder to contain the Iris dataset example when it is created. The folder storing the example project is represented by `cookiecutter.repo_name`, which is a customisable variable, as you would expect. + +**python_package** + +Within the parent folder, inside the `src` subfolder, is another configurable variable [{{ cookiecutter.python_package }}](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris/%7B%7B%20cookiecutter.repo_name%20%7D%7D/src/%7B%7B%20cookiecutter.python_package%20%7D%7D) which contains the source code for the example pipelines. The variable is also used within [`__main__.py`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris/%7B%7B%20cookiecutter.repo_name%20%7D%7D/src/%7B%7B%20cookiecutter.python_package%20%7D%7D/__main__.py). + +Here is the layout of the project as a Cookiecutter template: + +``` +{{ cookiecutter.repo_name }} # Parent directory of the template +├── conf # Project configuration files +├── data # Local project data (not committed to version control) +├── docs # Project documentation +├── notebooks # Project related Jupyter notebooks (can be used for experimental code before moving the code to src) +├── README.md # Project README +├── setup.cfg # Configuration options for tools e.g. `pytest` or `black` +└── src # Project source code + └── {{ cookiecutter.python_package }} + ├── __init__.py + ├── pipelines + ├── pipeline_registry.py + ├── __main__.py + └── settings.py + ├── requirements.txt + ├── setup.py + └── tests +``` + +```{note} +You can [add an alias by creating a plugin using `kedro.starters` entry point](../extend_kedro/plugins.md#extend-starter-aliases), which allows you to run `kedro new --starter=your_starters` and makes your starter show up on `kedro starter list`. 
+``` diff --git a/docs/source/logging/index.md b/docs/source/logging/index.md new file mode 100644 index 0000000000..23fb20d26d --- /dev/null +++ b/docs/source/logging/index.md @@ -0,0 +1,185 @@ +# Logging + + +Kedro uses [Python's `logging` library](https://docs.python.org/3/library/logging.html). Configuration is provided as a dictionary according to the [Python logging configuration schema](https://docs.python.org/3/library/logging.config.html#logging-config-dictschema) in Kedro's default logging configuration, as described below. + +By default, Python only shows logging messages at level `WARNING` and above. Kedro's logging configuration specifies that `INFO` level messages from Kedro should also be emitted. This makes it easier to track the progress of your pipeline when you perform a `kedro run`. + +## Default logging configuration +Kedro's [default logging configuration](https://github.com/kedro-org/kedro/blob/main/kedro/framework/project/default_logging.yml) defines a handler called `rich` that uses the [Rich logging handler](https://rich.readthedocs.io) to format messages. We also use the Rich traceback handler to render exceptions. + +## How to perform logging in your Kedro project +To add logging to your own code (e.g. in a node): + +```python +import logging + +logger = logging.getLogger(__name__) +logger.warning("Issue warning") +logger.info("Send information") +logger.debug("Useful information for debugging") +``` + +You can use Rich's [console markup](https://rich.readthedocs.io/en/stable/markup.html) in your logging calls: + +```python +logger.error("[bold red blink]Important error message![/]", extra={"markup": True}) +``` + +## How to customise Kedro logging + +To customise logging in your Kedro project, you need to specify the path to a project-specific logging configuration file. Change the environment variable `KEDRO_LOGGING_CONFIG` to override the default logging configuration. Point the variable instead to your project-specific configuration, which we recommend you store inside the project's `conf` folder, and name `logging.yml`. + +For example, you can set `KEDRO_LOGGING_CONFIG` by typing the following into your terminal: + +```bash +export KEDRO_LOGGING_CONFIG=<project_root>/conf/logging.yml +``` + +After setting the environment variable, any subsequent Kedro commands use the logging configuration file at the specified path. + +```{note} +If the `KEDRO_LOGGING_CONFIG` environment variable is not set, Kedro will use the [default logging configuration](https://github.com/kedro-org/kedro/blob/main/kedro/framework/project/default_logging.yml). +``` + +### How to show DEBUG level messages +To see `DEBUG` level messages, change the level of logging in your project-specific logging configuration file (`logging.yml`). We provide a `logging.yml` template: + +
    +Click to expand the logging.yml template + + +```yaml +version: 1 + +disable_existing_loggers: False + +formatters: + simple: + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +handlers: + console: + class: logging.StreamHandler + level: INFO + formatter: simple + stream: ext://sys.stdout + + info_file_handler: + class: logging.handlers.RotatingFileHandler + level: INFO + formatter: simple + filename: info.log + maxBytes: 10485760 # 10MB + backupCount: 20 + encoding: utf8 + delay: True + + rich: + class: kedro.logging.RichHandler + rich_tracebacks: True + # Advance options for customisation. + # See https://docs.kedro.org/en/stable/logging/logging.html#project-side-logging-configuration + # tracebacks_show_locals: False + +loggers: + kedro: + level: INFO + + your_python_package: + level: INFO + +root: + handlers: [rich] +``` + +
    + +You need to change the line: +```diff +loggers: + kedro: + level: INFO + + your_python_package: +- level: INFO ++ level: DEBUG +``` + +```{note} +The name of a logger corresponds to a key in the `loggers` section of the logging configuration file (e.g. `kedro`). See [Python's logging documentation](https://docs.python.org/3/library/logging.html#logger-objects) for more information. +``` + +By changing the level value to `DEBUG` for the desired logger (e.g. ``), you will start seeing `DEBUG` level messages in the log output. + +## Advanced logging + +In addition to the `rich` handler defined in Kedro's framework, we provide two additional handlers in the template. + +* `console`: show logs on standard output (typically your terminal screen) without any rich formatting +* `info_file_handler`: write logs of level `INFO` and above to `info.log` + +The following section illustrates some common examples of how to change your project's logging configuration. + +## How to customise the `rich` handler + +Kedro's `kedro.logging.RichHandler` is a subclass of [`rich.logging.RichHandler`](https://rich.readthedocs.io/en/stable/reference/logging.html#rich.logging.RichHandler) and supports the same set of arguments. By default, `rich_tracebacks` is set to `True` to use `rich` to render exceptions. However, you can disable it by setting `rich_tracebacks: False`. + +```{note} +If you want to disable `rich`'s tracebacks, you must set `KEDRO_LOGGING_CONFIG` to point to your local config i.e. `conf/logging.yml`. +``` + +When `rich_tracebacks` is set to `True`, the configuration is propagated to [`rich.traceback.install`](https://rich.readthedocs.io/en/stable/reference/traceback.html#rich.traceback.install). If an argument is compatible with `rich.traceback.install`, it will be passed to the traceback's settings. + +For instance, you can enable the display of local variables inside `logging.yml` to aid with debugging. + +```diff + rich: + class: kedro.logging.RichHandler + rich_tracebacks: True ++ tracebacks_show_locals: True +``` + +A comprehensive list of available options can be found in the [RichHandler documentation](https://rich.readthedocs.io/en/stable/reference/logging.html#rich.logging.RichHandler). + +## How to enable file-based logging + +File-based logging in Python projects aids troubleshooting and debugging. It offers better visibility into application's behaviour and it's easy to search. However, it does not work well with read-only systems such as [Databricks Repos](https://docs.databricks.com/repos/index.html). + +To enable file-based logging, add `info_file_handler` in your `root` logger as follows in your `conf/logging.yml` as follows: + +```diff + root: +- handlers: [rich] ++ handlers: [rich, info_file_handler] +``` + +By default it only tracks `INFO` level messages, but it can be configured to capture any level of logs. + +## How to use plain console logging + +To use plain rather than rich logging, swap the `rich` handler for the `console` one as follows: + +```diff + root: +- handlers: [rich] ++ handlers: [console] +``` + +## How to enable rich logging in a dumb terminal + +Rich [detects whether your terminal is capable](https://rich.readthedocs.io/en/stable/console.html#terminal-detection) of displaying richly formatted messages. If your terminal is "dumb" then formatting is automatically stripped out so that the logs are just plain text. This is likely to happen if you perform `kedro run` on CI (e.g. GitHub Actions or CircleCI). 
+ +If you find that the default wrapping of the log messages is too narrow but do not wish to switch to using the `console` logger on CI then the simplest way to control the log message wrapping is through altering the `COLUMNS` and `LINES` environment variables. For example: + +```bash +export COLUMNS=120 LINES=25 +``` + +```{note} +You must provide a value for both `COLUMNS` and `LINES` even if you only wish to change the width of the log message. Rich's default values for these variables are `COLUMNS=80` and `LINE=25`. +``` + +## How to enable rich logging in Jupyter + +Rich also formats the logs in JupyterLab and Jupyter Notebook. The size of the output console does not adapt to your window but can be controlled through the `JUPYTER_COLUMNS` and `JUPYTER_LINES` environment variables. The default values (115 and 100 respectively) should be suitable for most users, but if you require a different output console size then you should alter the values of `JUPYTER_COLUMNS` and `JUPYTER_LINES`. diff --git a/docs/source/meta/images/KedroArchitecture.drawio b/docs/source/meta/images/KedroArchitecture.drawio index f546a57896..0d32eeeebd 100644 --- a/docs/source/meta/images/KedroArchitecture.drawio +++ b/docs/source/meta/images/KedroArchitecture.drawio @@ -1 +1 @@ -7V1Zc9s4Ev41rt19kIr38egjM8nEkzjrTTl5clEkJCEmCRUJ2VJ+/QC8CUA0LfFSNs5DRBAkpa+7P3Q3GuCFeh3s/oyczfpv5AH/QpG83YV6c6Eoii0b5D/ask9bZFmV0pZVBL2srWy4hz9B1ph320IPxLWOGCEfw0290UVhCFxca3OiCL3Uuy2RX3/qxlkBruHedXy+9QF6eJ22WrpUtr8HcLXOnyxL2ZnAyTtnDfHa8dBLpUl9d6FeRwjh9FOwuwY+RS/HJb3ujwNniy8WgRC3uSB+MDYftlsX4PhG/vIHfrI+/5xp6V2eHX+b/eALxfDJ/a4W5MOKfvgIvAjRJ0XoB0U4O08eVHTJfiDe56hFaBt6gD5YIqdf1hCD+43j0rMvRFFI2xoHPjmSycdnEGFIEL/04SokbRjRDkvo+9fIR1FyR9XTgeVppD3GEXoClTOWslANo/gWVUAyjOgDwK7SlAH0J0ABwNGedMkVNpPVvq6EL6XklVx/1xWpG7nQnUzbVsWdS4GQD5lMxPJZP/58Dj4+f/sWvH8w/tq49vP3TzOFk08cufSWJyHuOfE66Zsf3DkYgyhMWhTpsFCc7MgHSyp331kA/8pxn1bJ43OhhCgkz71C5PkQUyh1qRvxaKx4ePnItkA+qtWBfIT2I5vtDWgZOQF4QdHTSCYEZGJEpsiEbMNUnX5MyDB0TkaaLpCRrPcmI05Ed3ADfEiUVJFuwDPw0QZEnAAIX2/ox23gX7qYQlXAfEv1/g7FEENE4V4gjFFwUA4VQaEtpg++Lkarjixjpth13FWBbYip63TUravPd9rtl/U7A351d0HwfQb3M1XlEAUeGVqzQxThNVqh0PHfla1XdaUv+9wiCmSC4A+A8T7zE5wtRnV8wQ7ib/TyuZ4dfa+cudlld04O9vlBSH5v5SJ6+L16rrwsOcqvq7HoQRnGaBu5oEE9syEYO9EK4IZ+SgYoRbFRJSLgOxg+1x2Y7uXLuw7RNoyFQk/spS6ofCRxCV4gEphOAD0v1QkQw5/OIrkfhX2DYIiTX6NfXeg3QuwbNZKzqsJ7zJ5Sc9CE1ibNJVkzawaXjdStwc9ufkd/TXmXmT3P75tfhJbLmCgGK6/iWx1PjDonQYLD8mTvogM6M/UatppgnFcEZKbpfQ0h+qhkVqGyktjEZNYRKdktSUkbi5PEYjI4jV6BEEQOBhMhpkbtOp2YpHnOQzmfyKfRUu7UqfW72kNRlM0J9KB7HRNlJRKLR/Kul8ul4roi79ozFobekXetG3U3z+K9a6Gbp/XnXUuciDZOSGhoBiPIW93QI4khm/P6YFJkoKqjSW6CNcyKKzsHzTgb57ij8aTI3b3q5dpjjShin9EcU1DyRfuBv4xiajFMGdL0G8W0FrA6LQHnruVvAXcmYGtaAubTP9t4Ku5go0p2EqcaUj7WnOgCkntZFuNbkjBYHcoTVHg5XiMikx3mRPmKV+HEmzT3toQ7ahidZD3lul9mSrxjZgr8MrO39NuoEetbeK0rflLbehjypPhJ5WNWNwLTiVgbtasTipKtPJd4IkWJGGowelI5Md6DOKbzAxOgJ70eAg3JTo1efgWtPIz34SJyskf0HLFbLhBH7AtL17qas2TmwyxBxC6eD1P7Al/lVZWMpEu4KlIrUZ43uUWOJ5gaG1+DbYUP4gfVYMH0RzmrOD5cMpMnGh8vfpj775aMFJNQruJnTwYtvqzgA7nR5ymAVRRTTAYsi+ez2w+nDSA94GTqgryjcBarN6D4Ufc9Qk/jJ2lZqAx7bKg0fozMHRQScYKQOnYn4tbOR9HpP5GPYiR/9AoSBVfa079+ZhsMwUSscLpB7S2u1cbNnB+Tr5NHyddpbetKus/XJZdeRpGzr3TIYs2DwZuuso4yU876Sn/TlBjlSr9Bp1Gexnt9/yZjzX9I052/XcFTOaEDmzXkuqdsGQNyqdhk7d8m285k9bYm2/kcynEmy3o4tt5ssmx/eQiT5auc3u1w5JCmaTg/hqwwM9SmyecIBjbZUac9jzPZuSQZjNka5iuGmxzdgQgS2GjqtFNrNlpa88SmvDU+nILBxgcByMhgojnpXGe7qKLSLKbgSUkPT63uNOaGVr+ov4y0xmdcrrcxrVNXpNs8ySpdo2CDQqFoxydCOx/oRosDDR7ETYSekzVpkzCFQyi/WemL4tncb+yimFY8ecRBSiwAw3AVzzcn5v070EKm9Fg227rPfcHFhxybLNEcT7BUWxasmRsWL0FVuw8nqFmKVdDdaFgJ2G0fk
x/5NDpYgrpN2xTh1V/lphAxPjGfITbVcldFsABvaNB4jzKmICx9+vQpgqaq44MmKKwOEQaLNGidHPFrrXP2vQHGV4ht9tmq9Tnxe/2pQabrY/O/rHCQBTCEs6d0rmNydqkJClcGt0vef+WQAqF3STefIEeu78QxdOvg1FIMr2dWDmL5asqggpOoyCRvO7U2VGGSztxkb5otyS4rBcDdibEQgxFimkvhbvPWxOWMLddUzObJBu4Co5bpPDlzKVY0gedfxJ6S52Caw3TJfz5aXSjXfPUOURgnSWzFyWnJjYBHNAg6ftYAsDvndbc5jM22fjghiG3eNYLX9WYr7KICUZHU+mLebkqmdSaJJDPa3EnuR4wOHwSdyFFHLj44gduqedOmPMJ0OJDhFHbTkbYcaDFxjzoQCcq2MgCp8SFnBFYwTlmqrP4kVlwmJllic0KPCjH1hKUiO+1gYVXGmRBalyXVPRHajNGYPhZ/iMERrQM+Mz6bKk/J7BYwR/pqsirNJcUu/5h9LHqjLbXbWWShAip8MF7xxSoJWYk4Y0TzBuegtiyTG9KUWYaZiZj1sSGKWMp8BuHsaOZVt0mfNh0pusKG7kc6TopG7jQCIamSPh8gPMx18wAlFbs1OgHNzISLmP6XFEZT3yogPpTAswLzFYkIpSz/I0VbfvnSebhT3U2M9kh0qjyOOyVYrHZ2PDdV/tLYvUCPdKd0mePBvghLz5Ip/dIVn80q1rlKYLchRu5glEiuMs1xntxzDrmpsVysXyAzNVnqYdPlx1KPyc2e9EY9pjUA9TQVcZWekkSe46Rp9fOknTPIII1FO7pgbripjm+C+2QMtzhBnG1vG1/mc8ojL06wmM3CTXZ7/Ff6a1lZdq/MpPJqWdTo3hf7GTL6OX5drinaeH3YBQq8G1GZRngbfU+8MFdX7LltW5aikk+aLNe3ODF728BbFzhmY60BUZm/ZqIkB+yajpHJs/VaEK3tpOYwa0F0PlGw2ELfm4iNNeptN9t86/nqxVOXfvSyFVGjstVXvKVr1i9+mTce2Mx4bQlKwwZ94YHOTxH9nw5JXbwNRwzxhHZj7W5I6n0/TzGWbRcbT2x5os57fuVqtq/xudMa9yIXW7BddV+8JnwFFR+izOd81eAEaoMNay5aON1febAQLt5jSuqUprgASRGg1Vf5uRAr0dvnfueCm8mz7TQ6W2x6ZC6YCH+Uqp5c6bpKvwj1jx9KKnlhN0/E5HWG/Gz52VUeNlrhhNPGbElQH4vohdjwodTjY+DA8PFxgoSuDbn2VgiX6G2Ivwm9E0Ln3mF5JKFr6mB1BZrabV2BUOX4taUVDk+0gso5+6a4KGpiqRyv6dsgy7lAGCbq4T7R9wMrUoA8fl/X86D3zrZFOe9CqE+fvn7436X/5Yv2Ufvx12L1sH3YCWKbvJT3Mc3dRPuOeF4AP0c+bXle1QZ03IW4KWfG8wypt8J+LJ7vyHFXjcGKOFSj2yIOocrxcXWF52vs/S9BXStbB7sp3wAcOsHbN/MZhttbc3ZdoQ+b7K/B7eQwQghXuxPWXf9Nxmja4x8= +7V3Zctu4Ev0a1dx5kIr78uglM8nEN3Gub8rJk4siIQkxSahIyJby9QNwJwDTtMRNM5EfLIKbeLr7oLvRAGfqVbD/M3K2m/8iD/gzRfL2M/V6piiKrljkH205pC2WYqQN6wh6aZNcNtzBnyBrlLLWHfRAXDsQI+RjuK03uigMgYtrbU4Uoef6YSvk1++6ddaAa7hzHZ9vvYce3mRPoUtl+3sA15v8zrKU7Qmc/OCsId44HnquNKnvZupVhBBOvwX7K+BT8HJc0vP+eGFv8cMiEOI2J8T3xvbDbucCHF/LX/7Aj9bnn3MtvcqT4++yB54phk+ud7kkX9b0y0fgRYjeKUI/KMLZfnKj4pDsAfEhRy1Cu9AD9MYS2f28gRjcbR2X7n0mekLaNjjwyZZMvj6BCEOC+IUP1yFpw4gesIK+f4V8FCVXVD0dWJ5G2mMcoUdQ2WMpS9Uwil9RBSTDiN4A7CtNGUB/AhQAHB3IIbm+ZrI61JXwuZS8omZtm4rUjVzoTqZt6+LKpUDIl0wmYvlsHn4+BR+fvn0L3t8bf21d++n7p7nCySeOXHrJkxD3nHiTHJtv3DoYgyhMWhTpZaE42ZYPVlTuvrME/qXjPq6T2+dCCVFI7nuJyP0hplDqUjfi0Vjx8PKRbYF8VKsD+QjtRzbbG9AqcgLwjKLHkUwIyMSITJEJ2YapOv2YkGHonIw0XSAjWe9NRpyIbuEW+JAoqSJdgyfgoy2IOAEQvt7Sr7vAv3AxhaqA+Ybq/S2KIYaIwr1EGKPgRTlUBIV2mN74quitOrKMuWLXcVcFtiGmrtNRty4/32o3XzbvDPjV3QfB9zk8zFWVQxR4pGvNNlGEN2iNQsd/V7Ze1pW+POYGUSATBH8AjA+Zn+DsMKrjC/YQf6OnL/Rs63tlz/U+u3Kyccg3QvK8lZPo5vfqvvK0ZCs/r8aiL8owRrvIBQ3qmXXB2InWADccp2SAUhQbVSICvoPhU92B6V6+vOsQ7cJYKPTEXuqCynsSl+AFIoHpBNDzUp0AMfzpLJPrUdi3CIY4eRr9cqZfC7Fv1EjOqgrvMbtLzUETWpu0kGTNrBlc1lO3Bj+7+C19mvIqc3uRXzc/Ca1WMVEMVl7FrzqeGHVOggSH1cneRQd0Zuo1bDVBP68IyEzT++pC9FHJrEJlJbGJyawjUrJbkpI2FieJxWRwGr0GIYgcDCZCTI3adToxSYuch3I+kU+jpdypU+tXtYeiKJsT6IvudUyUlUgsHsm7Xq1WiuuKvGvPWBp6R961btTdPIv3roVuntafdy1xIto6IaGhOYwgb3VD9ySGbC7qnYmsCnqT3ARrmBVndg6acTbOcUf9SZG7e9XLtcfqUcQ+ozmmoORZ+46/jGJqMUwZ0vQbxbQWsDotAeeu5S8BdyZga1oC5tM/u3gq7mCjSnYSpxpS3tec6AKSa1kW41uSMFgdyhNUeDleISKTPeZE+YpX4cTbNPe2gntqGJ1kPeW6X2ZKvGNmCvwys7f026gR61t4rSt+Utt6GPKk+EnlY1Y3AtOJWBu1qxOKkq08l3giRYkYajB6Ujkx3oE4puMDE6AnvR4CDclOjV5+Ba08jPfhMnKyW/QcsVsuEEfsS0vXuhqzZMbDLEHELh4PU/sCX+VVlfSkK7guUitRnje5QY4nGBobX4NthQ/iB9VgwfBHOao4PlwykycaHy++m/vfjvQUk1Cu4rEngxZfVvCBXOjzFMAqiikmA5bF89nNh9M6kB5wMnVB3lE4itUbUHyv+x6hx/GTtCxUhj02VBrfR+YOCok4QUgduxNxa+ej6PRP5KMYyYeeQaLgSnv66We0wRAMxAqHG9Te4lpt3Mz5Mfk6eZR8nda2rqT7fF1y6kUUOYfKAVms+WLwpquso8yUs75yvGlKjHKlv6DTKE/jvb7/kL7md9J06+/W8FRO6MBmDbnuKVvGgFwqNln7l8m2M1m9rcl2PoZynMmyHo6tN5sse7w8hM
nyVU7v9jhySNM0nB9DVpgRatPkcwQDm+yow57HmexCkgzGbA3zFcNNtm5BBAlsNHXaqTUbLa15YkPeGh9OwWDrgwBkZDDRnHSus11UUWkWU/CkpJunVncaC0Orn9RfRlrjMy5Xu5jWqSvSTZ5kla5QsEWhULTjE6Gdd3SjxYEGD+I2Qk/JnLRJmMJLKL9Z6Yvi2dxv7KKYVjx4xEFKLADDcB0vtifm/TvQQqb0WM6ZYIjaYzGv8RqYJZrjCZZqy4I5c8PiJahq9+EENYvw3aKtn9cXVgJ2O8TkIR9HB0tQt2mbIrz6q9wUIsYn5jPEplruqggm4A0NGu9RxhSElU/vPkXQVHV80ASF1SHCYJkGrZMjfq11zr43wPgKse0hm7W+IH6vPzXIdH1s/pcVDrIAhnD+mI51TM4uNUHhyuB2yfuvHFIg9C7o4hNky/WdOIZuHZxaiuH1zMqLWL6aMqjgJCoyydtOrQ1VmKQzN9ibZkuy00oBcFdiLMRghJjmUrjLvDVxOWfLNRWzebCBO8GoZTpPzlyKFU3g+Rexp+Q5mOYwXfLPR+uZcsVX7xCFcZLEVpzsltwIeESDoONnDQC7C153m8PYbOmHE4LY5lUjeF1vtsIuKhAVSa1P5u2mZFpnkkgyo82d5H7E6PBB0IkcdeTkgxO4rZo3bcojTIcDGU6xmdr4thxoMXFPnpDtnQSlIUiNDzkjsIZxylJl9Sex4jIxyRKbE3pUiKknLBXZaQcLqzLOhNC6LKnuidDmjMb0MflDDI5oHvCZ8dlUeUpml4A50leTVWkhKXb5Ydax6Iu28kLpXmlL4YPxii9WSchKxBkjmjc4B7VlmdyQpswyzEjEvI8FUcRS5jMIZ0czr7pN+rTpSFFlNnQ/0nEiv2ahj0BIiqUtBvCkct18gZKK1RqdgGZmwmVM/yWF0dS3CogPJfCswGJNIkIpy/9I0Y6fvnQe7lR3A6M9Ep0qj+NOCSarnR3PTZW/NHYt0CPdKb2shuibsPQsmdIvXfHZrGKeqwT2W2LkDkaJ5CrDHOfJPeeQmxrLxfoHZKYmSz1suvxY6jG50ZPeqMe0BqCepiKu0lOSyH2cNK1+nrRzBhmksWhHF4wNN9XxTXCdjOEmJ4iz7W3jy3xMeeTJCRazWLjJLo//yvFaVpbdKzOpvFoWNbp3xXqGjH6OX5drihZeH3aCAu9GVIYR3kbfEy/M1RV7YduWpajkmybL9SVOzN4W8NYFjtlYc0BU5tNMlGSDndMxMnm2nguitR3UHGYuiM4nCpY76HsTsbFGve1mmW89n7146tSPXpYialS2+oy3dM767B/zxgOb6a8tQWnYoC880Pkhon9pl9TF23DEEE9oNdbuuqTe1/MUY9l2svHEpifqvOdXzmb7Gp87rXEvcrEFy1X3xWvCV1DxIcpiwVcNTqA22LAWoonT/ZUHC+HiO9+Hh8CB4cPDBOcgaa0Dub7gEr0/61c6uJk/W+ZWVXbN/yPTwZoy2EiUpnQ7EiVUOX42UiUbnGgFlXP2S3ExDM4OleMNfX9YmT2GYaIe7iN9o6QiBcjjVwKcXCK50SYnnEceYuj806evH/5/4X/5on3Ufvy1XN/v7veC3jAv/npIvf3oMEGeV/M1P4fgeSFuCofbL57viOcLj/FEnld1fSieV3WjU54Xqhyfu6rwfI29fxNUQrGVU9vynZGhE7x9+Yfhub3RDv9d3E42yxctp4eXr6tW3/0N diff --git a/docs/source/meta/images/autoreload.gif b/docs/source/meta/images/autoreload.gif new file mode 100644 index 0000000000..6f29479ceb Binary files /dev/null and b/docs/source/meta/images/autoreload.gif differ diff --git a/docs/source/meta/images/aws_create_iam_role.png b/docs/source/meta/images/aws_create_iam_role.png deleted file mode 100644 index d67166cf13..0000000000 Binary files a/docs/source/meta/images/aws_create_iam_role.png and /dev/null differ diff --git a/docs/source/meta/images/chart-icon.png b/docs/source/meta/images/chart-icon.png new file mode 100644 index 0000000000..0fd88f028d Binary files /dev/null and b/docs/source/meta/images/chart-icon.png differ diff --git a/docs/source/meta/images/coffee-cup.png b/docs/source/meta/images/coffee-cup.png new file mode 100644 index 0000000000..e394bb0dc1 Binary files /dev/null and b/docs/source/meta/images/coffee-cup.png differ diff --git a/docs/source/meta/images/collapsible.gif b/docs/source/meta/images/collapsible.gif new file mode 100644 index 0000000000..595003163e Binary files /dev/null and b/docs/source/meta/images/collapsible.gif differ diff --git a/docs/source/meta/images/cook_disjointed.png b/docs/source/meta/images/cook_disjointed.png new file mode 100644 index 0000000000..c53304352f Binary files /dev/null and b/docs/source/meta/images/cook_disjointed.png differ diff --git a/docs/source/meta/images/cook_joined.png b/docs/source/meta/images/cook_joined.png new file mode 100644 index 0000000000..2d40243fc4 Binary files /dev/null and b/docs/source/meta/images/cook_joined.png differ diff --git a/docs/source/meta/images/cook_namespaced.gif b/docs/source/meta/images/cook_namespaced.gif new file mode 100644 index 0000000000..5b1cc2b6ac Binary files /dev/null and b/docs/source/meta/images/cook_namespaced.gif differ diff 
--git a/docs/source/meta/images/cook_no_namespace.png b/docs/source/meta/images/cook_no_namespace.png new file mode 100644 index 0000000000..63f7746863 Binary files /dev/null and b/docs/source/meta/images/cook_no_namespace.png differ diff --git a/docs/source/meta/images/cook_params.png b/docs/source/meta/images/cook_params.png new file mode 100644 index 0000000000..6d98bb498e Binary files /dev/null and b/docs/source/meta/images/cook_params.png differ diff --git a/docs/source/meta/images/dask_diagnostics_dashboard.png b/docs/source/meta/images/dask_diagnostics_dashboard.png new file mode 100644 index 0000000000..34442fb9a3 Binary files /dev/null and b/docs/source/meta/images/dask_diagnostics_dashboard.png differ diff --git a/docs/source/meta/images/data_engineering_convention.png b/docs/source/meta/images/data_engineering_convention.png deleted file mode 100644 index fd3798310a..0000000000 Binary files a/docs/source/meta/images/data_engineering_convention.png and /dev/null differ diff --git a/docs/source/meta/images/databricks_authenticate_repo.png b/docs/source/meta/images/databricks_authenticate_repo.png new file mode 100644 index 0000000000..7dc2cc06a1 Binary files /dev/null and b/docs/source/meta/images/databricks_authenticate_repo.png differ diff --git a/docs/source/meta/images/databricks_conf_folder_creation.png b/docs/source/meta/images/databricks_conf_folder_creation.png new file mode 100644 index 0000000000..cbe416951b Binary files /dev/null and b/docs/source/meta/images/databricks_conf_folder_creation.png differ diff --git a/docs/source/meta/images/databricks_configure_job_cluster.png b/docs/source/meta/images/databricks_configure_job_cluster.png new file mode 100644 index 0000000000..4502ae38f1 Binary files /dev/null and b/docs/source/meta/images/databricks_configure_job_cluster.png differ diff --git a/docs/source/meta/images/databricks_configure_new_job.png b/docs/source/meta/images/databricks_configure_new_job.png new file mode 100644 index 0000000000..4445f7a479 Binary files /dev/null and b/docs/source/meta/images/databricks_configure_new_job.png differ diff --git a/docs/source/meta/images/databricks_create_job_cluster.png b/docs/source/meta/images/databricks_create_job_cluster.png new file mode 100644 index 0000000000..d429dabba3 Binary files /dev/null and b/docs/source/meta/images/databricks_create_job_cluster.png differ diff --git a/docs/source/meta/images/databricks_create_new_job.png b/docs/source/meta/images/databricks_create_new_job.png new file mode 100644 index 0000000000..279bc8be68 Binary files /dev/null and b/docs/source/meta/images/databricks_create_new_job.png differ diff --git a/docs/source/meta/images/databricks_edit_file.png b/docs/source/meta/images/databricks_edit_file.png new file mode 100644 index 0000000000..3f50a01e5e Binary files /dev/null and b/docs/source/meta/images/databricks_edit_file.png differ diff --git a/docs/source/meta/images/databricks_finished_notebook.png b/docs/source/meta/images/databricks_finished_notebook.png new file mode 100644 index 0000000000..2e2e0b9943 Binary files /dev/null and b/docs/source/meta/images/databricks_finished_notebook.png differ diff --git a/docs/source/meta/images/databricks_job_status.png b/docs/source/meta/images/databricks_job_status.png new file mode 100644 index 0000000000..2f84f5943c Binary files /dev/null and b/docs/source/meta/images/databricks_job_status.png differ diff --git a/docs/source/meta/images/databricks_new_repo_popup.png b/docs/source/meta/images/databricks_new_repo_popup.png new file mode 
100644 index 0000000000..623160f62b Binary files /dev/null and b/docs/source/meta/images/databricks_new_repo_popup.png differ diff --git a/docs/source/meta/images/databricks_notebook_creation.png b/docs/source/meta/images/databricks_notebook_creation.png new file mode 100644 index 0000000000..8498caac63 Binary files /dev/null and b/docs/source/meta/images/databricks_notebook_creation.png differ diff --git a/docs/source/meta/images/databricks_notebook_example.png b/docs/source/meta/images/databricks_notebook_example.png index f78fdebc5b..df792a04af 100644 Binary files a/docs/source/meta/images/databricks_notebook_example.png and b/docs/source/meta/images/databricks_notebook_example.png differ diff --git a/docs/source/meta/images/databricks_notebooks_workflow_finished_notebook.png b/docs/source/meta/images/databricks_notebooks_workflow_finished_notebook.png new file mode 100644 index 0000000000..6a594b47e2 Binary files /dev/null and b/docs/source/meta/images/databricks_notebooks_workflow_finished_notebook.png differ diff --git a/docs/source/meta/images/databricks_repo_creation.png b/docs/source/meta/images/databricks_repo_creation.png new file mode 100644 index 0000000000..dcfcaf4d9a Binary files /dev/null and b/docs/source/meta/images/databricks_repo_creation.png differ diff --git a/docs/source/meta/images/databricks_run_all.png b/docs/source/meta/images/databricks_run_all.png new file mode 100644 index 0000000000..80237bfbc5 Binary files /dev/null and b/docs/source/meta/images/databricks_run_all.png differ diff --git a/docs/source/meta/images/databricks_specify_github_repo.png b/docs/source/meta/images/databricks_specify_github_repo.png new file mode 100644 index 0000000000..0e63158052 Binary files /dev/null and b/docs/source/meta/images/databricks_specify_github_repo.png differ diff --git a/docs/source/meta/images/databricks_telemetry_consent.png b/docs/source/meta/images/databricks_telemetry_consent.png new file mode 100644 index 0000000000..b125ae1ac7 Binary files /dev/null and b/docs/source/meta/images/databricks_telemetry_consent.png differ diff --git a/docs/source/meta/images/databricks_viz_demo.png b/docs/source/meta/images/databricks_viz_demo.png new file mode 100644 index 0000000000..d14623c83f Binary files /dev/null and b/docs/source/meta/images/databricks_viz_demo.png differ diff --git a/docs/source/meta/images/databricks_viz_link.png b/docs/source/meta/images/databricks_viz_link.png new file mode 100644 index 0000000000..71a8fc9455 Binary files /dev/null and b/docs/source/meta/images/databricks_viz_link.png differ diff --git a/docs/source/meta/images/deployments.png b/docs/source/meta/images/deployments.png deleted file mode 100644 index 652c3f19cd..0000000000 Binary files a/docs/source/meta/images/deployments.png and /dev/null differ diff --git a/docs/source/meta/images/diffs-graphic.png b/docs/source/meta/images/diffs-graphic.png deleted file mode 100644 index 2c04df0dbf..0000000000 Binary files a/docs/source/meta/images/diffs-graphic.png and /dev/null differ diff --git a/docs/source/meta/images/example_azure_keyvault.png b/docs/source/meta/images/example_azure_keyvault.png new file mode 100644 index 0000000000..9ffaf1697c Binary files /dev/null and b/docs/source/meta/images/example_azure_keyvault.png differ diff --git a/docs/source/meta/images/expand-plot-comparison-view.gif b/docs/source/meta/images/expand-plot-comparison-view.gif new file mode 100644 index 0000000000..275c5137a6 Binary files /dev/null and b/docs/source/meta/images/expand-plot-comparison-view.gif differ 
diff --git a/docs/source/meta/images/experiment-tracking-compare-runs.png b/docs/source/meta/images/experiment-tracking-compare-runs.png new file mode 100644 index 0000000000..747c832bfb Binary files /dev/null and b/docs/source/meta/images/experiment-tracking-compare-runs.png differ diff --git a/docs/source/meta/images/experiment-tracking-folder.png b/docs/source/meta/images/experiment-tracking-folder.png new file mode 100644 index 0000000000..40c0c5f28c Binary files /dev/null and b/docs/source/meta/images/experiment-tracking-folder.png differ diff --git a/docs/source/meta/images/experiment-tracking-icon.png b/docs/source/meta/images/experiment-tracking-icon.png new file mode 100644 index 0000000000..3990a4782d Binary files /dev/null and b/docs/source/meta/images/experiment-tracking-icon.png differ diff --git a/docs/source/meta/images/experiment-tracking-metrics-comparison.gif b/docs/source/meta/images/experiment-tracking-metrics-comparison.gif new file mode 100644 index 0000000000..db868015a3 Binary files /dev/null and b/docs/source/meta/images/experiment-tracking-metrics-comparison.gif differ diff --git a/docs/source/meta/images/experiment-tracking-plots-comparison-expanded.png b/docs/source/meta/images/experiment-tracking-plots-comparison-expanded.png new file mode 100644 index 0000000000..7635d5e598 Binary files /dev/null and b/docs/source/meta/images/experiment-tracking-plots-comparison-expanded.png differ diff --git a/docs/source/meta/images/experiment-tracking-plots-comparison.png b/docs/source/meta/images/experiment-tracking-plots-comparison.png new file mode 100644 index 0000000000..f61e17cca2 Binary files /dev/null and b/docs/source/meta/images/experiment-tracking-plots-comparison.png differ diff --git a/docs/source/meta/images/experiment-tracking-runs-list.png b/docs/source/meta/images/experiment-tracking-runs-list.png new file mode 100644 index 0000000000..33e4600b4e Binary files /dev/null and b/docs/source/meta/images/experiment-tracking-runs-list.png differ diff --git a/docs/source/meta/images/experiment-tracking_demo.gif b/docs/source/meta/images/experiment-tracking_demo.gif new file mode 100644 index 0000000000..2a59c81a6a Binary files /dev/null and b/docs/source/meta/images/experiment-tracking_demo.gif differ diff --git a/docs/source/meta/images/final_conf_folder.png b/docs/source/meta/images/final_conf_folder.png new file mode 100644 index 0000000000..d5e1365013 Binary files /dev/null and b/docs/source/meta/images/final_conf_folder.png differ diff --git a/docs/source/meta/images/find_databricks_host_and_username.png b/docs/source/meta/images/find_databricks_host_and_username.png new file mode 100644 index 0000000000..409b7fe431 Binary files /dev/null and b/docs/source/meta/images/find_databricks_host_and_username.png differ diff --git a/docs/source/meta/images/focus_mode.png b/docs/source/meta/images/focus_mode.png new file mode 100644 index 0000000000..57f609a231 Binary files /dev/null and b/docs/source/meta/images/focus_mode.png differ diff --git a/docs/source/meta/images/icon-image-dataset.svg b/docs/source/meta/images/icon-image-dataset.svg new file mode 100644 index 0000000000..142c0090be --- /dev/null +++ b/docs/source/meta/images/icon-image-dataset.svg @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/docs/source/meta/images/jupyter_create_new_notebook.png b/docs/source/meta/images/jupyter_create_new_notebook.png deleted file mode 100644 index 476d30156c..0000000000 Binary files a/docs/source/meta/images/jupyter_create_new_notebook.png and /dev/null differ diff 
--git a/docs/source/meta/images/jupyter_new_notebook.png b/docs/source/meta/images/jupyter_new_notebook.png new file mode 100644 index 0000000000..9f6f57dcf5 Binary files /dev/null and b/docs/source/meta/images/jupyter_new_notebook.png differ diff --git a/docs/source/meta/images/jupyter_notebook_kedro_viz.png b/docs/source/meta/images/jupyter_notebook_kedro_viz.png deleted file mode 100644 index 35010a91cc..0000000000 Binary files a/docs/source/meta/images/jupyter_notebook_kedro_viz.png and /dev/null differ diff --git a/docs/source/meta/images/jupyter_notebook_loading_context.png b/docs/source/meta/images/jupyter_notebook_loading_context.png deleted file mode 100644 index b2d95f33a1..0000000000 Binary files a/docs/source/meta/images/jupyter_notebook_loading_context.png and /dev/null differ diff --git a/docs/source/meta/images/jupyter_notebook_showing_context.png b/docs/source/meta/images/jupyter_notebook_showing_context.png deleted file mode 100644 index b8707ea80d..0000000000 Binary files a/docs/source/meta/images/jupyter_notebook_showing_context.png and /dev/null differ diff --git a/docs/source/meta/images/jupyter_notebook_workflow_loading_data.png b/docs/source/meta/images/jupyter_notebook_workflow_loading_data.png deleted file mode 100644 index 944fddc4e7..0000000000 Binary files a/docs/source/meta/images/jupyter_notebook_workflow_loading_data.png and /dev/null differ diff --git a/docs/source/meta/images/jupyter_qtconsole.png b/docs/source/meta/images/jupyter_qtconsole.png new file mode 100644 index 0000000000..ac42e6ab6c Binary files /dev/null and b/docs/source/meta/images/jupyter_qtconsole.png differ diff --git a/docs/source/meta/images/jupyter_select_kernel.png b/docs/source/meta/images/jupyter_select_kernel.png new file mode 100644 index 0000000000..c2544419cf Binary files /dev/null and b/docs/source/meta/images/jupyter_select_kernel.png differ diff --git a/docs/source/meta/images/kedro_architecture.png b/docs/source/meta/images/kedro_architecture.png index 297049efc5..7db347260e 100644 Binary files a/docs/source/meta/images/kedro_architecture.png and b/docs/source/meta/images/kedro_architecture.png differ diff --git a/docs/source/meta/images/kedro_icon_no-type_blackbg.svg b/docs/source/meta/images/kedro_icon_no-type_blackbg.svg index c5f133c6a4..88bd7d0887 100644 --- a/docs/source/meta/images/kedro_icon_no-type_blackbg.svg +++ b/docs/source/meta/images/kedro_icon_no-type_blackbg.svg @@ -1 +1,3 @@ - + + + diff --git a/docs/source/meta/images/kedro_icon_no-type_whitebg.svg b/docs/source/meta/images/kedro_icon_no-type_whitebg.svg old mode 100755 new mode 100644 index 909aeff80b..88bd7d0887 --- a/docs/source/meta/images/kedro_icon_no-type_whitebg.svg +++ b/docs/source/meta/images/kedro_icon_no-type_whitebg.svg @@ -1 +1,3 @@ - + + + diff --git a/docs/source/meta/images/kedro_icon_type_blackbg.svg b/docs/source/meta/images/kedro_icon_type_blackbg.svg index 9f8925eefa..1334d10c96 100644 --- a/docs/source/meta/images/kedro_icon_type_blackbg.svg +++ b/docs/source/meta/images/kedro_icon_type_blackbg.svg @@ -1 +1,8 @@ - + + + + + + + + diff --git a/docs/source/meta/images/kedro_icon_type_whitebg.svg b/docs/source/meta/images/kedro_icon_type_whitebg.svg old mode 100755 new mode 100644 index 039ce4535a..de0fb805e4 --- a/docs/source/meta/images/kedro_icon_type_whitebg.svg +++ b/docs/source/meta/images/kedro_icon_type_whitebg.svg @@ -1 +1,8 @@ - + + + + + + + + diff --git a/docs/source/meta/images/kedro_viz_autoreload.gif b/docs/source/meta/images/kedro_viz_autoreload.gif new file mode 100644 
index 0000000000..52754b73bb Binary files /dev/null and b/docs/source/meta/images/kedro_viz_autoreload.gif differ diff --git a/docs/source/meta/images/kubeflow_pipelines_dag.png b/docs/source/meta/images/kubeflow_pipelines_dag.png deleted file mode 100644 index 88d2dc572d..0000000000 Binary files a/docs/source/meta/images/kubeflow_pipelines_dag.png and /dev/null differ diff --git a/docs/source/meta/images/kubeflow_pipelines_experiment_run.png b/docs/source/meta/images/kubeflow_pipelines_experiment_run.png deleted file mode 100644 index 6d575a478f..0000000000 Binary files a/docs/source/meta/images/kubeflow_pipelines_experiment_run.png and /dev/null differ diff --git a/docs/source/meta/images/kubeflow_pipelines_upload_pipeline.png b/docs/source/meta/images/kubeflow_pipelines_upload_pipeline.png deleted file mode 100644 index 31215491e5..0000000000 Binary files a/docs/source/meta/images/kubeflow_pipelines_upload_pipeline.png and /dev/null differ diff --git a/docs/source/meta/images/modular_ds.gif b/docs/source/meta/images/modular_ds.gif new file mode 100644 index 0000000000..d4843dd088 Binary files /dev/null and b/docs/source/meta/images/modular_ds.gif differ diff --git a/docs/source/meta/images/moon-rocket.png b/docs/source/meta/images/moon-rocket.png new file mode 100644 index 0000000000..09f4efc52f Binary files /dev/null and b/docs/source/meta/images/moon-rocket.png differ diff --git a/docs/source/meta/images/pipeline_show_metrics.gif b/docs/source/meta/images/pipeline_show_metrics.gif new file mode 100644 index 0000000000..57be22618b Binary files /dev/null and b/docs/source/meta/images/pipeline_show_metrics.gif differ diff --git a/docs/source/meta/images/pipeline_visualisation.png b/docs/source/meta/images/pipeline_visualisation.png index 082a2a5700..42cddbf12b 100644 Binary files a/docs/source/meta/images/pipeline_visualisation.png and b/docs/source/meta/images/pipeline_visualisation.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_icon.png b/docs/source/meta/images/pipeline_visualisation_icon.png new file mode 100644 index 0000000000..c63f489c29 Binary files /dev/null and b/docs/source/meta/images/pipeline_visualisation_icon.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_matplotlib.png b/docs/source/meta/images/pipeline_visualisation_matplotlib.png new file mode 100644 index 0000000000..fb78d51dfb Binary files /dev/null and b/docs/source/meta/images/pipeline_visualisation_matplotlib.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_matplotlib_expand.png b/docs/source/meta/images/pipeline_visualisation_matplotlib_expand.png new file mode 100644 index 0000000000..bbc0ce121b Binary files /dev/null and b/docs/source/meta/images/pipeline_visualisation_matplotlib_expand.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_metrics.png b/docs/source/meta/images/pipeline_visualisation_metrics.png new file mode 100644 index 0000000000..578494b198 Binary files /dev/null and b/docs/source/meta/images/pipeline_visualisation_metrics.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_plotly.png b/docs/source/meta/images/pipeline_visualisation_plotly.png new file mode 100644 index 0000000000..0a3416290d Binary files /dev/null and b/docs/source/meta/images/pipeline_visualisation_plotly.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_plotly_1.png b/docs/source/meta/images/pipeline_visualisation_plotly_1.png new file mode 100644 index 0000000000..de33027d79 Binary files /dev/null 
and b/docs/source/meta/images/pipeline_visualisation_plotly_1.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_plotly_expand.png b/docs/source/meta/images/pipeline_visualisation_plotly_expand.png new file mode 100644 index 0000000000..4c33ddf88b Binary files /dev/null and b/docs/source/meta/images/pipeline_visualisation_plotly_expand.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_plotly_expand_1.png b/docs/source/meta/images/pipeline_visualisation_plotly_expand_1.png new file mode 100644 index 0000000000..f9b8fc6a40 Binary files /dev/null and b/docs/source/meta/images/pipeline_visualisation_plotly_expand_1.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_tutorial.png b/docs/source/meta/images/pipeline_visualisation_tutorial.png new file mode 100644 index 0000000000..27430db586 Binary files /dev/null and b/docs/source/meta/images/pipeline_visualisation_tutorial.png differ diff --git a/docs/source/meta/images/pipeline_visualisation_with_layers.png b/docs/source/meta/images/pipeline_visualisation_with_layers.png index 0ef2b78fcb..45ff7438ca 100644 Binary files a/docs/source/meta/images/pipeline_visualisation_with_layers.png and b/docs/source/meta/images/pipeline_visualisation_with_layers.png differ diff --git a/docs/source/meta/images/prefect_2_flow_deployment.png b/docs/source/meta/images/prefect_2_flow_deployment.png new file mode 100644 index 0000000000..b3f725447f Binary files /dev/null and b/docs/source/meta/images/prefect_2_flow_deployment.png differ diff --git a/docs/source/meta/images/prefect_2_flow_details.png b/docs/source/meta/images/prefect_2_flow_details.png new file mode 100644 index 0000000000..c935e974a8 Binary files /dev/null and b/docs/source/meta/images/prefect_2_flow_details.png differ diff --git a/docs/source/meta/images/preview_datasets_expanded.png b/docs/source/meta/images/preview_datasets_expanded.png new file mode 100644 index 0000000000..fdf1f4ed49 Binary files /dev/null and b/docs/source/meta/images/preview_datasets_expanded.png differ diff --git a/docs/source/meta/images/preview_datasets_metadata.png b/docs/source/meta/images/preview_datasets_metadata.png new file mode 100644 index 0000000000..429f0eb6cf Binary files /dev/null and b/docs/source/meta/images/preview_datasets_metadata.png differ diff --git a/docs/source/meta/images/pycharm_ipython_starting_script.png b/docs/source/meta/images/pycharm_ipython_starting_script.png new file mode 100644 index 0000000000..4ff60a4ac2 Binary files /dev/null and b/docs/source/meta/images/pycharm_ipython_starting_script.png differ diff --git a/docs/source/meta/images/pycharm_ipython_working_example.png b/docs/source/meta/images/pycharm_ipython_working_example.png new file mode 100644 index 0000000000..f8abba9cbf Binary files /dev/null and b/docs/source/meta/images/pycharm_ipython_working_example.png differ diff --git a/docs/source/meta/images/simple_pipeline.png b/docs/source/meta/images/simple_pipeline.png new file mode 100644 index 0000000000..1c18196d97 Binary files /dev/null and b/docs/source/meta/images/simple_pipeline.png differ diff --git a/docs/source/meta/images/spark_delta_workflow.png b/docs/source/meta/images/spark_delta_workflow.png new file mode 100644 index 0000000000..66d71a7fc6 Binary files /dev/null and b/docs/source/meta/images/spark_delta_workflow.png differ diff --git a/docs/source/meta/images/typical_workflow.png b/docs/source/meta/images/typical_workflow.png deleted file mode 100644 index 9e12c6ea25..0000000000 Binary files 
a/docs/source/meta/images/typical_workflow.png and /dev/null differ diff --git a/docs/source/nodes_and_pipelines/index.md b/docs/source/nodes_and_pipelines/index.md new file mode 100644 index 0000000000..893b100fc3 --- /dev/null +++ b/docs/source/nodes_and_pipelines/index.md @@ -0,0 +1,13 @@ +# Nodes and pipelines + +```{toctree} +:maxdepth: 1 + +nodes +pipeline_introduction +modular_pipelines +pipeline_registry +micro_packaging +run_a_pipeline +slice_a_pipeline +``` diff --git a/docs/source/nodes_and_pipelines/micro_packaging.md b/docs/source/nodes_and_pipelines/micro_packaging.md new file mode 100644 index 0000000000..d72a0c3b4f --- /dev/null +++ b/docs/source/nodes_and_pipelines/micro_packaging.md @@ -0,0 +1,120 @@ +# Micro-packaging + +Micro-packaging allows users to share Kedro micro-packages across codebases, organisations and beyond. A micro-package can be any part of Python code in a Kedro project including pipelines and utility functions. + +## Package a micro-package + +You can package a micro-package by executing: `kedro micropkg package ` + +* This will generate a new [source distribution](https://docs.python.org/3/distutils/sourcedist.html) for this micro-package. +* By default, the tar file will be saved into `dist/` directory inside your project. +* You can customise the target with the `--destination` (`-d`) option. + +When you package your micro-package, such as a modular pipeline for example, Kedro will also automatically package files from 3 locations: + +```text +├── conf +│ └── base +│ └── parameters +│ └── {{pipeline_name*}} <-- All parameter file(s) +└── src + ├── my_project + │ ├── __init__.py + │ └── pipelines + │ └── {{pipeline_name}} <-- Pipeline folder + └── tests + ├── __init__.py + └── pipelines + └── {{pipeline_name}} <-- Pipeline tests +``` + +Kedro will also include any requirements found in `src//pipelines//requirements.txt` in the micro-package tar file. These requirements will later be taken into account when pulling a micro-package via `kedro micropkg pull`. + +```{note} +Kedro will not package the catalog config files even if those are present in `conf//catalog/.yml`. +``` + +If you plan to publish your packaged micro-package to some Python package repository like [PyPI](https://pypi.org/), you need to make sure that your micro-package name doesn't clash with any of the existing packages in that repository. However, there is no need to rename any of your source files if that is the case. Simply alias your package with a new name by running `kedro micropkg package --alias `. + +In addition to [PyPI](https://pypi.org/), you can also share the packaged tar file directly, or via a cloud storage such as AWS S3. + +## Package multiple micro-packages + +To package multiple micro-packages in bulk, run `kedro micropkg package --all`. This will package all micro-packages specified in the `tool.kedro.micropkg.package` manifest section of the project's `pyproject.toml` file: + +```toml +[tool.kedro.micropkg.package] +cleaning_utils = {alias = "aliased_util", destination = "somewhere/else", env = "uat"} +second_pipeline = {} +``` + +* The keys (`first_pipeline`, `second_pipeline`) are the names of the micro-package folders within the codebase. +* The values are the options accepted by the `kedro micropkg package ` CLI command. + +```{note} +Make sure `destination` is specified as a POSIX path even when working on a Windows machine. 
+``` + +```{note} +The examples above apply to any generic Python package, modular pipelines fall under this category and can be easily addressed via the `pipelines.pipeline_name` syntax. +``` + + +## Pull a micro-package + +You can pull a micro-package from a tar file by executing `kedro micropkg pull `. + +* The `` must either be a package name on PyPI or a path to the source distribution file. +* Kedro will unpack the tar file, and install the files in following locations in your Kedro project: + * All the micro-package code in `src///` + * Configuration files in `conf//parameters/.yml`, where `` defaults to `base`. + * To place parameters from a different config environment, run `kedro micropkg pull --env ` + * Unit tests in `src/tests/` +* Kedro will also parse any requirements packaged with the micro-package and add them to project level `requirements.in`. +* It is advised to compile an updated list of requirements after pulling a micro-package using [`pip-compile`](https://pypi.org/project/pip-tools/). + +```{note} +If a micro-package has embedded requirements and a project `requirements.in` file does not already exist, it will be generated based on the project `requirements.txt` before appending the micro-package requirements. +``` + +You can pull a micro-package from different locations, including local storage, PyPI and the cloud: + +| Operation | Command | +| ------------------------------ |--------------------------------------------------------------------------------------| +| Pulling from a local directory | `kedro micropkg pull dist/-0.1-py3-none-any.tar.gz` | +| Pull from cloud storage | `kedro micropkg pull s3://my_bucket/-0.1-py3-none-any.tar.gz` | +| Pull from PyPI-like endpoint | `kedro micropkg pull ` | + +### Providing `fsspec` arguments + +* If you are pulling the micro-package from a location that isn't PyPI, Kedro uses [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to locate and pull down your micro-package. +* You can use the `--fs-args` option to point to a YAML that contains the required configuration. + +```bash +kedro micropkg pull https:// --fs-args micropkg_pull_args.yml +``` + +```yaml +# `micropkg_pull_args.yml` +client_kwargs: + headers: + Authorization: token +``` + +## Pull multiple micro-packages + +* To pull multiple micro-packages in bulk, run `kedro micropkg pull --all`. +* This will pull and unpack all micro-packages specified in the `tool.kedro.micropkg.pull` manifest section of the project's `pyproject.toml` file: + +```toml +[tool.kedro.micropkg.pull] +"src/dist/first-pipeline-0.1-py3-none-any.tar.gz" = {} +"https://www.url.to/second-pipeline.tar.gz" = {alias = "aliased_pipeline", destination = "pipelines", fs-args = "pipeline_pull_args.yml"} +``` + +* The keys (tar references in this case) are the package paths +* The values are the options that `kedro micropkg pull ` CLI command accepts. + +```{warning} +As per the [TOML specification](https://toml.io/en/v1.0.0#keys), a key that contains any character outside `A-Za-z0-9_-` must be quoted. +``` diff --git a/docs/source/nodes_and_pipelines/modular_pipelines.md b/docs/source/nodes_and_pipelines/modular_pipelines.md new file mode 100644 index 0000000000..12d69509c6 --- /dev/null +++ b/docs/source/nodes_and_pipelines/modular_pipelines.md @@ -0,0 +1,295 @@ +# Modular pipelines + +## What are modular pipelines? + +In many typical Kedro projects, a single (“main”) pipeline increases in complexity as the project evolves. 
To keep your project fit for purpose, we recommend that you create modular pipelines, which are logically isolated and can be reused. Modular pipelines are easier to develop, test and maintain, and are portable so they can be copied and reused between projects. + +Modular pipelines allow you to instantiate pipelines multiple times, while allowing the user to override inputs/outputs/parameters. They are reusable within the same codebase, and shareable across projects via [micro-packaging](micro_packaging.md). This is the modern way to use Kedro, and will change the way you think about your pipelines. + +```{note} +The Kedro project visualised below is representative of one that might be seen in the real world. It takes full advantage of modular pipelines for `Data Ingestion`, `Feature Engineering`, `Reporting` and `Train Evaluation` (which even includes nested instances). +``` + +### Key concepts + +In this section, you will learn about how to take advantage of modular pipelines. The key points are listed below: + +1. **A modular pipeline is defined by its folder structure** + + * You can generate this file structure with the CLI command ``kedro pipeline create ``. + * The folder structure keeps things isolated and encourages portability. + +2. **Modular pipelines are designed to be portable and reusable** + + * It's possible to re-use the same pipeline multiple times within the same project (with different inputs/outputs or parameters). + * You can also share pipelines across codebases via [micro-packaging](micro_packaging.md). + +3. **The `kedro.pipeline.modular_pipeline.pipeline` wrapper method unlocks the real power of modular pipelines** + + * Applying [namespaces](https://en.wikipedia.org/wiki/Namespace) allows you to simplify your mental model and isolate 'within pipeline' processing steps. + * [Kedro-Viz](https://demo.kedro.org) accelerates development by rendering namespaced pipelines as collapsible 'super nodes'. + + +## How do I create a modular pipeline? + +You can use a [project-specific CLI command](../development/commands_reference.md#kedro-commands) to create a modular pipeline. The pipeline name must adhere to [Python convention](https://realpython.com/python-pep8/#naming-conventions). + +```bash +kedro pipeline create +``` + +```{note} +For the full list of available CLI options, you can always run `kedro pipeline create --help` for more information. +``` + +### What does `kedro pipeline create` do? + +Running the `kedro pipeline create` command adds boilerplate folders and files for the designated pipeline to your project. For your convenience, Kedro gives you a pipeline-specific `nodes.py`, `pipeline.py`, parameters file and appropriate `tests` structure. It also adds the appropriate `__init__.py` files. You can see the generated folder structure below: + +
    +Click to see the generated folder structure + +```text +├── conf +│ └── base +│ └── parameters +│ └── {{pipeline_name}}.yml <-- Pipeline-specific parameters +└── src + ├── my_project + │ ├── __init__.py + │ └── pipelines + │ ├── __init__.py + │ └── {{pipeline_name}} <-- This folder defines the modular pipeline + │ ├── README.md <-- Pipeline-specific documentation + │ ├── __init__.py <-- So that Python treats this pipeline as a module + │ ├── nodes.py <-- To declare your nodes + │ └── pipeline.py <-- To structure the pipeline itself + └── tests + ├── __init__.py + └── pipelines + ├── __init__.py + └── {{pipeline_name}} <-- Pipeline-specific tests + ├── __init__.py + └── test_pipeline.py + +``` + +
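+
+The generated `pipeline.py` exposes a `create_pipeline()` function that you populate with nodes, while `nodes.py` holds the functions those nodes wrap. As a rough sketch (the exact boilerplate Kedro generates varies by version, so treat the contents below as illustrative only):
+
+```python
+# src/my_project/pipelines/{{pipeline_name}}/pipeline.py -- illustrative sketch, not the exact template
+from kedro.pipeline import Pipeline, node, pipeline
+
+
+def create_pipeline(**kwargs) -> Pipeline:
+    # Add your nodes here, e.g. node(func=my_func, inputs="my_input", outputs="my_output")
+    return pipeline([])
+```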
    + +If you want to do the reverse and remove a modular pipeline, you can use ``kedro pipeline delete `` to do so. + +### Ensuring portability + +Modular pipelines are shareable between Kedro codebases via [micro-packaging](micro_packaging.md), but you must follow a couple of rules to ensure portability: + +* Modular pipelines should **not** depend on the main Python package, as this would break portability to another project. +* Catalog references are not packaged when sharing/consuming modular pipelines, i.e. the `catalog.yml` file is not packaged. +* Kedro will only look for top-level configuration in `conf/`; placing a configuration folder within the pipeline folder will have no effect. +* We recommend that you document the configuration required (parameters and catalog) in the local `README.md` file for any downstream consumers. + +### Providing modular pipeline specific dependencies + +* A modular pipeline **might** have external dependencies specified in a local `requirements.txt` file. +* Pipeline specific dependencies are scooped up during the [micro-packaging](micro_packaging.md) process. +* These dependencies are _not_ currently installed by the [`kedro install`](../development/commands_reference.md#install-all-package-dependencies) command, and must be manually installed. + +## Using the modular `pipeline()` wrapper to provide overrides + +This wrapper really unlocks the power of modular pipelines. + +* It allows you to start instantiating the same pipeline multiple times. +* These will be static in terms of structure, but dynamic in terms of `inputs`/`outputs`/`parameters`. +* It also allows you to simplify both your mental models, and Kedro-Viz visualisations via `namespaces`. + +```python +from kedro.pipeline.modular_pipeline import pipeline +``` + +The `pipeline()` wrapper method takes the following arguments: + +| Keyword argument | Description | +| ---------------- | ----------------------------------------------------------------------------------- | +| `pipe` | The `Pipeline` object you want to wrap | +| `inputs` | Any overrides provided to this instance of the underlying wrapped `Pipeline` object | +| `outputs` | Any overrides provided to this instance of the underlying wrapped `Pipeline` object | +| `parameters` | Any overrides provided to this instance of the underlying wrapped `Pipeline` object | +| `namespace` | The namespace that will be encapsulated by this pipeline instance | + +## Combining disconnected pipelines + +Sometimes two pipelines must be connected, but do not share any catalog dependencies. The wrapper can be used to solve that. + +
+Click here to see a worked example
+
+In this example, there is a `cook_pipeline`, which prepares our lunch, and a `lunch_pipeline`, which eats it. The 'verbs' (`defrost`, `grill` and `eat`) are Python functions, and the inputs/outputs are food at different points of the process (`frozen_veg`, `veg`, `grilled_veg` and `food`).
+
+```python
+cook_pipeline = pipeline(
+    [
+        node(func=defrost, inputs="frozen_veg", outputs="veg"),
+        node(func=grill, inputs="veg", outputs="grilled_veg"),
+    ]
+)
+
+lunch_pipeline = pipeline([node(func=eat, inputs="food", outputs=None)])
+
+cook_pipeline + lunch_pipeline
+```
+
+This combination will visualise, since it is valid pre-runtime, but it will not run: `food` is never produced, because the only output of the `cook_pipeline` is `grilled_veg`:
+
+![disjoined](../meta/images/cook_disjointed.png)
+
+* Combining `cook_pipeline + lunch_pipeline` will not work, since `food` doesn't exist as an output of the `cook_pipeline`.
+* In this case, we need to map `grilled_veg` to the expected input `food`.
+
+The wrapper allows us to provide this mapping and fix the disconnect.
+
+```python
+from kedro.pipeline.modular_pipeline import pipeline
+
+prep_pipeline = pipeline(pipe=cook_pipeline, outputs={"grilled_veg": "food"})
+
+meal_pipeline = prep_pipeline + lunch_pipeline
+```
+
+Providing this input/output override joins up the pipeline nicely:
+
+![joined](../meta/images/cook_joined.png)
+
+```{note}
+In this example we have used the `+` operator to join two pipelines. You can also use `sum()` or pass a list of pipelines to the `pipe` argument, as shown below.
+```
+
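+For instance, reusing `prep_pipeline` and `lunch_pipeline` from the worked example above, the following sketch shows equivalent ways to combine them:
+
+```python
+# Equivalent ways to combine the two pipelines once `grilled_veg` is remapped to `food`
+meal_pipeline = prep_pipeline + lunch_pipeline  # the `+` operator
+meal_pipeline = sum([prep_pipeline, lunch_pipeline])  # `sum()` over a list of pipelines
+meal_pipeline = pipeline([prep_pipeline, lunch_pipeline])  # a list passed to the `pipe` argument
+```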
    + +## Using a modular pipeline multiple times + +Reusing pipelines for slightly different purposes can be a real accelerator for teams and organisations when they reach a certain scale. In the real world, one could imagine pipelines with responsibilities like profiling or feature engineering being reused within the same project or even across projects via [micro-packaging](micro_packaging.md). + +* In an ideal world, we would like to use the `cook_pipeline` twice as you would `defrost` and `grill` multiple meals beyond the `veg` currently hard-coded. +* Namespaces allow you to instantiate the same pipeline multiple times and keep operations isolated. +* Like one provides arguments to a class' constructor, you can provide overriding inputs/outputs/parameters to the `pipeline()` wrapper. + +```{note} +The set of overriding inputs and outputs must be a subset of the reused pipeline's "free" inputs and outputs, respectively. A free input is an input that isn't generated by a node in the pipeline, while a free output is an output that isn't consumed by a node in the pipeline. {py:meth}`Pipeline.inputs() ` and {py:meth}`Pipeline.outputs() ` can be used to list a pipeline's free inputs and outputs, respectively. +``` + +
    +Click here to see a worked example + +```python +cook_pipeline = pipeline( + [ + node(func=defrost, inputs="frozen_veg", outputs="veg", name="defrost_node"), + node(func=grill, inputs="veg", outputs="grilled_veg"), + ] +) + +eat_breakfast_pipeline = pipeline( + [node(func=eat_breakfast, inputs="breakfast_food", outputs=None)] +) +eat_lunch_pipeline = pipeline([node(func=eat_lunch, inputs="lunch_food", outputs=None)]) + +cook_pipeline + eat_breakfast_pipeline + eat_lunch_pipeline +``` + +If we visualise the snippet above, we see a disjointed pipeline: + +* We need to "defrost" two different types of food via different pipelines. +* We cannot use the `cook_pipeline` twice because the internal dataset names will conflict. +* Mapping all datasets via the `pipeline()` wrapper will also cause conflicts. + +![cook no namespace](../meta/images/cook_no_namespace.png) + +Adding namespaces solves this issue: + +```python +cook_breakfast_pipeline = pipeline( + pipe=cook_pipeline, + inputs="frozen_veg", # inputs stay the same, don't namespace + outputs={"grilled_veg": "breakfast_food"}, + namespace="breakfast", +) +cook_lunch_pipeline = pipeline( + pipe=cook_pipeline, + inputs="frozen_veg", # inputs stay the same, don't namespace + outputs={"grilled_veg": "lunch_food"}, + namespace="lunch", +) + +final_pipeline = ( + cook_breakfast_pipeline + + eat_breakfast_pipeline + + cook_lunch_pipeline + + eat_lunch_pipeline +) +``` + +* `namespace="lunch"` renames all datasets and nodes, prefixing them with `"lunch."`. +* The datasets that we explicitly "freeze" (`frozen_veg`) or remap (`grilled_veg`) are not affected/prefixed. +* Remapping free outputs is required since "breakfast_food" and "lunch_food" are the names expected by the `eat_breakfast_pipeline` and `eat_lunch_pipeline` respectively. +* The resulting pipeline now has two separate nodes, `breakfast.defrost_node` and `lunch.defrost_node`. +* Also two separate datasets `breakfast.veg` and `lunch.veg` connect the nodes inside the pipelines, causing no confusion between them. + +![namespaced](../meta/images/cook_namespaced.gif) + +* Visualising the `final_pipeline` highlights how namespaces become 'super nodes' which encapsulate the wrapped pipeline. +* This example demonstrates how we can reuse the same `cook_pipeline` with slightly different arguments. +* Namespaces can also be arbitrarily nested with the `.` character. +* `kedro run --namespace=` could be used to only run nodes with a specific namespace. + +```{note} +`parameters` references will not be namespaced, but `params:` references will. +``` +
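+
+For example, once the namespaced pipelines above are registered in your project, you could run a single namespace on its own (assuming the `--namespace` option is available in your version of Kedro):
+
+```bash
+kedro run --namespace=breakfast
+```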
+
+## How to use a modular pipeline with different parameters
+
+Mapping parameter values is very similar to the way we map inputs and outputs.
+
    +Click here to see a worked example + +* We instantiate the `template_pipeline` twice, but pass in different parameters. +* `input1` and `input2` are 'frozen' and thus shared in both instances. +* `params:override_me` does not actually exist and is designed to be overridden in both cases. +* Providing a namespace isolates the intermediate operation and visualises nicely. + +```python +template_pipeline = pipeline( + [ + node( + func=node_func1, + inputs=["input1", "input2", "params:override_me"], + outputs="intermediary_output", + ), + node( + func=node_func2, + inputs="intermediary_output", + outputs="output", + ), + ] +) + +alpha_pipeline = pipeline( + pipe=template_pipeline, + inputs={"input1", "input2"}, + parameters={"params:override_me": "params:alpha"}, + namespace="alpha", +) + +beta_pipeline = pipeline( + pipe=template_pipeline, + inputs={"input1", "input2"}, + parameters={"params:override_me": "params:beta"}, + namespace="beta", +) + +final_pipeline = alpha_pipeline + beta_pipeline +``` + +
    + +![namespaced_params](../meta/images/cook_params.png) diff --git a/docs/source/nodes_and_pipelines/nodes.md b/docs/source/nodes_and_pipelines/nodes.md new file mode 100644 index 0000000000..825b4eaf0a --- /dev/null +++ b/docs/source/nodes_and_pipelines/nodes.md @@ -0,0 +1,341 @@ +# Nodes + +In this section, we introduce the concept of a node, for which the relevant API documentation is [kedro.pipeline.node](/kedro.pipeline.node). + +Nodes are the building blocks of pipelines, and represent tasks. Pipelines are used to combine nodes to build workflows, which range from simple machine learning workflows to end-to-end (E2E) production workflows. + +You must first import libraries from Kedro and other standard tools to run the code snippets demonstrated below. + +```python +from kedro.pipeline import * +from kedro.io import * +from kedro.runner import * + +import pickle +import os +``` + +## How to create a node + +A node is created by specifying a function, input variable names and output variable names. Let's consider a simple function that adds two numbers: + +```python +def add(x, y): + return x + y +``` + +The function has two inputs (`x` and `y`) and a single output (the sum of the inputs). + +Here is how a node is created with this function: + +```python +adder_node = node(func=add, inputs=["a", "b"], outputs="sum") +adder_node +``` + +Here is the output: + +```console +Out[1]: Node(add, ['a', 'b'], 'sum', None) +``` + +You can also add labels to nodes, which will be used to describe them in logs: + +```python +adder_node = node(func=add, inputs=["a", "b"], outputs="sum") +print(str(adder_node)) + +adder_node = node(func=add, inputs=["a", "b"], outputs="sum", name="adding_a_and_b") +print(str(adder_node)) +``` + +This gives the following output: + +```console +add([a,b]) -> [sum] +adding_a_and_b: add([a,b]) -> [sum] +``` + +Let's break down the node definition: + +* `add` is the Python function that will execute when the node runs +* `['a', 'b']` specify the input variable names +* `sum` specifies the return variable name. The value returned by `add` will be bound in this variable +* `name` is an optional label for the node, which can be used to provide description of the business logic it provides + +### Node definition syntax + +A syntax describes function inputs and outputs. This syntax allows different Python functions to be reused in nodes, and supports dependency resolution in pipelines. + +### Syntax for input variables + +| Input syntax | Meaning | Example function parameters | How function is called when node runs | +| -------------------------- | --------------- | --------------------------- | ------------------------------------- | +| `None` | No input | `def f()` | `f()` | +| `'a'` | Single input | `def f(arg1)` | `f(a)` | +| `['a', 'b']` | Multiple inputs | `def f(arg1, arg2)` | `f(a, b)` | +| `dict(arg1='x', arg2='y')` | Keyword inputs | `def f(arg1, arg2)` | `f(arg1=x, arg2=y)` | + +### Syntax for output variables + +| Output syntax | Meaning | Example return statement | +| -------------------------- | ----------------- | ----------------------------------- | +| `None` | No output | Does not return | +| `'a'` | Single output | `return a` | +| `['a', 'b']` | List output | `return [a, b]` | +| `dict(key1='a', key2='b')` | Dictionary output | `return dict(key1=a, key2=b)` | + +Any combinations of the above are possible, except nodes of the form `node(f, None, None)` (at least a single input or output must be provided). 
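+
+For example, the following sketch combines keyword inputs with a dictionary output (the function and dataset names are purely illustrative):
+
+```python
+from kedro.pipeline import node
+
+
+# A sketch combining keyword inputs with a dictionary output; dataset names are illustrative
+def summarise(values, factor):
+    total = sum(values) * factor
+    return dict(total=total, count=len(values))
+
+
+summary_node = node(
+    func=summarise,
+    inputs=dict(values="xs", factor="scale"),  # keyword inputs: argument name -> dataset name
+    outputs=dict(total="scaled_total", count="n"),  # dict output: returned key -> dataset name
+)
+```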
+ +## `**kwargs`-only node functions + +Sometimes, when creating reporting nodes for instance, you need to know the names of the datasets that your node receives, but you might not have this information in advance. This can be solved by defining a `**kwargs`-only function: + +```python +def reporting(**kwargs): + result = [] + for name, data in kwargs.items(): + res = example_report(name, data) + result.append(res) + return combined_report(result) +``` + +Then, when it comes to constructing the `Node`, simply pass a dictionary to the node inputs: + +```python +from kedro.pipeline import node + + +uk_reporting_node = node( + reporting, + inputs={"uk_input1": "uk_input1", "uk_input2": "uk_input2", ...}, + outputs="uk", +) + +ge_reporting_node = node( + reporting, + inputs={"ge_input1": "ge_input1", "ge_input2": "ge_input2", ...}, + outputs="ge", +) +``` + +Alternatively, you can also make use of a helper function that creates the mapping for you, so you can reuse it across your codebase. + +```diff + from kedro.pipeline import node + + ++mapping = lambda x: {k: k for k in x} ++ + uk_reporting_node = node( + reporting, +- inputs={"uk_input1": "uk_input1", "uk_input2": "uk_input2", ...}, ++ inputs=mapping(["uk_input1", "uk_input2", ...]), + outputs="uk", + ) + + ge_reporting_node = node( + reporting, +- inputs={"ge_input1": "ge_input1", "ge_input2": "ge_input2", ...}, ++ inputs=mapping(["ge_input1", "ge_input2", ...]), + outputs="ge", + ) +``` + + +## How to tag a node + +Tags might be useful to run part of a pipeline without changing the code. For instance, `kedro run --tag=ds` will only run nodes that have a `ds` tag attached. + +To tag a node, you can simply specify the `tags` argument: + +```python +node(func=add, inputs=["a", "b"], outputs="sum", name="adding_a_and_b", tags="node_tag") +``` + +Moreover, you can [tag all nodes in a `Pipeline`](./pipeline_introduction.md#how-to-tag-a-pipeline). If the pipeline definition contains the `tags=` argument, Kedro will attach the corresponding tag to every node within that pipeline. + +To run a pipeline using a tag: + +```bash +kedro run --tag=pipeline_tag +``` + +This will run only the nodes found within the pipeline tagged with `pipeline_tag`. + + +## How to run a node + +To run a node, you must instantiate its inputs. In this case, the node expects two inputs: + +```python +adder_node.run(dict(a=2, b=3)) +``` + +The output is as follows: + +```console +Out[2]: {'sum': 5} +``` + +```{note} +You can also call a node as a regular Python function: `adder_node(dict(a=2, b=3))`. This will call `adder_node.run(dict(a=2, b=3))` behind the scenes. +``` + +## How to use generator functions in a node + +[Generator functions](https://learnpython.org/en/Generators) were introduced with [PEP 255](https://www.python.org/dev/peps/pep-0255). They are a special kind of function that returns lazy iterators but do not store their entire contents in memory all at once. + +The following code uses a `pandas chunksize` generator to process large datasets within the [`pandas-iris` starter](../kedro_project_setup/starters.md). First set up a project by following the [get started guide](../get_started/new_project.md#create-a-new-project-containing-example-code) to create a Kedro project with the `pandas-iris` starter example code. + +Create a [custom dataset](../extend_kedro/custom_datasets.md) called `ChunkWiseCSVDataSet` in `src/YOUR_PROJECT_NAME/extras/datasets/chunkwise_dataset.py` for your `pandas-iris` project. 
This dataset is a simplified version of `pandas.CSVDataSet`; the main change is to the `_save` method, which should save the data in append-or-create mode, `a+`.
+
+Click to expand
+
+```python
+from copy import deepcopy
+from io import BytesIO
+from pathlib import PurePosixPath
+from typing import Any, Dict
+
+import fsspec
+import pandas as pd
+
+from kedro.io.core import (
+    AbstractVersionedDataSet,
+    Version,
+    get_filepath_str,
+    get_protocol_and_path,
+)
+
+
+class ChunkWiseCSVDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]):
+    """``ChunkWiseCSVDataSet`` loads/saves data from/to a CSV file using an underlying
+    filesystem. It uses pandas to handle the CSV file.
+    """
+
+    DEFAULT_LOAD_ARGS = {}  # type: Dict[str, Any]
+    DEFAULT_SAVE_ARGS = {"index": False}  # type: Dict[str, Any]
+
+    def __init__(
+        self,
+        filepath: str,
+        load_args: Dict[str, Any] = None,
+        save_args: Dict[str, Any] = None,
+        version: Version = None,
+        credentials: Dict[str, Any] = None,
+        fs_args: Dict[str, Any] = None,
+    ) -> None:
+        """Creates a new instance of ``ChunkWiseCSVDataSet`` pointing to a concrete CSV file
+        on a specific filesystem.
+        """
+        _fs_args = deepcopy(fs_args) or {}
+        _credentials = deepcopy(credentials) or {}
+
+        protocol, path = get_protocol_and_path(filepath, version)
+        if protocol == "file":
+            _fs_args.setdefault("auto_mkdir", True)
+
+        self._protocol = protocol
+        self._storage_options = {**_credentials, **_fs_args}
+        self._fs = fsspec.filesystem(self._protocol, **self._storage_options)
+
+        super().__init__(
+            filepath=PurePosixPath(path),
+            version=version,
+            exists_function=self._fs.exists,
+            glob_function=self._fs.glob,
+        )
+
+        # Handle default load and save arguments
+        self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
+        if load_args is not None:
+            self._load_args.update(load_args)
+        self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
+        if save_args is not None:
+            self._save_args.update(save_args)
+
+    def _describe(self) -> Dict[str, Any]:
+        return {
+            "filepath": self._filepath,
+            "protocol": self._protocol,
+            "load_args": self._load_args,
+            "save_args": self._save_args,
+            "version": self._version,
+        }
+
+    def _load(self) -> pd.DataFrame:
+        load_path = str(self._get_load_path())
+        return pd.read_csv(load_path, **self._load_args)
+
+    def _save(self, data: pd.DataFrame) -> None:
+        save_path = get_filepath_str(self._get_save_path(), self._protocol)
+
+        buf = BytesIO()
+        data.to_csv(path_or_buf=buf, **self._save_args)
+
+        # Append the serialised chunk to the target file (append-or-create mode)
+        with self._fs.open(save_path, mode="a+") as fs_file:
+            fs_file.write(buf.getvalue())
+```
+
    + +Modify `example_iris_data` in `catalog.yml` by changing `type` to the custom dataset you created above. Add `chunksize: 100` to `load_args` which will return an iterable object. The `chunksize` parameter refers to the number of rows in each chunk. + +```yaml +example_iris_data: + type: YOUR_PROJECT_NAME.extras.datasets.chunkwise_dataset.ChunkWiseCSVDataSet + filepath: data/01_raw/iris.csv + load_args: + chunksize: 100 +``` + +Next, in `nodes.py` we repurpose the existing `split_data` function to process chunk-wise data: + +```python +def split_data( + data: pd.DataFrame, parameters: Dict[str, Any] +) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: + """Splits data into features and target training and test sets. + + Args: + data: Data containing features and target. + parameters: Parameters defined in parameters.yml. + Returns: + Split data. + """ + # Loop through data in chunks building up the training and test sets + for chunk in data: # Iterate over the chunks from data + full_data = pd.concat( + [chunk] + ) # Converts the TextFileReader object into list of DataFrames + data_train = full_data.sample( + frac=parameters["train_fraction"], random_state=parameters["random_state"] + ) + data_test = full_data.drop(data_train.index) + + X_train = data_train.drop(columns=parameters["target_column"]) + X_test = data_test.drop(columns=parameters["target_column"]) + y_train = data_train[parameters["target_column"]] + y_test = data_test[parameters["target_column"]] + yield X_train, X_test, y_train, y_test # Use yield instead of return to get the generator object +``` + +We can now `kedro run` in the terminal. The output shows `X_train`, `X_test`, `y_train`, `y_test` saved in chunks: + +``` +... +[02/10/23 12:42:55] INFO Loading data from 'example_iris_data' (ChunkWiseCSVDataSet)... data_catalog.py:343 + INFO Loading data from 'parameters' (MemoryDataSet)... data_catalog.py:343 + INFO Running node: split: split_data([example_iris_data,parameters]) -> node.py:329 + [X_train,X_test,y_train,y_test] + INFO Saving data to 'X_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'X_test' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'y_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'y_test' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'X_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'X_test' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'y_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'y_test' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 1 out of 3 tasks sequential_runner.py:85 +... +``` diff --git a/docs/source/06_nodes_and_pipelines/02_pipeline_introduction.md b/docs/source/nodes_and_pipelines/pipeline_introduction.md similarity index 77% rename from docs/source/06_nodes_and_pipelines/02_pipeline_introduction.md rename to docs/source/nodes_and_pipelines/pipeline_introduction.md index a71d2fa2c7..f13c5b1652 100644 --- a/docs/source/06_nodes_and_pipelines/02_pipeline_introduction.md +++ b/docs/source/nodes_and_pipelines/pipeline_introduction.md @@ -1,16 +1,12 @@ # Pipelines -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - -We previously introduced [Nodes](./01_nodes.md) as building blocks that represent tasks, and which can be combined in a pipeline to build your workflow. 
A pipeline organises the dependencies and execution order of your collection of nodes, and connects inputs and outputs while keeping your code modular. The pipeline determines the node execution order by resolving dependencies and does *not* necessarily run the nodes in the order in which they are passed in. +We previously introduced [Nodes](./nodes.md) as building blocks that represent tasks, and can be combined in a pipeline to build your workflow. A pipeline organises the dependencies and execution order of your collection of nodes, and connects inputs and outputs while keeping your code modular. The pipeline resolves dependencies to determine the node execution order, and does *not* necessarily run the nodes in the order in which they are passed in. To benefit from Kedro's automatic dependency resolution, you can chain your nodes into a [pipeline](/kedro.pipeline.Pipeline), which is a list of nodes that use a shared set of variables. ## How to build a pipeline -In the following example, we construct a simple pipeline that computes the variance of a set of numbers. In practice, pipelines can use more complicated node definitions and the variables they use usually correspond to entire datasets: +In the following example, we construct a simple pipeline that computes the variance of a set of numbers. In practice, pipelines can use more complicated node definitions, and the variables they use usually correspond to entire datasets:
    Click to expand @@ -22,14 +18,14 @@ def mean(xs, n): def mean_sos(xs, n): - return sum(x ** 2 for x in xs) / n + return sum(x**2 for x in xs) / n def variance(m, m2): return m2 - m * m -pipeline = Pipeline( +variance_pipeline = pipeline( [ node(len, "xs", "n"), node(mean, ["xs", "n"], "m", name="mean_node"), @@ -40,13 +36,13 @@ pipeline = Pipeline( ```
    -You can use `describe` to understand what nodes are part of the pipeline: +You can use `describe` to discover what nodes are part of the pipeline:
    Click to expand ```python -print(pipeline.describe()) +print(variance_pipeline.describe()) ``` The output is as follows: @@ -71,7 +67,7 @@ Outputs: v You can also tag your pipeline by providing the `tags` argument, which will tag all of the pipeline's nodes. In the following example, both nodes are tagged with `pipeline_tag`. ```python -pipeline = Pipeline( +pipeline = pipeline( [node(..., name="node1"), node(..., name="node2")], tags="pipeline_tag" ) ``` @@ -79,7 +75,7 @@ pipeline = Pipeline( You can combine pipeline tagging with node tagging. In the following example, `node1` and `node2` are tagged with `pipeline_tag`, while `node2` also has a `node_tag`. ```python -pipeline = Pipeline( +pipeline = pipeline( [node(..., name="node1"), node(..., name="node2", tags="node_tag")], tags="pipeline_tag", ) @@ -95,15 +91,15 @@ You can merge multiple pipelines as shown below. Note that, in this case, `pipel ```python -pipeline_de = Pipeline([node(len, "xs", "n"), node(mean, ["xs", "n"], "m")]) +pipeline_de = pipeline([node(len, "xs", "n"), node(mean, ["xs", "n"], "m")]) -pipeline_ds = Pipeline( +pipeline_ds = pipeline( [node(mean_sos, ["xs", "n"], "m2"), node(variance, ["m", "m2"], "v")] ) last_node = node(print, "v", None) -pipeline_all = Pipeline([pipeline_de, pipeline_ds, last_node]) +pipeline_all = pipeline([pipeline_de, pipeline_ds, last_node]) print(pipeline_all.describe()) ``` @@ -134,7 +130,7 @@ Pipelines provide access to their nodes in a topological order to enable custom Click to expand ```python -nodes = pipeline.nodes +nodes = variance_pipeline.nodes nodes ``` @@ -166,7 +162,7 @@ You should see the following: In a similar way to the above, you can use `inputs()` and `outputs()` to check the inputs and outputs of a pipeline: ```python -pipeline.inputs() +variance_pipeline.inputs() ``` Gives the following: @@ -176,7 +172,7 @@ Out[7]: {'xs'} ``` ```python -pipeline.outputs() +variance_pipeline.outputs() ``` Displays the output: @@ -192,14 +188,14 @@ A pipelines can usually readily resolve its dependencies. In some cases, resolut ### Pipeline with bad nodes -In this case we have a pipeline consisting of a single node with no input and output: +In this case, we have a pipeline consisting of a single node with no input and output:
    Click to expand ```python try: - Pipeline([node(lambda: print("!"), None, None)]) + pipeline([node(lambda: print("!"), None, None)]) except Exception as e: print(e) ``` @@ -224,7 +220,7 @@ The first node captures the relationship of how to calculate `y` from `x` and th ```python try: - Pipeline( + pipeline( [ node(lambda x: x + 1, "x", "y", name="first node"), node(lambda y: y - 1, "y", "x", name="second node"), diff --git a/docs/source/nodes_and_pipelines/pipeline_registry.md b/docs/source/nodes_and_pipelines/pipeline_registry.md new file mode 100644 index 0000000000..adeabd7906 --- /dev/null +++ b/docs/source/nodes_and_pipelines/pipeline_registry.md @@ -0,0 +1,88 @@ +# The pipeline registry + +Projects generated using Kedro 0.17.2 or later define their pipelines in `src//pipeline_registry.py`. This, in turn, populates the `pipelines` variable in [`kedro.framework.project`](/kedro.framework.project) that the Kedro CLI and plugins use to access project pipelines. The `pipeline_registry` module must contain a top-level `register_pipelines()` function that returns a mapping from pipeline names to [`Pipeline`](/kedro.pipeline.Pipeline) objects. For example, the [pipeline registry in the Kedro starter for the completed spaceflights tutorial](https://github.com/kedro-org/kedro-starters/blob/0.18.2/spaceflights/%7B%7B%20cookiecutter.repo_name%20%7D%7D/src/%7B%7B%20cookiecutter.python_package%20%7D%7D/pipeline_registry.py) could define the following `register_pipelines()` function that exposes the data processing pipeline, the data science pipeline, and a third, default pipeline that combines both of the aforementioned pipelines: + +```python +import spaceflights.pipelines.data_processing as dp +import spaceflights.pipelines.data_science as ds + + +def register_pipelines() -> Dict[str, Pipeline]: + """Register the project's pipelines. + + Returns: + A mapping from pipeline names to ``Pipeline`` objects. + """ + data_processing_pipeline = dp.create_pipeline() + data_science_pipeline = ds.create_pipeline() + + return { + "__default__": data_processing_pipeline + data_science_pipeline, + "data_processing": data_processing_pipeline, + "data_science": data_science_pipeline, + } +``` + +As a reminder, [running `kedro run` without the `--pipeline` option runs the default pipeline](./run_a_pipeline.md#run-a-pipeline-by-name). + +```{note} +The order in which you add the pipelines together is not significant (`data_science_pipeline + data_processing_pipeline` would produce the same result), since Kedro automatically detects the data-centric execution order for all the nodes in the resulting pipeline. +``` + +## Pipeline autodiscovery + +In the above example, you need to update the `register_pipelines()` function whenever you create a pipeline that should be returned as part of the project's pipelines. Since Kedro 0.18.3, you can achieve the same result with less code using [`find_pipelines()`](/kedro.framework.project.find_pipelines). The [updated pipeline registry](https://github.com/kedro-org/kedro-starters/blob/main/spaceflights/%7B%7B%20cookiecutter.repo_name%20%7D%7D/src/%7B%7B%20cookiecutter.python_package%20%7D%7D/pipeline_registry.py) contains no project-specific code: + +```python +def register_pipelines() -> Dict[str, Pipeline]: + """Register the project's pipelines. + + Returns: + A mapping from pipeline names to ``Pipeline`` objects. 
+ """ + pipelines = find_pipelines() + pipelines["__default__"] = sum(pipelines.values()) + return pipelines +``` + +Under the hood, the `find_pipelines()` function traverses the `src//pipelines/` directory and returns a mapping from pipeline directory name to [`Pipeline`](/kedro.pipeline.Pipeline) object by: + +1. Importing the `.pipelines.` module +2. Calling the `create_pipeline()` function exposed by the `.pipelines.` module +3. Validating that the constructed object is a [`Pipeline`](/kedro.pipeline.Pipeline) + +If any of these steps fail, `find_pipelines()` raises an appropriate warning and skips the current pipeline but continues traversal. + +The mapping returned by `find_pipelines()` can be modified, meaning you are not limited to the pipelines returned by each of the `create_pipeline()` functions found above. For example, to add a data engineering pipeline that isn't part of the default pipeline, add it to the dictionary *after* constructing the default pipeline: + +```python +def register_pipelines() -> Dict[str, Pipeline]: + """Register the project's pipelines. + + Returns: + A mapping from pipeline names to ``Pipeline`` objects. + """ + pipelines = find_pipelines() + pipelines["__default__"] = sum(pipelines.values()) + pipelines["data_engineering"] = pipeline( + pipelines["data_processing"], namespace="data_engineering" + ) + return pipelines +``` + +On the other hand, adding the same pipeline *before* assigning `pipelines["__default__"] = sum(pipelines.values())` includes it in the default pipeline, so the data engineering pipeline will be run if `kedro run` is called without specifying a pipeline name: + +```python +def register_pipelines() -> Dict[str, Pipeline]: + """Register the project's pipelines. + + Returns: + A mapping from pipeline names to ``Pipeline`` objects. + """ + pipelines = find_pipelines() + pipelines["data_engineering"] = pipeline( + pipelines["data_processing"], namespace="data_engineering" + ) + pipelines["__default__"] = sum(pipelines.values()) + return pipelines +``` diff --git a/docs/source/06_nodes_and_pipelines/04_run_a_pipeline.md b/docs/source/nodes_and_pipelines/run_a_pipeline.md similarity index 66% rename from docs/source/06_nodes_and_pipelines/04_run_a_pipeline.md rename to docs/source/nodes_and_pipelines/run_a_pipeline.md index b16b3441f6..417510fe8e 100644 --- a/docs/source/06_nodes_and_pipelines/04_run_a_pipeline.md +++ b/docs/source/nodes_and_pipelines/run_a_pipeline.md @@ -1,9 +1,5 @@ # Run a pipeline -```eval_rst -.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request. -``` - ## Runners Runners are the execution mechanisms used to run pipelines. They all inherit from `AbstractRunner`. @@ -34,22 +30,11 @@ kedro run --runner=SequentialRunner #### Multiprocessing -You can alternatively run the nodes within the pipeline concurrently, using a `ParallelRunner`. To do so, add a flag as follows: - -```bash -kedro run --parallel -``` - -or - +You can alternatively run the nodes within the pipeline concurrently, using a `ParallelRunner` as follows: ```bash kedro run --runner=ParallelRunner ``` -```eval_rst -.. note:: You cannot use both ``--parallel`` and ``--runner`` flags at the same time. (That is, ``kedro run --parallel --runner=SequentialRunner`` raises an exception). 
-``` - #### Multithreading While `ParallelRunner` uses multiprocessing, you can also run the pipeline with multithreading for concurrent execution by specifying `ThreadRunner` as follows: @@ -57,13 +42,11 @@ While `ParallelRunner` uses multiprocessing, you can also run the pipeline with kedro run --runner=ThreadRunner ``` -```eval_rst -.. note:: ``SparkDataSet`` doesn't work correctly with ``ParallelRunner``. To add concurrency to the pipeline with ``SparkDataSet``, you must use ``ThreadRunner``. +```{note} +`SparkDataSet` doesn't work correctly with `ParallelRunner`. To add concurrency to the pipeline with `SparkDataSet`, you must use `ThreadRunner`. ``` -For more information on how to maximise concurrency when using Kedro with PySpark, please visit our guide on [how to build a Kedro pipeline with PySpark](../11_tools_integration/01_pyspark.md). - - +For more information on how to maximise concurrency when using Kedro with PySpark, please visit our guide on [how to build a Kedro pipeline with PySpark](../integrations/pyspark_integration.md). ## Custom runners @@ -73,15 +56,17 @@ If the built-in Kedro runners do not meet your requirements, you can also define Click to expand ```python -# in /src//runner.py +# in src//runner.py from kedro.io import AbstractDataSet, DataCatalog, MemoryDataSet from kedro.pipeline import Pipeline from kedro.runner.runner import AbstractRunner +from pluggy import PluginManager class DryRunner(AbstractRunner): """``DryRunner`` is an ``AbstractRunner`` implementation. It can be used to list which - nodes would be run without actually executing anything. + nodes would be run without actually executing anything. It also checks if all the + neccessary data exists. """ def create_default_data_set(self, ds_name: str) -> AbstractDataSet: @@ -96,7 +81,13 @@ class DryRunner(AbstractRunner): """ return MemoryDataSet() - def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None: + def _run( + self, + pipeline: Pipeline, + catalog: DataCatalog, + hook_manager: PluginManager = None, + session_id: str = None, + ) -> None: """The method implementing dry pipeline running. Example logs output using this implementation: @@ -108,27 +99,37 @@ class DryRunner(AbstractRunner): Args: pipeline: The ``Pipeline`` to run. catalog: The ``DataCatalog`` from which to fetch data. + session_id: The id of the session. """ nodes = pipeline.nodes self._logger.info( "Actual run would execute %d nodes:\n%s", len(nodes), - "\n".join(map(str, nodes)), + pipeline.describe(), ) + self._logger.info("Checking inputs...") + input_names = pipeline.inputs() + missing_inputs = [ + input_name + for input_name in input_names + if not catalog._get_dataset(input_name).exists() + ] + if missing_inputs: + raise KeyError(f"Datasets {missing_inputs} not found.") ```
    And use it with `kedro run` through the `--runner` flag: ```console -$ kedro run --runner=.runner.DryRunner +$ kedro run --runner=.runner.DryRunner ``` ## Load and save asynchronously -```eval_rst -.. note:: ``ThreadRunner`` doesn't support asynchronous load-input or save-output operations. +```{note} +`ThreadRunner` doesn't support asynchronous load-input or save-output operations. ``` When processing a node, both `SequentialRunner` and `ParallelRunner` perform the following steps in order: @@ -147,13 +148,13 @@ $ kedro run --async ... ``` -```eval_rst -.. note:: All the datasets used in the run have to be `thread-safe `_ in order for asynchronous loading/saving to work properly. +```{note} +All the datasets used in the run have to be [thread-safe](https://www.quora.com/What-is-thread-safety-in-Python) in order for asynchronous loading/saving to work properly. ``` ## Run a pipeline by name -To run the pipeline by its name, you need to add your new pipeline to `register_pipelines()` function `src//pipeline_registry.py` as below: +To run the pipeline by its name, you need to add your new pipeline to the `register_pipelines()` function in `src//pipeline_registry.py`:
    Click to expand @@ -163,37 +164,31 @@ def register_pipelines(): """Register the project's pipelines. Returns: - A mapping from a pipeline name to a ``Pipeline`` object. - + A mapping from pipeline names to ``Pipeline`` objects. """ - - data_engineering_pipeline = de.create_pipeline() - data_science_pipeline = ds.create_pipeline() - my_pipeline = Pipeline( + pipelines = find_pipelines() + pipelines["__default__"] = sum(pipelines.values()) + my_pipeline = pipeline( [ # your definition goes here ] ) - - return { - "de": data_engineering_pipeline, - "my_pipeline": my_pipeline, - "__default__": data_engineering_pipeline + data_science_pipeline, - } + pipelines["my_pipeline"] = my_pipeline + return pipelines ```
    -Then from the command line, execute the following: +Then, from the command line, execute the following: ```bash -kedro run --pipeline my_pipeline +kedro run --pipeline=my_pipeline ``` -```eval_rst -.. note:: If you specify ``kedro run`` without the ``--pipeline`` option, it runs the ``__default__`` pipeline from the dictionary returned by ``register_pipelines()``. +```{note} +If you specify `kedro run` without the `--pipeline` option, it runs the `__default__` pipeline from the dictionary returned by `register_pipelines()`. ``` -Further information about `kedro run` can be found in the [Kedro CLI documentation](../09_development/03_commands_reference.md#run-the-project). +Further information about `kedro run` can be found in the [Kedro CLI documentation](../development/commands_reference.md#run-the-project). ## Run pipelines with IO @@ -326,3 +321,41 @@ except FileNotFoundError: pass ```
+
+
+## Configure `kedro run` arguments
+
+The [Kedro CLI documentation](../development/commands_reference.md#run-the-project) lists the available CLI options for `kedro run`. You can alternatively supply a configuration file that contains the arguments to `kedro run`, for example a file named `config.yml` (you can choose any name for the file):
+
+```console
+$ kedro run --config=config.yml
+```
+
+where `config.yml` is formatted as below (for example):
+
+```yaml
+run:
+  tags: tag1, tag2, tag3
+  pipeline: pipeline1
+  runner: ParallelRunner
+  node_names: node1, node2
+  env: env1
+```
+
+The syntax for the options is different when you're using the CLI compared to the configuration file. In the CLI you use dashes, for example `kedro run --from-nodes=...`, but you have to use an underscore in the configuration file:
+
+```yaml
+run:
+  from_nodes: ...
+```
+
+This is because the configuration file gets parsed by [Click](https://click.palletsprojects.com/en/8.1.x/), a Python package for building command-line interfaces. Click passes the options defined in the configuration file to a Python function, and the option names need to match the argument names in that function.
+
+Variable names and arguments in Python may only contain alphanumeric characters and underscores, so it's not possible to have a dash in the option names when using the configuration file.
+
+```{note}
+If you provide both a configuration file and a CLI option that clashes with the configuration file, the CLI option will take precedence.
+```
diff --git a/docs/source/06_nodes_and_pipelines/05_slice_a_pipeline.md b/docs/source/nodes_and_pipelines/slice_a_pipeline.md
similarity index 81%
rename from docs/source/06_nodes_and_pipelines/05_slice_a_pipeline.md
rename to docs/source/nodes_and_pipelines/slice_a_pipeline.md
index 95044ddaeb..f4f4bccf0d 100644
--- a/docs/source/06_nodes_and_pipelines/05_slice_a_pipeline.md
+++ b/docs/source/nodes_and_pipelines/slice_a_pipeline.md
@@ -1,12 +1,8 @@
 # Slice a pipeline
-```eval_rst
-.. note:: This documentation is based on ``Kedro 0.17.1``. If you spot anything that is incorrect then please create an `issue `_ or pull request.
-```
-
-Sometimes it is desirable to run a subset, or a 'slice' of a pipeline's nodes. In this page, we illustrate the programmatic options that Kedro provides. You can also use the [Kedro CLI to pass parameters to `kedro run`](../09_development/03_commands_reference.md#run-the-project) command and slice a pipeline.
+Sometimes it is desirable to run a subset, or a 'slice' of a pipeline's nodes. In this page, we illustrate the programmatic options that Kedro provides. You can also use the [Kedro CLI to pass parameters to `kedro run`](../development/commands_reference.md#run-the-project) command and slice a pipeline.
 
-Let's look again at the example pipeline from the [pipeline introduction documentation](./02_pipeline_introduction.md#how-to-build-a-pipeline), which computes the variance of a set of numbers:
+Let's look again at the example pipeline from the [pipeline introduction documentation](./pipeline_introduction.md#how-to-build-a-pipeline), which computes the variance of a set of numbers:
    Click to expand @@ -18,14 +14,14 @@ def mean(xs, n): def mean_sos(xs, n): - return sum(x ** 2 for x in xs) / n + return sum(x**2 for x in xs) / n def variance(m, m2): return m2 - m * m -pipeline = Pipeline( +full_pipeline = pipeline( [ node(len, "xs", "n"), node(mean, ["xs", "n"], "m", name="mean_node", tags="mean"), @@ -36,7 +32,7 @@ pipeline = Pipeline( ```
    -The `pipeline.describe()` method returns the following output: +The `Pipeline.describe()` method returns the following output:
    Click to expand @@ -67,7 +63,7 @@ One way to slice a pipeline is to provide a set of pre-calculated inputs which s ```python -print(pipeline.from_inputs("m2").describe()) +print(full_pipeline.from_inputs("m2").describe()) ``` `Output`: @@ -90,7 +86,7 @@ Slicing the pipeline from inputs `m` and `xs` results in the following pipeline: Click to expand ```python -print(pipeline.from_inputs("m", "xs").describe()) +print(full_pipeline.from_inputs("m", "xs").describe()) ``` `Output`: @@ -119,7 +115,7 @@ Another way of slicing a pipeline is to specify the nodes which should be used a Click to expand ```python -print(pipeline.from_nodes("mean_node").describe()) +print(full_pipeline.from_nodes("mean_node").describe()) ``` `Output`: @@ -142,7 +138,7 @@ As you can see, this will slice the pipeline and run it from the specified node You can run the resulting pipeline slice by running the following command in your terminal window: ```bash -kedro run --from-nodes="mean_node" +kedro run --from-nodes=mean_node ``` ## Slice a pipeline by specifying final nodes @@ -153,7 +149,7 @@ Similarly, you can specify the nodes which should be used to end a pipeline. For ```python -print(pipeline.to_nodes("mean_node").describe()) +print(full_pipeline.to_nodes("mean_node").describe()) ``` `Output`: @@ -174,19 +170,19 @@ Outputs: m As you can see, this will slice the pipeline, so it runs from the beginning and ends with the specified node: ```bash -kedro run --to-nodes="mean_node" +kedro run --to-nodes=mean_node ``` You can also slice a pipeline by specifying the start and finish nodes, and thus the set of nodes to be included in the pipeline slice: ```bash -kedro run --from-nodes A --to-nodes Z +kedro run --from-nodes=A --to-nodes=Z ``` or, when specifying multiple nodes: ```bash -kedro run --from-nodes A,D --to-nodes X,Y,Z +kedro run --from-nodes=A,D --to-nodes=X,Y,Z ``` ## Slice a pipeline with tagged nodes @@ -196,7 +192,7 @@ You can also slice a pipeline from the nodes that have specific tags attached to Click to expand ```python -print(pipeline.only_nodes_with_tags("mean", "variance").describe()) +print(full_pipeline.only_nodes_with_tags("mean", "variance").describe()) ``` `Output`: @@ -220,9 +216,9 @@ To slice a pipeline from nodes that have tag `mean` *OR* tag `variance`: ```python -sliced_pipeline = pipeline.only_nodes_with_tags("mean") + pipeline.only_nodes_with_tags( - "variance" -) +sliced_pipeline = full_pipeline.only_nodes_with_tags( + "mean" +) + full_pipeline.only_nodes_with_tags("variance") print(sliced_pipeline.describe()) ``` @@ -248,7 +244,7 @@ Sometimes you might need to run only some of the nodes in a pipeline, as follows Click to expand ```python -print(pipeline.only_nodes("mean_node", "mean_sos").describe()) +print(full_pipeline.only_nodes("mean_node", "mean_sos").describe()) ``` `Output`: @@ -268,8 +264,8 @@ Outputs: m, m2 This will create a sliced pipeline, comprised of the nodes you specify in the method call. -```eval_rst -.. note:: All the inputs required by the specified nodes must exist, i.e. already produced or present in the data catalog. +```{note} +All the inputs required by the specified nodes must exist, i.e. already produced or present in the data catalog. ``` ## How to recreate missing outputs @@ -280,7 +276,7 @@ Kedro can automatically generate a sliced pipeline from existing node outputs. 
T Click to expand ```python -print(pipeline.describe()) +print(full_pipeline.describe()) ``` `Output`: @@ -306,7 +302,7 @@ To demonstrate this, let us save the intermediate output `n` using a `JSONDataSe Click to expand ```python -from kedro.extras.datasets.pandas import JSONDataSet +from kedro_datasets.pandas import JSONDataSet from kedro.io import DataCatalog, MemoryDataSet n_json = JSONDataSet(filepath="./data/07_model_output/len.json") @@ -337,7 +333,7 @@ Running the pipeline calculates `n` and saves the result to disk: Click to expand ```python -SequentialRunner().run(pipeline, io) +SequentialRunner().run(full_pipeline, io) ``` `Output`: @@ -363,7 +359,7 @@ We can avoid re-calculating `n` (and all other results that have already been sa Click to expand ```python -SequentialRunner().run_only_missing(pipeline, io) +SequentialRunner().run_only_missing(full_pipeline, io) ``` `Ouput`: diff --git a/docs/source/notebooks_and_ipython/index.md b/docs/source/notebooks_and_ipython/index.md new file mode 100644 index 0000000000..ae6516c055 --- /dev/null +++ b/docs/source/notebooks_and_ipython/index.md @@ -0,0 +1,12 @@ +# Kedro for notebook users + + +You can take advantage of a notebook's liberal development environment for exploratory data analysis and experimentation from within a Kedro project. Later, when you need to follow software best practices as the project complexity increases, or as you scale into production, you can transfer code from the notebook into Kedro to benefit from its opinionated project framework. + + +```{toctree} +:maxdepth: 1 + +kedro_and_notebooks +kedro_as_a_data_registry +``` diff --git a/docs/source/notebooks_and_ipython/kedro_and_notebooks.md b/docs/source/notebooks_and_ipython/kedro_and_notebooks.md new file mode 100644 index 0000000000..d32139b2f8 --- /dev/null +++ b/docs/source/notebooks_and_ipython/kedro_and_notebooks.md @@ -0,0 +1,286 @@ +# Kedro and Jupyter Notebooks + +This page explains how best to combine Kedro and Jupyter Notebook development and illustrates with an example Notebook that has access to the `catalog`, `context`, `pipelines` and `session` variables for a Kedro project. + +## A custom Kedro kernel + +Kedro offers a command (`kedro jupyter notebook`) to create a Jupyter kernel named `kedro_` that is almost identical to the [default IPython kernel](https://ipython.readthedocs.io/en/stable/install/kernel_install.html) but with a slightly customised [kernel specification](https://jupyter-client.readthedocs.io/en/stable/kernels.html#kernel-specs). 
+ +The custom kernel automatically loads `kedro.ipython`, which is an [IPython extension](https://ipython.readthedocs.io/en/stable/config/extensions/) that launches a [Kedro session](../kedro_project_setup/session.md) and makes the following Kedro variables available: + +* `catalog` (type `DataCatalog`): [Data Catalog](../data/data_catalog.md) instance that contains all defined datasets; this is a shortcut for `context.catalog` +* `context` (type `KedroContext`): Kedro project context that provides access to Kedro's library components +* `pipelines` (type `Dict[str, Pipeline]`): Pipelines defined in your [pipeline registry](../nodes_and_pipelines/run_a_pipeline.md#run-a-pipeline-by-name) +* `session` (type `KedroSession`): [Kedro session](../kedro_project_setup/session.md) that orchestrates a pipeline run + + +## Iris dataset example + +Create a sample Kedro project with the [`pandas-iris` starter](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris) as we showed in the [get started documentation](../get_started/new_project.md#create-a-new-project-containing-example-code): + +```bash +kedro new --starter=pandas-iris +``` + +We will assume you call the project `iris`, but you can call it whatever you choose. + +Navigate to the project directory and issue the following command in the terminal to launch Jupyter: + +```bash +kedro jupyter notebook +``` + +Your browser window will open, and you can then create a new Jupyter Notebook using the dropdown and selecting the `Kedro ()` kernel. + +![Create a new Jupyter Notebook with Kedro (iris) kernel](../meta/images/jupyter_new_notebook.png) + +We recommend that you store your Notebooks in the `notebooks` folder of your Kedro project. + +We will now give some examples of how to work with the Kedro variables. To explore the full range of attributes and methods available, you might like to consult the relevant [API documentation](/kedro) or use the [Python `dir` function](https://docs.python.org/3/library/functions.html#dir) (e.g. `dir(catalog)`). + +``` {note} +If the Kedro variables are not available within your Jupyter Notebook, you could have a malformed configuration file or missing dependencies. The full error message is shown on the terminal used to launch `kedro jupyter notebook`. +``` + +### `catalog` + +`catalog` can be used to explore your [Data Catalog](../data/data_catalog.md), including parameters. Useful methods include `catalog.list`, `catalog.load` and `catalog.save`. For example, add the following to a cell in your Notebook: + +```ipython +catalog.list() +``` + +When you run the cell: + +```ipython +['example_iris_data', + 'parameters', + 'params:example_test_data_ratio', + 'params:example_num_train_iter', + 'params:example_learning_rate' +] +``` +Next try the following: + +```ipython +catalog.load("example_iris_data") +``` + +The output: + +```ipython +INFO Loading data from 'example_iris_data' (CSVDataSet)... + + sepal_length sepal_width petal_length petal_width species +0 5.1 3.5 1.4 0.2 setosa +1 4.9 3.0 1.4 0.2 setosa +2 4.7 3.2 1.3 0.2 setosa +3 4.6 3.1 1.5 0.2 setosa +4 5.0 3.6 1.4 0.2 setosa +.. ... ... ... ... ... +145 6.7 3.0 5.2 2.3 virginica +146 6.3 2.5 5.0 1.9 virginica +147 6.5 3.0 5.2 2.0 virginica +148 6.2 3.4 5.4 2.3 virginica +149 5.9 3.0 5.1 1.8 virginica +``` + +Finally, try the following: + +```ipython +catalog.load("parameters") +``` +You should see the following: + +```ipython +INFO Loading data from 'parameters' (MemoryDataSet)... 
+ +{'example_test_data_ratio': 0.2, + 'example_num_train_iter': 10000, + 'example_learning_rate': 0.01} +``` + +```{note} +If you enable [versioning](../data/data_catalog.md#version-datasets-and-ml-models) you can load a particular version of a dataset, e.g. `catalog.load("example_train_x", version="2021-12-13T15.08.09.255Z")`. +``` + +### `context` + +`context` enables you to access Kedro's library components and project metadata. For example, if you add the following to a cell and run it: + +```ipython +context.project_path +``` +You should see output similar to the following, according to your username and path: + +```ipython +PosixPath('/Users/username/kedro_projects/iris') +``` + +You can find out more about the `context` in the [API documentation](/kedro.framework.context.KedroContext). + +### `pipelines` + +`pipelines` is a dictionary containing your project's [registered pipelines](../nodes_and_pipelines/run_a_pipeline.md#run-a-pipeline-by-name): + +```ipython +pipelines +``` + +The output will be a listing as follows: + +```ipython +{'__default__': Pipeline([ +Node(split_data, ['example_iris_data', 'parameters'], ['X_train', 'X_test', 'y_train', 'y_test'], 'split'), +Node(make_predictions, ['X_train', 'X_test', 'y_train'], 'y_pred', 'make_predictions'), +Node(report_accuracy, ['y_pred', 'y_test'], None, 'report_accuracy') +])} +``` + +You can use this to explore your pipelines and the nodes they contain: + +```ipython +pipelines["__default__"].all_outputs() +``` +Should give the output: + +```ipython +{'y_pred', 'X_test', 'y_train', 'X_train', 'y_test'} +``` + +### `session` + +`session.run` allows you to run a pipeline. With no arguments, this will run your `__default__` project pipeline sequentially, much as a call to `kedro run` from the terminal: + +```ipython +session.run() +``` + +```{note} +You can only execute one *successful* run per session, as there's a one-to-one mapping between a session and a run. If you wish to do multiple runs, you'll have to run `%reload_kedro` to obtain a new `session` (see below). +``` + +You can also specify the following optional arguments for `session.run`: + +| Argument name | Accepted types | Description | +| --------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tags` | `Iterable[str]` | Construct the pipeline using only nodes which have this tag attached. A node is included in the resulting pipeline if it contains any of those tags | +| `runner` | `AbstractRunner` | An instance of Kedro [AbstractRunner](/kedro.runner.AbstractRunner). Can be an instance of a [ParallelRunner](/kedro.runner.ParallelRunner) | +| `node_names` | `Iterable[str]` | Run only nodes with specified names | +| `from_nodes` | `Iterable[str]` | A list of node names which should be used as a starting point | +| `to_nodes` | `Iterable[str]` | A list of node names which should be used as an end point | +| `from_inputs` | `Iterable[str]` | A list of dataset names which should be used as a starting point | +| `to_outputs` | `Iterable[str]` | A list of dataset names which should be used as an end point | +| `load_versions` | `Dict[str, str]` | A mapping of a dataset name to a specific dataset version (timestamp) for loading. Applies to versioned datasets only | +| `pipeline_name` | `str` | Name of the modular pipeline to run. 
Must be one of those returned by the `register_pipelines` function in `src//pipeline_registry.py` | + +## `%reload_kedro` line magic + +You can use `%reload_kedro` [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) within your Jupyter Notebook to reload the Kedro variables (for example, if you need to update `catalog` following changes to your Data Catalog). + +You don't need to restart the kernel to reload the Kedro IPython extension and refresh the `catalog`, `context`, `pipelines` and `session` variables. + +`%reload_kedro` accepts optional keyword arguments `env` and `params`. For example, to use configuration environment `prod`: + +```ipython +%reload_kedro --env=prod +``` + +For more details, run `%reload_kedro?`. + +## `%run_viz` line magic + +If you have [Kedro-Viz](https://github.com/kedro-org/kedro-viz) installed for the project you can display an interactive visualisation of your pipeline directly in your Notebook using the [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) `%run_viz`. + + +## Convert functions from Jupyter Notebooks into Kedro nodes + +If you are writing experimental code in your Notebook and later want to convert functions you've written to Kedro nodes, you can do this using tags. + +Say you have the following code in your Notebook: + +```ipython +def some_action(): + print("This function came from `notebooks/my_notebook.ipynb`") +``` + +1. Enable tags toolbar: `View` menu -> `Cell Toolbar` -> `Tags` +![Enable the tags toolbar graphic](../meta/images/jupyter_notebook_workflow_activating_tags.png) + +2. Add the `node` tag to the cell containing your function +![Add the node tag graphic](../meta/images/jupyter_notebook_workflow_tagging_nodes.png) + +```{note} +The Notebook can contain multiple functions tagged as `node`, each of them will be exported into the resulting Python file +``` + +3. Save your Jupyter Notebook to `notebooks/my_notebook.ipynb` +4. From your terminal, run `kedro jupyter convert notebooks/my_notebook.ipynb` from the Kedro project directory. The output is a Python file `src//nodes/my_notebook.py` containing the `some_action` function definition +5. The `some_action` function can now be used in your Kedro pipelines + +## Useful to know... +Each Kedro project has its own Jupyter kernel so you can switch between multiple Kedro projects from a single Jupyter instance simply by selecting the appropriate kernel. + +If a Jupyter kernel with the name `kedro_` already exists then it is replaced. This ensures that the kernel always points to the correct Python executable. For example, if you change conda environment in a Kedro project then you should re-run `kedro jupyter notebook` to replace the kernel specification with one that points to the new environment. + +You can use the `jupyter kernelspec` set of commands to manage your Jupyter kernels. For example, to remove a kernel, run `jupyter kernelspec remove `. + +### Managed services + +If you work within a managed Jupyter service such as a Databricks Notebook you may be unable to execute `kedro jupyter notebook`. 
You can explicitly load the Kedro IPython extension with the `%load_ext` line magic: + +```ipython +In [1]: %load_ext kedro.ipython +``` + +If you launch your Jupyter instance from outside your Kedro project, you will need to run a second line magic to set the project path so that Kedro can load the `catalog`, `context`, `pipelines` and `session` variables: + +```ipython +In [2]: %reload_kedro +``` +The Kedro IPython extension remembers the project path so that subsequent calls to `%reload_kedro` do not need to specify it: + +```ipython +In [1]: %load_ext kedro.ipython +In [2]: %reload_kedro +In [3]: %reload_kedro +``` + +### IPython, JupyterLab and other Jupyter clients + +You can also connect an IPython shell to a Kedro project kernel as follows: + +```bash +kedro ipython +``` + +The command launches an IPython shell with the extension already loaded and is equivalent to the command `ipython --ext kedro.ipython`. You first saw this in action in the [spaceflights tutorial](../tutorial/set_up_data.md#test-that-kedro-can-load-the-data). + + +Similarly, the following creates a custom Jupyter kernel that automatically loads the extension and launches JupyterLab with this kernel selected: + +```bash +kedro jupyter lab +``` + +You can use any other Jupyter client to connect to a Kedro project kernel such as the [Qt Console](https://qtconsole.readthedocs.io/), which can be launched using the `kedro_iris` kernel as follows: + +```bash +jupyter qtconsole --kernel=kedro_iris +``` + +This will automatically load the Kedro IPython in a console that supports graphical features such as embedded figures: +![Plot of example iris data in a Qt Console](../meta/images/jupyter_qtconsole.png) + + +## Find out more + +We recommend the following: + +* [Power is nothing without control: Don’t break up with Jupyter Notebooks. Just use Kedro too!](https://towardsdatascience.com/power-is-nothing-without-control-aa43523745b6) + +* [Two Tricks to Optimize your Kedro Jupyter Flow](https://youtu.be/ZHIqXJEp0-w) + +* [Handling Custom Jupyter Data Sources](https://youtu.be/dRnCovp1GRQ) + +* [Why transition from vanilla Jupyter Notebooks to Kedro?](https://www.youtube.com/watch?v=JLTYNPoK7nw&ab_channel=PyConUS) diff --git a/docs/source/notebooks_and_ipython/kedro_as_a_data_registry.md b/docs/source/notebooks_and_ipython/kedro_as_a_data_registry.md new file mode 100644 index 0000000000..ac53c9e1ec --- /dev/null +++ b/docs/source/notebooks_and_ipython/kedro_as_a_data_registry.md @@ -0,0 +1,45 @@ +# Kedro as a data registry + +In some projects you may want to share a Jupyter Notebook with others so you need to avoid using hard-coded file paths for data access. + +One solution is to set up a lightweight Kedro project that uses the Kedro [`DataCatalog`](../data/data_catalog.md) as a registry for the data, without using any of the other features of Kedro. + +The Kedro starter with alias `standalone-datacatalog` (formerly known as `mini-kedro`) provides this kind of minimal functionality. + +## Usage + +Use the [`standalone-datacatalog` starter](https://github.com/kedro-org/kedro-starters/tree/main/standalone-datacatalog) to create a new project: + +```bash +kedro new --starter=standalone-datacatalog +``` + +The starter comprises a minimal setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../data/data_catalog.md). 
+ +The starter contains: + +* A `conf` directory, which contains an example `DataCatalog` configuration (`catalog.yml`): + + ```yaml +# conf/base/catalog.yml +example_dataset_1: + type: pandas.CSVDataSet + filepath: folder/filepath.csv + +example_dataset_2: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/example_dataset_2* + credentials: dev_s3 + file_format: csv + save_args: + if_exists: replace +``` + +* A `data` directory, which contains an example dataset identical to the one used by the [`pandas-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris) starter + +* An example Jupyter Notebook, which shows how to instantiate the `DataCatalog` and interact with the example dataset: + +```python +df = catalog.load("example_dataset_1") +df_2 = catalog.save("example_dataset_2") +``` diff --git a/docs/source/puppeteer-config.json b/docs/source/puppeteer-config.json new file mode 100644 index 0000000000..5adc866e94 --- /dev/null +++ b/docs/source/puppeteer-config.json @@ -0,0 +1,4 @@ +{ + "args": ["--no-sandbox"], + "headless": "old" +} diff --git a/docs/source/13_resources/02_glossary.md b/docs/source/resources/glossary.md similarity index 70% rename from docs/source/13_resources/02_glossary.md rename to docs/source/resources/glossary.md index 84fcbcc7b8..55f841c8e7 100644 --- a/docs/source/13_resources/02_glossary.md +++ b/docs/source/resources/glossary.md @@ -4,7 +4,7 @@ ## Data Catalog The Data Catalog is Kedro's registry of all data sources available for use in the data pipeline. It manages loading and saving of data. The Data Catalog maps the names of node inputs and outputs as keys in a Kedro `DataSet`, which can be specialised for different types of data storage. -[Further information about the Data Catalog](../05_data/01_data_catalog) +[Further information about the Data Catalog](../data/data_catalog.md) ## Data engineering vs Data science Data engineering is the process of wrangling data into a clean and reliable state. Data wrangling is about taking a messy or unrefined source of data and turning it into something useful by parsing and cleaning it. @@ -14,7 +14,7 @@ Data science extracts insights from data by using a combination of domain expert ## Kedro Kedro is an open-source Python framework for creating reproducible, maintainable and modular data science code. It applies software engineering best-practices to machine learning code, including modularity, separation of concerns and versioning. -[Introduction to Kedro](../01_introduction/01_introduction) +[Introduction to Kedro](../introduction/introduction.md) ## `KedroContext` A Python class that holds the configuration and Kedro’s main functionality. @@ -24,7 +24,7 @@ API documentation for [`KedroContext`](/kedro.framework.context.KedroContext) ## `KedroSession` A KedroSession allows you to manage the lifecycle of a Kedro run, persist runtime parameters and trace back runtime parameters, such as CLI command flags and environment variables. 
-[Further information about `KedroSession`](../04_kedro_project_setup/03_session) +[Further information about `KedroSession`](../kedro_project_setup/session.md) ## Kedro-Viz You can use Kedro-Viz to visualise your Kedro data pipelines: @@ -33,24 +33,28 @@ You can use Kedro-Viz to visualise your Kedro data pipelines: * Get a clear picture when you have lots of datasets and nodes by using tags to visualise sub-pipelines * Search for nodes and datasets -[Further information from the Kedro-Viz repository](https://github.com/quantumblacklabs/kedro-viz) and [Kedro tutorial documentation](../03_tutorial/06_visualise_pipeline) +[Further information from the Kedro-Viz repository](https://github.com/kedro-org/kedro-viz) and [Kedro-Viz documentation](../visualisation/kedro-viz_visualisation.md) ## Layers (data engineering convention) -According to [data engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention), a pipeline can be broken up into different layers according to how data is processed. This convention makes it easier to collaborate with other team members because everyone has an idea of what type of data cleaning or processing has happened. +According to [common data engineering convention](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71), a pipeline can be broken up into different layers according to how data is processed. This convention makes it easier to collaborate with other team members because everyone has an idea of what type of data cleaning or processing has happened. -Kedro-Viz makes it easy to [visualise these data processing stages](../03_tutorial/06_visualise_pipeline.md#visualise-layers) by adding a `layer` attribute to the datasets in the Data Catalog. +Kedro-Viz makes it easy to [visualise these data processing stages](../visualisation/kedro-viz_visualisation.md#visualise-layers) by adding a `layer` attribute to the `kedro-viz` section within the `metadata` of the datasets in the Data Catalog. ## Modular pipeline _(See also [Pipeline](#pipeline))_ In many typical Kedro projects, a single (“main”) pipeline increases in complexity as the project evolves. To keep your project fit for purpose, you can create modular pipelines, which are logically isolated and can be reused. Modular pipelines are easier to develop, test and maintain, and are portable so they can be copied and reused between projects. -[Further information about modular pipelines](../06_nodes_and_pipelines/03_modular_pipelines) +[Further information about modular pipelines](../nodes_and_pipelines/modular_pipelines.md) ## Node -A Kedro node is a wrapper for a Python function that names the inputs and outputs of that function. It is the building block of a pipeline. Nodes can be linked when the output of one node is the input of another. +A Kedro node is a wrapper for a pure Python function that names the inputs and outputs of that function. -[Further information about nodes](../06_nodes_and_pipelines/01_nodes) +(A [pure function](https://realpython.com/python-functional-programming/#what-is-functional-programming) is a one whose output value follows solely from its input values, without any observable side effects such as changes to state or mutable data). + +Nodes are the building block of a pipeline. Nodes can be linked when the output of one node is the input of another. 
+ +[Further information about nodes](../nodes_and_pipelines/nodes.md) ## Node execution order The node execution order is determined by resolving the input and output data dependencies between the nodes. The pipeline determines the node execution order and does not necessarily run the nodes in the order in which they are passed in. @@ -58,7 +62,7 @@ The node execution order is determined by resolving the input and output data de ## Pipeline A Kedro pipeline organises the dependencies and execution order of a collection of nodes, and connects inputs and outputs. The pipeline determines the node execution order by resolving dependencies. -[Further information about pipelines](../06_nodes_and_pipelines/02_pipeline_introduction) +[Further information about pipelines](../nodes_and_pipelines/pipeline_introduction.md) **_Chonky pipeline_**: _Chonky is generally used to describe animals that are plump, rounded or simply heavier than average. A chonky pipeline is, likewise, a pipeline that is more bulky than usual._ @@ -71,7 +75,7 @@ This is when you run a subset, or a ‘slice’ of a pipeline’s nodes. You can * by tagging certain nodes (`pipeline.only_nodes_with_tags`) * by specifying certain nodes (`pipeline.only_nodes`) -[Further information about pipeline slicing](../06_nodes_and_pipelines/05_slice_a_pipeline) +[Further information about pipeline slicing](../nodes_and_pipelines/slice_a_pipeline.md) ## Runner Runners are different execution mechanisms to run pipelines with the specified data catalog. @@ -80,12 +84,12 @@ Runners are different execution mechanisms to run pipelines with the specified d * The parallel runner allows for concurrency by use of multiprocessing * The thread runner uses threading for concurrent execution -[Further information about runners](../06_nodes_and_pipelines/04_run_a_pipeline) +[Further information about runners](../nodes_and_pipelines/run_a_pipeline.md) ## Starters Kedro starters are used to create projects that contain code to run as-is, or to adapt and extend. They provide pre-defined example code and configuration that can be reused. A Kedro starter is a [Cookiecutter template](https://cookiecutter.readthedocs.io/) that contains the boilerplate code for a Kedro project. -[Further information about Kedro starters](../02_get_started/06_starters) +[Further information about Kedro starters](../kedro_project_setup/starters.md) ## Tags You can apply tags to nodes or pipelines as a means of filtering which are executed. 
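To make the `Pipeline slicing` and `Tags` entries above concrete, here is a minimal, self-contained sketch; the function and dataset names are made up for illustration only:

```python
# Minimal sketch: tag a node, then slice the pipeline down to the tagged nodes.
# `clean`, `raw_data` and `clean_data` are made-up names for illustration only.
from kedro.pipeline import node, pipeline


def clean(raw_data):
    return raw_data


full_pipeline = pipeline(
    [
        node(clean, inputs="raw_data", outputs="clean_data", name="clean_node", tags=["preprocessing"]),
    ]
)

# Keep only the nodes that carry the "preprocessing" tag
sliced = full_pipeline.only_nodes_with_tags("preprocessing")
print(sliced.describe())
```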
diff --git a/docs/source/resources/index.md b/docs/source/resources/index.md new file mode 100644 index 0000000000..72493f112e --- /dev/null +++ b/docs/source/resources/index.md @@ -0,0 +1,9 @@ +# Resources + +```{toctree} +:maxdepth: 1 + +../faq/faq +glossary + +``` diff --git a/docs/source/robots.txt b/docs/source/robots.txt new file mode 100644 index 0000000000..9bd9ee90da --- /dev/null +++ b/docs/source/robots.txt @@ -0,0 +1,5 @@ +User-agent: * +Disallow: * +Allow: /en/stable +Allow: /en/latest +Allow: /en/0.18.* diff --git a/docs/source/tutorial/add_another_pipeline.md b/docs/source/tutorial/add_another_pipeline.md new file mode 100644 index 0000000000..3e4c0089e2 --- /dev/null +++ b/docs/source/tutorial/add_another_pipeline.md @@ -0,0 +1,523 @@ +# Create a data science pipeline + +This section explains the following: + +* How to add a second Kedro pipeline for data science code that extends the default project pipeline +* How to 'slice' the project to run just part of the entire pipeline +* (Optional) How to make a [modular pipeline](../nodes_and_pipelines/modular_pipelines.md) +* (Optional) How to specify the way the pipeline nodes are run: sequentially or in parallel + + +## Data science nodes + +The data science pipeline uses the [`LinearRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html) implementation from the [scikit-learn](https://scikit-learn.org/stable/) library. + +The data science pipeline is made up of the following: + +* Two python files within `src/spaceflights/pipelines/data_science` + * `nodes.py` (for the node functions that form the data processing) + * `pipeline.py` (to build the pipeline) +* A yaml file: `conf/base/parameters/data_science.yml` to define the parameters used when running the pipeline +* `__init__.py` files in the required folders to ensure that Python can import the pipeline + + +First, take a look at the functions for the data science nodes in `src/spaceflights/pipelines/data_science/nodes.py`: + + +
    +Click to expand + +```python +import logging +from typing import Dict, Tuple + +import pandas as pd +from sklearn.linear_model import LinearRegression +from sklearn.metrics import r2_score +from sklearn.model_selection import train_test_split + + +def split_data(data: pd.DataFrame, parameters: Dict) -> Tuple: + """Splits data into features and targets training and test sets. + + Args: + data: Data containing features and target. + parameters: Parameters defined in parameters/data_science.yml. + Returns: + Split data. + """ + X = data[parameters["features"]] + y = data["price"] + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=parameters["test_size"], random_state=parameters["random_state"] + ) + return X_train, X_test, y_train, y_test + + +def train_model(X_train: pd.DataFrame, y_train: pd.Series) -> LinearRegression: + """Trains the linear regression model. + + Args: + X_train: Training data of independent features. + y_train: Training data for price. + + Returns: + Trained model. + """ + regressor = LinearRegression() + regressor.fit(X_train, y_train) + return regressor + + +def evaluate_model( + regressor: LinearRegression, X_test: pd.DataFrame, y_test: pd.Series +): + """Calculates and logs the coefficient of determination. + + Args: + regressor: Trained model. + X_test: Testing data of independent features. + y_test: Testing data for price. + """ + y_pred = regressor.predict(X_test) + score = r2_score(y_test, y_pred) + logger = logging.getLogger(__name__) + logger.info("Model has a coefficient R^2 of %.3f on test data.", score) +``` + +
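If you want to try these functions outside of a pipeline run, for example while prototyping in a `kedro ipython` session, the following sketch calls them directly on a small synthetic DataFrame. It assumes the default package name `spaceflights` and that the project package is importable; the column values are made up:

```python
# Minimal sketch: exercise the node functions directly on synthetic data.
# Assumes the default package name `spaceflights`; the data below is made up.
import numpy as np
import pandas as pd

from spaceflights.pipelines.data_science.nodes import evaluate_model, split_data, train_model

rng = np.random.default_rng(seed=3)
data = pd.DataFrame(
    {
        "engines": rng.integers(1, 4, size=100),
        "passenger_capacity": rng.integers(2, 20, size=100),
    }
)
data["price"] = 1000 * data["engines"] + 50 * data["passenger_capacity"]

parameters = {"test_size": 0.2, "random_state": 3, "features": ["engines", "passenger_capacity"]}

X_train, X_test, y_train, y_test = split_data(data, parameters)
regressor = train_model(X_train, y_train)
evaluate_model(regressor, X_test, y_test)  # logs the coefficient of determination
```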
    + +## Input parameter configuration + +Parameters that the pipeline uses when it executes are stored in `conf/base/parameters/data_science.yml`; Kedro makes them available to nodes through the `DataCatalog` as `params:` entries: +
    +Click to expand + +```yaml +model_options: + test_size: 0.2 + random_state: 3 + features: + - engines + - passenger_capacity + - crew + - d_check_complete + - moon_clearance_complete + - iata_approved + - company_rating + - review_scores_rating +``` +
    + +Here, the parameters `test_size` and `random_state` are used as part of the train-test split, and `features` gives the names of columns in the model input table to use as features. + +More information about [parameters is available in the configuration documentation](../configuration/parameters.md). + +## Model registration + +The following definition in `conf/base/catalog.yml` registers the dataset that saves the trained model: + +```yaml +regressor: + type: pickle.PickleDataSet + filepath: data/06_models/regressor.pickle + versioned: true +``` + +By setting `versioned` to `true`, versioning is enabled for `regressor`. This means that the pickled output of the `regressor` is saved every time the pipeline runs, which stores the history of the models built using this pipeline. You can learn more in the [Versioning section](../data/kedro_io.md#versioning). + + +## Data science pipeline + +The data science pipeline is defined in `src/spaceflights/pipelines/data_science/pipeline.py`: + +
    +Click to expand + +```python +from kedro.pipeline import Pipeline, node, pipeline + +from .nodes import evaluate_model, split_data, train_model + + +def create_pipeline(**kwargs) -> Pipeline: + return pipeline( + [ + node( + func=split_data, + inputs=["model_input_table", "params:model_options"], + outputs=["X_train", "X_test", "y_train", "y_test"], + name="split_data_node", + ), + node( + func=train_model, + inputs=["X_train", "y_train"], + outputs="regressor", + name="train_model_node", + ), + node( + func=evaluate_model, + inputs=["regressor", "X_test", "y_test"], + outputs=None, + name="evaluate_model_node", + ), + ] + ) +``` +
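Because node dependencies are declared through named inputs and outputs, you can inspect the execution order Kedro resolves without running anything. A small sketch, assuming the default package name `spaceflights`:

```python
# Minimal sketch: build the pipeline object and inspect the resolved node order.
# Assumes the default package name `spaceflights`.
from spaceflights.pipelines.data_science.pipeline import create_pipeline

ds_pipeline = create_pipeline()
print(ds_pipeline.describe())  # human-readable summary of nodes and datasets
print([n.name for n in ds_pipeline.nodes])
# Expected topological order: split_data_node, train_model_node, evaluate_model_node
```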
    + + +## Test the pipelines + +When you created your project with `kedro new`, one of the files generated was `src//pipeline_registry.py` which constructs a `__default__` pipeline that includes every pipeline in the project. + +This means that you do not need to manually instruct Kedro to run each pipeline, but can execute the default pipeline, which consists of the data processing and then data science pipeline in turn. + +```bash +kedro run +``` + +You should see output similar to the following: + +
    +Click to expand + +```bash + INFO Loading data from 'companies' (CSVDataSet)... data_catalog.py:343 + INFO Running node: preprocess_companies_node: node.py:327 + preprocess_companies([companies]) -> [preprocessed_companies] + INFO Saving data to 'preprocessed_companies' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 1 out of 6 tasks sequential_runner.py:85 + INFO Loading data from 'shuttles' (ExcelDataSet)... data_catalog.py:343 +[08/09/22 16:56:15] INFO Running node: preprocess_shuttles_node: preprocess_shuttles([shuttles]) node.py:327 + -> [preprocessed_shuttles] + INFO Saving data to 'preprocessed_shuttles' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 2 out of 6 tasks sequential_runner.py:85 + INFO Loading data from 'preprocessed_shuttles' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'preprocessed_companies' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'reviews' (CSVDataSet)... data_catalog.py:343 + INFO Running node: create_model_input_table_node: node.py:327 + create_model_input_table([preprocessed_shuttles,preprocessed_companies, + reviews]) -> [model_input_table] +[08/09/22 16:56:18] INFO Saving data to 'model_input_table' (MemoryDataSet)... data_catalog.py:382 +[08/09/22 16:56:19] INFO Completed 3 out of 6 tasks sequential_runner.py:85 + INFO Loading data from 'model_input_table' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'params:model_options' (MemoryDataSet)... data_catalog.py:343 + INFO Running node: split_data_node: node.py:327 + split_data([model_input_table,params:model_options]) -> + [X_train,X_test,y_train,y_test] + INFO Saving data to 'X_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'X_test' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'y_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'y_test' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 4 out of 6 tasks sequential_runner.py:85 + INFO Loading data from 'X_train' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'y_train' (MemoryDataSet)... data_catalog.py:343 + INFO Running node: train_model_node: train_model([X_train,y_train]) -> node.py:327 + [regressor] +[08/09/22 16:56:20] INFO Saving data to 'regressor' (PickleDataSet)... data_catalog.py:382 + INFO Completed 5 out of 6 tasks sequential_runner.py:85 + INFO Loading data from 'regressor' (PickleDataSet)... data_catalog.py:343 + INFO Loading data from 'X_test' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'y_test' (MemoryDataSet)... data_catalog.py:343 + INFO Running node: evaluate_model_node: node.py:327 + evaluate_model([regressor,X_test,y_test]) -> None + INFO Model has a coefficient R^2 of 0.462 on test data. nodes.py:55 + INFO Completed 6 out of 6 tasks sequential_runner.py:85 + INFO Pipeline execution completed successfully. runner.py:89 +``` + +
    + +As you can see, the `data_processing` and `data_science` pipelines ran successfully, generated a model and evaluated it. + + +### Slice a pipeline + +There may be occasions when you want to run just part of the default pipeline. For example, you could skip `data_processing` execution and run only the `data_science` pipeline to tune the hyperparameters of the price prediction model. + +You can 'slice' the pipeline and specify just the portion you want to run by using the `--pipeline` option. For example, to only run the pipeline named `data_science` (as labelled automatically in `register_pipelines`), execute the following command: + +```bash +kedro run --pipeline=data_science +``` + +There are a range of options to run sections of the default pipeline as described in the [pipeline slicing documentation](../nodes_and_pipelines/slice_a_pipeline.md) and the ``kedro run`` [CLI documentation](../development/commands_reference.md#modifying-a-kedro-run). + +## Modular pipelines + +In many typical Kedro projects, a single (“main”) pipeline increases in complexity as the project evolves. To keep your project fit for purpose, we recommend that you create [modular pipelines](../nodes_and_pipelines/modular_pipelines.md), which are logically isolated and can be reused. You can instantiate a modular pipeline multiple times as a "template" pipeline that can run with different inputs/outputs/parameters. + +Modular pipelines are easier to develop, test and maintain. They are reusable within the same codebase but also portable across projects via [micro-packaging](../nodes_and_pipelines/micro_packaging.md) as a scalable way to use Kedro pipelines. + +### Optional: Extend the project with namespacing and a modular pipeline +This is optional code so is **not** provided in the spaceflights starter. If you want to see this in action, you need to copy and paste the code as instructed. + +First, add namespaces to the modelling component of the data science pipeline to instantiate it as a template with different parameters for an `active_modelling_pipeline` and a `candidate_modelling_pipeline` to test the model using different combinations of features. + + +1. Update your catalog to add namespaces to the outputs of each instance. Replace the `regressor` key with the following two new dataset keys in the `conf/base/catalog.yml` file: + +
    +Click to expand + +```yaml +active_modelling_pipeline.regressor: + type: pickle.PickleDataSet + filepath: data/06_models/regressor_active.pickle + versioned: true + +candidate_modelling_pipeline.regressor: + type: pickle.PickleDataSet + filepath: data/06_models/regressor_candidate.pickle + versioned: true + +``` +

    + +2. Update the parameters file for the data science pipeline in `conf/base/parameters/data_science.yml` to replace the existing contents for `model_options` with the following for the two instances of the template pipeline: + +
    +Click to expand + +```yaml +active_modelling_pipeline: + model_options: + test_size: 0.2 + random_state: 3 + features: + - engines + - passenger_capacity + - crew + - d_check_complete + - moon_clearance_complete + - iata_approved + - company_rating + - review_scores_rating + +candidate_modelling_pipeline: + model_options: + test_size: 0.2 + random_state: 8 + features: + - engines + - passenger_capacity + - crew + - review_scores_rating +``` +

    + +3. Replace the code in `pipelines/data_science/pipeline.py` with the snippet below: + +
    +Click to expand + +```python +from kedro.pipeline import Pipeline, node +from kedro.pipeline.modular_pipeline import pipeline + +from .nodes import evaluate_model, split_data, train_model + + +def create_pipeline(**kwargs) -> Pipeline: + pipeline_instance = pipeline( + [ + node( + func=split_data, + inputs=["model_input_table", "params:model_options"], + outputs=["X_train", "X_test", "y_train", "y_test"], + name="split_data_node", + ), + node( + func=train_model, + inputs=["X_train", "y_train"], + outputs="regressor", + name="train_model_node", + ), + node( + func=evaluate_model, + inputs=["regressor", "X_test", "y_test"], + outputs=None, + name="evaluate_model_node", + ), + ] + ) + ds_pipeline_1 = pipeline( + pipe=pipeline_instance, + inputs="model_input_table", + namespace="active_modelling_pipeline", + ) + ds_pipeline_2 = pipeline( + pipe=pipeline_instance, + inputs="model_input_table", + namespace="candidate_modelling_pipeline", + ) + + return ds_pipeline_1 + ds_pipeline_2 +``` + +

    + +4. Execute `kedro run` from the terminal. You should see output as follows: + +
    +Click to expand + +```bash +[11/02/22 10:41:08] INFO Loading data from 'companies' (CSVDataSet)... data_catalog.py:343 + INFO Running node: preprocess_companies_node: preprocess_companies([companies]) -> node.py:327 + [preprocessed_companies] + INFO Saving data to 'preprocessed_companies' (ParquetDataSet)... data_catalog.py:382 + INFO Completed 1 out of 9 tasks sequential_runner.py:85 + INFO Loading data from 'shuttles' (ExcelDataSet)... data_catalog.py:343 +[11/02/22 10:41:13] INFO Running node: preprocess_shuttles_node: preprocess_shuttles([shuttles]) -> node.py:327 + [preprocessed_shuttles] + INFO Saving data to 'preprocessed_shuttles' (ParquetDataSet)... data_catalog.py:382 + INFO Completed 2 out of 9 tasks sequential_runner.py:85 + INFO Loading data from 'preprocessed_shuttles' (ParquetDataSet)... data_catalog.py:343 + INFO Loading data from 'preprocessed_companies' (ParquetDataSet)... data_catalog.py:343 + INFO Loading data from 'reviews' (CSVDataSet)... data_catalog.py:343 + INFO Running node: create_model_input_table_node: node.py:327 + create_model_input_table([preprocessed_shuttles,preprocessed_companies,reviews]) -> + [model_input_table] +^[[B[11/02/22 10:41:14] INFO Saving data to 'model_input_table' (ParquetDataSet)... data_catalog.py:382 +[11/02/22 10:41:15] INFO Completed 3 out of 9 tasks sequential_runner.py:85 + INFO Loading data from 'model_input_table' (ParquetDataSet)... data_catalog.py:343 + INFO Loading data from 'params:active_modelling_pipeline.model_options' (MemoryDataSet)... data_catalog.py:343 + INFO Running node: split_data_node: node.py:327 + split_data([model_input_table,params:active_modelling_pipeline.model_options]) -> + [active_modelling_pipeline.X_train,active_modelling_pipeline.X_test,active_modelling_pipeline.y_t + rain,active_modelling_pipeline.y_test] + INFO Saving data to 'active_modelling_pipeline.X_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'active_modelling_pipeline.X_test' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'active_modelling_pipeline.y_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'active_modelling_pipeline.y_test' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 4 out of 9 tasks sequential_runner.py:85 + INFO Loading data from 'model_input_table' (ParquetDataSet)... data_catalog.py:343 + INFO Loading data from 'params:candidate_modelling_pipeline.model_options' (MemoryDataSet)... data_catalog.py:343 + INFO Running node: split_data_node: node.py:327 + split_data([model_input_table,params:candidate_modelling_pipeline.model_options]) -> + [candidate_modelling_pipeline.X_train,candidate_modelling_pipeline.X_test,candidate_modelling_pip + eline.y_train,candidate_modelling_pipeline.y_test] + INFO Saving data to 'candidate_modelling_pipeline.X_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'candidate_modelling_pipeline.X_test' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'candidate_modelling_pipeline.y_train' (MemoryDataSet)... data_catalog.py:382 + INFO Saving data to 'candidate_modelling_pipeline.y_test' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 5 out of 9 tasks sequential_runner.py:85 + INFO Loading data from 'active_modelling_pipeline.X_train' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'active_modelling_pipeline.y_train' (MemoryDataSet)... 
data_catalog.py:343 + INFO Running node: train_model_node: node.py:327 + train_model([active_modelling_pipeline.X_train,active_modelling_pipeline.y_train]) -> + [active_modelling_pipeline.regressor] + INFO Saving data to 'active_modelling_pipeline.regressor' (PickleDataSet)... data_catalog.py:382 + INFO Completed 6 out of 9 tasks sequential_runner.py:85 + INFO Loading data from 'candidate_modelling_pipeline.X_train' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'candidate_modelling_pipeline.y_train' (MemoryDataSet)... data_catalog.py:343 + INFO Running node: train_model_node: node.py:327 + train_model([candidate_modelling_pipeline.X_train,candidate_modelling_pipeline.y_train]) -> + [candidate_modelling_pipeline.regressor] + INFO Saving data to 'candidate_modelling_pipeline.regressor' (PickleDataSet)... data_catalog.py:382 + INFO Completed 7 out of 9 tasks sequential_runner.py:85 + INFO Loading data from 'active_modelling_pipeline.regressor' (PickleDataSet)... data_catalog.py:343 + INFO Loading data from 'active_modelling_pipeline.X_test' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'active_modelling_pipeline.y_test' (MemoryDataSet)... data_catalog.py:343 + INFO Running node: evaluate_model_node: node.py:327 + evaluate_model([active_modelling_pipeline.regressor,active_modelling_pipeline.X_test,active_model + ling_pipeline.y_test]) -> None + INFO Model has a coefficient R^2 of 0.462 on test data. nodes.py:60 + INFO Completed 8 out of 9 tasks sequential_runner.py:85 + INFO Loading data from 'candidate_modelling_pipeline.regressor' (PickleDataSet)... data_catalog.py:343 + INFO Loading data from 'candidate_modelling_pipeline.X_test' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'candidate_modelling_pipeline.y_test' (MemoryDataSet)... data_catalog.py:343 + INFO Running node: evaluate_model_node: node.py:327 + evaluate_model([candidate_modelling_pipeline.regressor,candidate_modelling_pipeline.X_test,candid + ate_modelling_pipeline.y_test]) -> None + INFO Model has a coefficient R^2 of 0.449 on test data. nodes.py:60 + INFO Completed 9 out of 9 tasks sequential_runner.py:85 + INFO Pipeline execution completed successfully. +``` +

    + +### How it works: the modular `pipeline()` wrapper + +The import you added to the code introduces the pipeline wrapper, which enables you to instantiate multiple instances of pipelines with static structure, but dynamic inputs/outputs/parameters: + +```python +from kedro.pipeline.modular_pipeline import pipeline +``` + +The `pipeline()` wrapper method takes the following arguments: + +| Keyword argument | Description | +| ---------------- | ----------------------------------------------------------------------------------- | +| `pipe` | The `Pipeline` object you want to wrap | +| `inputs` | Any overrides provided to this instance of the underlying wrapped `Pipeline` object | +| `outputs` | Any overrides provided to this instance of the underlying wrapped `Pipeline` object | +| `parameters` | Any overrides provided to this instance of the underlying wrapped `Pipeline` object | +| `namespace` | The namespace that will be encapsulated by this pipeline instance | + + +You can see this snippet as part of the code you added to the example: + +
    +Click to expand + +```python +... + +ds_pipeline_1 = pipeline( + pipe=pipeline_instance, + inputs="model_input_table", + namespace="active_modelling_pipeline", +) + +ds_pipeline_2 = pipeline( + pipe=pipeline_instance, + inputs="model_input_table", + namespace="candidate_modelling_pipeline", +) +``` +
    + +The code instantiates the template_pipeline twice but passes in different parameters. The `pipeline_instance` variable is the template pipeline, and `ds_pipeline_1` and `ds_pipeline_2` are the two separately parameterised instantiations. + +#### How do namespaces affect parameters? + +All `inputs` and `outputs` within the nodes of the `ds_pipeline_1` have the `active_modelling_pipeline` prefix: + +- `params:model_options` turns into `active_modelling_pipeline.params:model_options` +- `X_train` turns into `active_modelling_pipeline.X_train` +- `X_test` turns into `active_modelling_pipeline.X_test`, and so on + +There are a separate set of parameters for `ds_pipeline_2` with the `candidate_modelling_pipeline` prefix: + +- `params:model_options` turns into `candidate_modelling_pipeline.params:model_options` +- `X_train` turns into `candidate_modelling_pipeline.X_train` +- `X_test` turns into `candidate_modelling_pipeline.X_test`, and so on + +However, `model_input_table` does not get parameterised as it needs to be shared between instances, so is frozen outside the scope of the namespace wrappers. + +This renders as follows using `kedro viz` (hover over the datasets to see their full path) : + +![modular_ds](../meta/images/modular_ds.gif) + +## Optional: Kedro runners + +There are three different Kedro runners that can run the pipeline: + +* `SequentialRunner` - runs nodes sequentially; once a node has completed its task then the next one starts. +* `ParallelRunner` - runs nodes in parallel; independent nodes are able to run at the same time, which is more efficient when there are independent branches in your pipeline and enables you to take advantage of multiple CPU cores. +* `ThreadRunner` - runs nodes in parallel, similarly to `ParallelRunner`, but uses multithreading instead of multiprocessing. + +By default, Kedro uses a `SequentialRunner`, which is instantiated when you execute `kedro run` from the terminal. If you decide to use `ParallelRunner`, `ThreadRunner` or a custom runner, you can do so through the `--runner` flag as follows: + +```bash +kedro run --runner=ParallelRunner +kedro run --runner=ThreadRunner +kedro run --runner=module.path.to.my.runner +``` + +`ParallelRunner` performs task parallelisation via multiprocessing, while `ThreadRunner` is intended for use with remote execution engines such as [Spark](../integrations/pyspark_integration.md) and [Dask](/kedro_datasets.dask.ParquetDataSet). + +You can find out more about the runners Kedro provides, and how to create your own, in the [pipeline documentation about runners](../nodes_and_pipelines/run_a_pipeline.md). diff --git a/docs/source/tutorial/create_a_pipeline.md b/docs/source/tutorial/create_a_pipeline.md new file mode 100644 index 0000000000..d0173a1cc9 --- /dev/null +++ b/docs/source/tutorial/create_a_pipeline.md @@ -0,0 +1,352 @@ +# Create a data processing pipeline + +This section explains the following: + +* How to create a Kedro node from a Python function +* How to construct a Kedro pipeline from a set of nodes +* How to persist, or save, datasets output from the pipeline by registering them in the data catalog +* How to run the pipeline + +## Introduction + +The data processing pipeline prepares the data for model building by combining the datasets to create a model input table. 
The data processing pipeline is made up of the following: + +* Two python files within `src/spaceflights/pipelines/data_processing` + * `nodes.py` (for the node functions that form the data processing) + * `pipeline.py` (to build the pipeline) +* A yaml file: `conf/base/parameters/data_processing.yml` to define the parameters used when running the pipeline +* `__init__.py` files in the required folders to ensure that Python can import the pipeline + +```{note} +Kedro provides the `kedro pipeline create` command to add the skeleton code for a new pipeline. If you are writing a project from scratch and want to add a new pipeline, run the following from the terminal: `kedro pipeline create `. You do **not** need to do this in the spaceflights example as it is already supplied by the starter project. +``` + +## Data preprocessing node functions + +The first step is to preprocess two of the datasets, `companies.csv`, and `shuttles.xlsx`. The preprocessing code for the nodes is in `src/spaceflights/pipelines/data_processing/nodes.py` as a pair of functions (`preprocess_companies` and `preprocess_shuttles`). Each takes a raw DataFrame as input, converts the data in several columns to different types, and outputs a DataFrame containing the preprocessed data: + +
    +Click to expand + +```python +import pandas as pd + + +def _is_true(x: pd.Series) -> pd.Series: + return x == "t" + + +def _parse_percentage(x: pd.Series) -> pd.Series: + x = x.str.replace("%", "") + x = x.astype(float) / 100 + return x + + +def _parse_money(x: pd.Series) -> pd.Series: + x = x.str.replace("$", "").str.replace(",", "") + x = x.astype(float) + return x + + +def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame: + """Preprocesses the data for companies. + + Args: + companies: Raw data. + Returns: + Preprocessed data, with `company_rating` converted to a float and + `iata_approved` converted to boolean. + """ + companies["iata_approved"] = _is_true(companies["iata_approved"]) + companies["company_rating"] = _parse_percentage(companies["company_rating"]) + return companies + + +def preprocess_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame: + """Preprocesses the data for shuttles. + + Args: + shuttles: Raw data. + Returns: + Preprocessed data, with `price` converted to a float and `d_check_complete`, + `moon_clearance_complete` converted to boolean. + """ + shuttles["d_check_complete"] = _is_true(shuttles["d_check_complete"]) + shuttles["moon_clearance_complete"] = _is_true(shuttles["moon_clearance_complete"]) + shuttles["price"] = _parse_money(shuttles["price"]) + return shuttles +``` + +
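To see what the preprocessing does, you can call one of these functions directly on a tiny, made-up DataFrame. This is a quick sketch for exploration (it assumes the default package name `spaceflights`), not something you need to add to the project:

```python
# Minimal sketch: run `preprocess_companies` on made-up data to see the type conversions.
# Assumes the default package name `spaceflights`.
import pandas as pd

from spaceflights.pipelines.data_processing.nodes import preprocess_companies

companies = pd.DataFrame(
    {
        "id": [1, 2],
        "company_rating": ["100%", "67%"],
        "iata_approved": ["t", "f"],
    }
)

preprocessed = preprocess_companies(companies)
print(preprocessed.dtypes)  # company_rating becomes float, iata_approved becomes bool
print(preprocessed)
```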
    + +## The data processing pipeline + +Next, take a look at `src/spaceflights/pipelines/data_processing/pipeline.py` which constructs a [node](../resources/glossary.md#node) for each function defined above and creates a [modular pipeline](../resources/glossary.md#modular-pipeline) for data processing: + + +
    +Click to expand + +```python +from kedro.pipeline import Pipeline, node, pipeline + +from .nodes import preprocess_companies, preprocess_shuttles + +... + + +def create_pipeline(**kwargs) -> Pipeline: + return pipeline( + [ + node( + func=preprocess_companies, + inputs="companies", + outputs="preprocessed_companies", + name="preprocess_companies_node", + ), + node( + func=preprocess_shuttles, + inputs="shuttles", + outputs="preprocessed_shuttles", + name="preprocess_shuttles_node", + ), + ..., + ] + ) +``` + +
    + + +Note that the `inputs` statements for `companies` and `shuttles` refer to the datasets defined in `conf/base/catalog.yml`. They are inputs to the `preprocess_companies` and `preprocess_shuttles` functions. Kedro uses the named node inputs (and outputs) to determine interdependencies between the nodes, and their execution order. + + +## Test the example + +Run the following command in your terminal window to test the node named `preprocess_companies_node`: + +```bash +kedro run --nodes=preprocess_companies_node +``` + +You should see output similar to the below: + +
    +Click to expand + +```bash +[08/09/22 16:43:11] INFO Loading data from 'companies' (CSVDataSet)... data_catalog.py:343 + INFO Running node: preprocess_companies_node: node.py:327 + preprocess_companies([companies]) -> [preprocessed_companies] + INFO Saving data to 'preprocessed_companies' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 1 out of 1 tasks sequential_runner.py:85 + INFO Pipeline execution completed successfully. runner.py:89 + INFO Loading data from 'preprocessed_companies' (MemoryDataSet)... data_catalog.py:343 + +``` +
    + +You can run the `preprocess_shuttles` node similarly. To test both nodes together as the complete data processing pipeline: + +```bash +kedro run +``` + +You should see output similar to the following: + +
    +Click to expand + +```bash + INFO Loading data from 'companies' (CSVDataSet)... data_catalog.py:343 + INFO Running node: preprocess_companies_node: node.py:327 + preprocess_companies([companies]) -> [preprocessed_companies] + INFO Saving data to 'preprocessed_companies' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 1 out of 2 tasks sequential_runner.py:85 + INFO Loading data from 'shuttles' (ExcelDataSet)... data_catalog.py:343 +[08/09/22 16:46:08] INFO Running node: preprocess_shuttles_node: preprocess_shuttles([shuttles]) node.py:327 + -> [preprocessed_shuttles] + INFO Saving data to 'preprocessed_shuttles' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 2 out of 2 tasks sequential_runner.py:85 + INFO Pipeline execution completed successfully. runner.py:89 + INFO Loading data from 'preprocessed_companies' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'preprocessed_shuttles' (MemoryDataSet)... data_catalog.py:343 + +``` +
    + +## Preprocessed data registration + +Each of the nodes outputs a new dataset (`preprocessed_companies` and `preprocessed_shuttles`). Kedro saves these outputs in Parquet format using [pandas.ParquetDataSet](/kedro_datasets.pandas.ParquetDataSet) because they are registered within the [Data Catalog](../resources/glossary.md#data-catalog), as you can see in `conf/base/catalog.yml`: + +
    +Click to expand + +```yaml +preprocessed_companies: + type: pandas.ParquetDataSet + filepath: data/02_intermediate/preprocessed_companies.pq + +preprocessed_shuttles: + type: pandas.ParquetDataSet + filepath: data/02_intermediate/preprocessed_shuttles.pq +``` +
    + +If you remove these lines from `catalog.yml`, Kedro still runs the pipeline successfully and automatically stores the preprocessed data, in memory, as temporary Python objects of the [MemoryDataSet](/kedro.io.MemoryDataSet) class. Once all nodes that depend on a temporary dataset have executed, Kedro clears the dataset and the Python garbage collector releases the memory. + + +## Create a table for model input + +The next step adds another node that joins together three datasets (`preprocessed_shuttles`, `preprocessed_companies`, and `reviews`) into a single model input table which is saved as `model_input_table`. + +The code for the `create_model_input_table()` function is in `src/spaceflights/pipelines/data_processing/nodes.py`: + +
    +Click to expand + +```python +def create_model_input_table( + shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame +) -> pd.DataFrame: + """Combines all data to create a model input table. + + Args: + shuttles: Preprocessed data for shuttles. + companies: Preprocessed data for companies. + reviews: Raw data for reviews. + Returns: + model input table. + + """ + rated_shuttles = shuttles.merge(reviews, left_on="id", right_on="shuttle_id") + model_input_table = rated_shuttles.merge( + companies, left_on="company_id", right_on="id" + ) + model_input_table = model_input_table.dropna() + return model_input_table +``` + +
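To see how the merge keys line up, you can call the function on tiny, made-up frames containing only the columns the joins need. This sketch assumes the default package name `spaceflights`:

```python
# Minimal sketch: how the joins in `create_model_input_table` line up.
# Assumes the default package name `spaceflights`; the data below is made up.
import pandas as pd

from spaceflights.pipelines.data_processing.nodes import create_model_input_table

shuttles = pd.DataFrame({"id": [10, 11], "company_id": [1, 2], "price": [100.0, 200.0]})
companies = pd.DataFrame({"id": [1, 2], "company_rating": [0.9, 0.8]})
reviews = pd.DataFrame({"shuttle_id": [10, 11], "review_scores_rating": [95.0, 88.0]})

# shuttles joins reviews on id == shuttle_id, then companies on company_id == id
model_input_table = create_model_input_table(shuttles, companies, reviews)
print(model_input_table)
```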
    + + +The node is created in `src/spaceflights/pipelines/data_processing/pipeline.py`: + +
    +Click to expand + +```python +from kedro.pipeline import Pipeline, node, pipeline + +from .nodes import create_model_input_table, preprocess_companies, preprocess_shuttles + + +def create_pipeline(**kwargs) -> Pipeline: + return pipeline( + [ + node( + func=preprocess_companies, + inputs="companies", + outputs="preprocessed_companies", + name="preprocess_companies_node", + ), + node( + func=preprocess_shuttles, + inputs="shuttles", + outputs="preprocessed_shuttles", + name="preprocess_shuttles_node", + ), + node( + func=create_model_input_table, + inputs=["preprocessed_shuttles", "preprocessed_companies", "reviews"], + outputs="model_input_table", + name="create_model_input_table_node", + ), + ] + ) +``` +
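With all three nodes in place you can check which datasets the pipeline expects from the catalog, and which it produces, without running it. A small sketch, assuming the default package name `spaceflights`:

```python
# Minimal sketch: the pipeline's free inputs and final outputs.
# Assumes the default package name `spaceflights`.
from spaceflights.pipelines.data_processing.pipeline import create_pipeline

dp_pipeline = create_pipeline()
print(dp_pipeline.inputs())   # the datasets Kedro looks up in the catalog: companies, shuttles, reviews
print(dp_pipeline.outputs())  # the final output not consumed by any node: model_input_table
```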
    + +## Model input table registration + +The following entry in `conf/base/catalog.yml` saves the model input table dataset to file (in `data/03_primary`): + +```yaml +model_input_table: + type: pandas.ParquetDataSet + filepath: data/03_primary/model_input_table.pq +``` + +## Test the example again + +To test the progress of the example: + +```bash +kedro run +``` + +You should see output similar to the following: + +
    +Click to expand + +```bash +[08/09/22 17:01:10] INFO Reached after_catalog_created hook plugin.py:17 + INFO Loading data from 'companies' (CSVDataSet)... data_catalog.py:343 + INFO Running node: preprocess_companies_node: node.py:327 + preprocess_companies([companies]) -> [preprocessed_companies] + INFO Saving data to 'preprocessed_companies' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 1 out of 3 tasks sequential_runner.py:85 + INFO Loading data from 'shuttles' (ExcelDataSet)... data_catalog.py:343 +[08/09/22 17:01:25] INFO Running node: preprocess_shuttles_node: preprocess_shuttles([shuttles]) node.py:327 + -> [preprocessed_shuttles] + + INFO Saving data to 'preprocessed_shuttles' (MemoryDataSet)... data_catalog.py:382 + INFO Completed 2 out of 3 tasks sequential_runner.py:85 + INFO Loading data from 'preprocessed_shuttles' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'preprocessed_companies' (MemoryDataSet)... data_catalog.py:343 + INFO Loading data from 'reviews' (CSVDataSet)... data_catalog.py:343 + INFO Running node: create_model_input_table_node: node.py:327 + create_model_input_table([preprocessed_shuttles,preprocessed_companies, + reviews]) -> [model_input_table] +[08/09/22 17:01:28] INFO Saving data to 'model_input_table' (MemoryDataSet)... data_catalog.py:382 +[08/09/22 17:01:29] INFO Completed 3 out of 3 tasks sequential_runner.py:85 + INFO Pipeline execution completed successfully. runner.py:89 + INFO Loading data from 'model_input_table' (MemoryDataSet)... data_catalog.py:343 +``` +
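After the run completes you can check that the model input table really was persisted by reloading it through the Data Catalog. A quick sketch, assuming a `kedro ipython` session where the `catalog` variable is already defined:

```python
# Minimal sketch: reload the persisted output in a `kedro ipython` session,
# where `catalog` is predefined by the Kedro IPython extension.
model_input_table = catalog.load("model_input_table")
model_input_table.head()
```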
    + +## Visualise the project + +This section introduces project visualisation using Kedro-Viz, which is a separate package from the standard Kedro installation. To install it your virtual environment: + +```bash +pip install kedro-viz +``` + +To start Kedro-Viz, enter the following in your terminal: + +```bash +kedro viz +``` + +This command automatically opens a browser tab to serve the visualisation at `http://127.0.0.1:4141/`. Explore the visualisation at leisure, and consult the [visualisation documentation](../visualisation/kedro-viz_visualisation) for more detail. + +To exit, close the browser tab. To regain control of the terminal, enter `^+c` on Mac or `Ctrl+c` on Windows or Linux machines. + +## Checkpoint + +This is an excellent place to take a breath and summarise what you have seen in the example so far. + +![](../meta/images/coffee-cup.png) + +Photo by Malte Helmhold on Unsplash + + +* How to create a new Kedro project from a starter and install its dependencies +* How to add three datasets to the project and set up the Kedro Data Catalog +* How to create a data processing pipeline with three nodes to transform and merge the input datasets and create a model input table +* How to persist the output from a pipeline by registering those datasets to the Data Catalog +* How to visualise the project + +The next step is to create the data science pipeline for spaceflight price prediction. diff --git a/docs/source/tutorial/package_a_project.md b/docs/source/tutorial/package_a_project.md new file mode 100644 index 0000000000..010aed2e6c --- /dev/null +++ b/docs/source/tutorial/package_a_project.md @@ -0,0 +1,156 @@ +# Package an entire Kedro project + +This section explains how to build project documentation, and how to bundle a Kedro project into a Python package. + +Kedro also has an advanced feature which supports packaging on a pipeline level allowing you share and reuse pipelines across projects! To read more about this please look at the [section on micro-packaging](../nodes_and_pipelines/micro_packaging.md). + +## Add documentation to a Kedro project + +There are several documentation frameworks for Python projects. This section describes how to use [Sphinx](https://www.sphinx-doc.org). + +To install Sphinx, run the following: + +```bash +pip install sphinx +``` + +### Set up the Sphinx project files + +```{warning} +Currently, Kedro projects are created with a `docs/source` subdirectory, which gets pre-populated with two Sphinx configuration files (`conf.py`, and `index.rst`), needed by the `kedro build-docs` command. This command is deprecated; it will be removed in Kedro version 0.19, along with those dummy files. + +Before proceeding with these instructions, back up the contents of `docs/source/index.rst` and remove both `docs/source/conf.py` and `docs/source/index.rst`. +``` + +First, run the following command: + +```bash +sphinx-quickstart docs +``` + +Sphinx will ask a series of configuration questions. The first is as follows: + +```text +You have two options for placing the build directory for Sphinx output. +Either, you use a directory "_build" within the root path, +or you separate "source" and "build" directories within the root path. + +> Separate source and build directories (y/n)? [n]: +``` + +Select `y` to separate the build files from the source files, and enter any additional information that Sphinx requests such as the project name and the documentation language, which defaults to English. 
+ +### Build HTML documentation + +```{warning} +If you previously backed up the contents of `index.rst`, restore them before proceeding. +``` + +After the quickstart process is complete, you can build the documentation by **navigating to the `docs` directory** and running the following: + +```bash +make html +``` + +Project documentation will be written to the `docs/build/html` directory. + +You may want to add project-specific Markdown documentation within the `docs/source` folder of your Kedro project. To be able to build it, follow the [introduction instructions of MyST-Parser](https://myst-parser.readthedocs.io/en/stable/intro.html) and update the `docs/source/index.rst` file to add the Markdown files to the table of contents. + +### Documentation from docstrings +If you wish to add documentation built from [`docstrings`](https://datacamp.com/community/tutorials/docstrings-python) within your project, you need to make some changes to the Sphinx configuration files found in the `docs/source` directory to use [automatic documentation generation from code](https://www.sphinx-doc.org/en/master/tutorial/automatic-doc-generation.html). + +In `conf.py`, add the following to ensure that the `sphinx.ext.autodoc` and `sphinx.ext.autosummary` extensions are specified, and `autosummary_generate` is enabled: + +```python +extensions = ["sphinx.ext.autodoc", "sphinx.ext.autosummary"] +autosummary_generate = True +``` + +Then, to include the autodoc modules in the build, run the following command once **from the `docs` folder**: + +```bash +sphinx-apidoc --module-first -o source ../src/ +``` + +This will generate a `docs/source/modules.rst` file, as well as other files containing references to any docstrings. To include those in your documentation, make sure `docs/source/index.rst` has a `modules` entry in the table of contents: + +```text +.. toctree:: + + modules +``` + +**From the `docs` folder**, run the following: + +```text +pip install -e ../src +``` + +Finally, **from the `docs` folder**, run this command to build a full set of documentation that automatically includes docstrings: + +```text +make html +``` + +```{note} +Consult the Sphinx project documentation for [additional options to pass to `sphinx-build`](https://www.sphinx-doc.org/en/master/man/sphinx-build.html). To customise your documentation beyond the basic template, you'll need to adjust the [Sphinx configuration settings](https://www.sphinx-doc.org/en/master/usage/configuration.html), which are stored in the `docs/source/conf.py` file. +``` + +## Package a Kedro project + +To package a project, run the following in your project root directory: + +```bash +kedro package +``` + +Kedro builds the package into the `dist` folder of the project as a `.whl` file, which is a [Python packaging format for binary distribution](https://packaging.python.org/en/latest/overview/#python-binary-distributions). + +The resulting `.whl` package only contains the Python source code of the Kedro pipeline, not any of the `conf` and `data` subfolders. This means that you can distribute the project to run elsewhere, such as on a separate computer with different configuration information, datasets and logging locations. + +The project configuration is provided separately in a `tar.gz` file, also inside the `dist` folder. This compressed version of the config files excludes any files inside the `local` directory. + +### Run a packaged project + +To run a packaged project, it must first be installed. To install the package from a `.whl` file, you need to have Python and `pip` installed on your machine, but you do not need to have Kedro installed. + +To install the project, run the following command: + +```bash +pip install <path-to-wheel-file> +``` + +```{note} +Once the packaged project is installed, you will need to add: + +* a `conf` folder +* a `data` folder if the pipeline loads/saves local data + +Alternatively, you can make use of the `OmegaConfigLoader` to run the configuration directly from the compressed `.tar.gz` configuration file by running `kedro run --conf-source <path-to-compressed-config>.tar.gz`. +``` + +Once your project is installed, it can be run either from the command line or interactively using Python code. + +To do a basic run of your installed project from the command line, run `python -m <package_name>`. The packaged project also exposes a command line interface which you can use to modify how your project will be run. To see a list of options, use `python -m <package_name> --help` at the command line. + +To run your packaged project interactively using code, you can import `main` from the project: + +```python +from <package_name>.__main__ import main + +main( + ["--pipeline", "__default__"] +) # or simply main() if you don't want to provide any arguments +``` + +This is equivalent to `python -m <package_name>` at the command line, and you can pass in all the arguments that correspond to the options described by `python -m <package_name> --help`. + +### Docker, Airflow and other deployment targets + +There are various methods to deploy packaged pipelines via Kedro plugins: + +* The [Kedro-Docker](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) plugin, for packaging and shipping Kedro projects within [Docker](https://www.docker.com/) containers. +* [Kedro-Airflow](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow), to convert your Kedro project into an [Airflow](https://airflow.apache.org/) project. +* The [Deployment guide](../deployment/index) touches on other deployment targets such as AWS Batch and Prefect, and there is a [range of third-party plugins for deployment](../extend_kedro/plugins.md#community-developed-plugins). diff --git a/docs/source/tutorial/set_up_data.md b/docs/source/tutorial/set_up_data.md new file mode 100644 index 0000000000..364818b3a1 --- /dev/null +++ b/docs/source/tutorial/set_up_data.md @@ -0,0 +1,127 @@ +# Set up the data + +This section shows how to add datasets to the project's `data` folder. It also reviews how those datasets are registered in [Kedro's Data Catalog](../data/data_catalog.md), which is the registry of all data sources available for use by the project. + +## Project datasets + +The spaceflights tutorial makes use of three fictional datasets of companies shuttling customers to the Moon and back. The data comes in two different formats: `.csv` and `.xlsx`: + +* `companies.csv` contains data about space travel companies, such as their location, fleet count and rating +* `reviews.csv` is a set of reviews from customers for categories such as comfort and price +* `shuttles.xlsx` is a set of attributes for spacecraft across the fleet, such as their engine type and passenger capacity + +The spaceflights starter has already added the datasets to the `data/01_raw` folder of your project. + +## Dataset registration + +The following information about a dataset must be registered before Kedro can load it: + +* File location (path) +* Parameters for the given dataset +* Type of data +* Versioning + +Open `conf/base/catalog.yml` for the spaceflights project to inspect the contents.
The two `csv` datasets are registered as follows: + +
    +Click to expand + +```yaml +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv + +reviews: + type: pandas.CSVDataSet + filepath: data/01_raw/reviews.csv +``` +
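If it helps to see what this YAML maps onto, the sketch below constructs the same two entries programmatically with Kedro's `DataCatalog` API. It is purely illustrative; the tutorial itself keeps all registration in `catalog.yml`:

```python
# Illustrative sketch only -- the tutorial registers datasets in catalog.yml.
from kedro.io import DataCatalog
from kedro_datasets.pandas import CSVDataSet

catalog = DataCatalog(
    {
        "companies": CSVDataSet(filepath="data/01_raw/companies.csv"),
        "reviews": CSVDataSet(filepath="data/01_raw/reviews.csv"),
    }
)
```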

    + +Likewise for the `xlsx` dataset: + +
    +Click to expand + +```yaml +shuttles: + type: pandas.ExcelDataSet + filepath: data/01_raw/shuttles.xlsx + load_args: + engine: openpyxl # Use modern Excel engine (the default since Kedro 0.18.0) +``` +

+ +The additional line, `load_args`, is passed to the Excel read method (`pd.read_excel`) as [keyword arguments](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html). Although not specified here, the equivalent for saving is `save_args`, whose value would be passed to the [`pd.DataFrame.to_excel` method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html). + +### Test that Kedro can load the data + +Open a `kedro ipython` session in your terminal from the project root directory: + +```bash +kedro ipython +``` + +Then type the following into the IPython prompt to load some `csv` data: + +```python +companies = catalog.load("companies") +companies.head() +``` + +* The first command creates a variable (`companies`), which is of type `pandas.DataFrame`, and loads the dataset (also named `companies`, as per the top-level key in `catalog.yml`) from the underlying filepath `data/01_raw/companies.csv`. +* The `head` method from `pandas` displays the first five rows of the DataFrame. +
    +Click to expand + +``` +INFO Loading data from 'companies' (CSVDataSet) +Out[1]: + id company_rating company_location total_fleet_count iata_approved +0 35029 100% Niue 4.0 f +1 30292 67% Anguilla 6.0 f +2 19032 67% Russian Federation 4.0 f +3 8238 91% Barbados 15.0 t +4 30342 NaN Sao Tome and Principe 2.0 t + +``` +
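Because `companies` is an ordinary pandas DataFrame at this point, you can optionally apply any of the usual pandas inspection methods to it, for example:

```python
companies.info()   # column names, non-null counts and dtypes
companies.shape    # (number of rows, number of columns)
```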

    + +Similarly, to test that the `xlsx` data is loaded as expected: + +```python +shuttles = catalog.load("shuttles") +shuttles.head() +``` + +You should see output such as the following: + +
    +Click to expand + +``` +INFO Loading data from 'shuttles' (ExcelDataSet) +Out[1]: + id shuttle_location shuttle_type engine_type ... d_check_complete moon_clearance_complete price company_id +0 63561 Niue Type V5 Quantum ... f f $1,325.0 35029 +1 36260 Anguilla Type V5 Quantum ... t f $1,780.0 30292 +2 57015 Russian Federation Type V5 Quantum ... f f $1,715.0 19032 +3 14035 Barbados Type V5 Plasma ... f f $4,770.0 8238 +4 10036 Sao Tome and Principe Type V2 Plasma ... f f $2,820.0 30342 + +``` +
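Still inside the same `kedro ipython` session, you can also ask the Data Catalog which datasets it knows about: `catalog.list()` returns the names of every registered entry.

```python
catalog.list()
# e.g. ['companies', 'reviews', 'shuttles', 'parameters', ...]
```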

    + +When you have finished, close `ipython` session with `exit()`. + +## Further information + +### Custom data + +[Kedro supports numerous datasets](/kedro_datasets) out of the box, but you can also add support for any proprietary data format or filesystem. + +You can find further information about [how to add support for custom datasets](../extend_kedro/custom_datasets.md) in specific documentation covering advanced usage. + +### Supported data locations + +Kedro uses [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to read data from a variety of data stores including local file systems, network file systems, HDFS, and all of the widely-used cloud object stores. diff --git a/docs/source/tutorial/spaceflights_tutorial.md b/docs/source/tutorial/spaceflights_tutorial.md new file mode 100644 index 0000000000..0a65d0369b --- /dev/null +++ b/docs/source/tutorial/spaceflights_tutorial.md @@ -0,0 +1,66 @@ +# Next steps: Tutorial + +In this tutorial, we construct nodes and pipelines for a price-prediction model to illustrate the steps of a typical Kedro workflow. + +The tutorial takes approximately **30 minutes** to complete. You will work in the terminal and by inspecting project files in an IDE or text editor. There is no Jupyter notebook for the project. + +*It is 2160, and the space tourism industry is booming. Globally, thousands of space shuttle companies take tourists to the Moon and back. You have been able to source data that lists the amenities offered in each space shuttle, customer reviews, and company information.* + +***Project***: *You want to construct a model that predicts the price for each trip to the Moon and the corresponding return flight.* + +```{toctree} +:maxdepth: 1 + +tutorial_template +set_up_data +create_a_pipeline +add_another_pipeline +package_a_project +spaceflights_tutorial_faqs +``` + + +![](../meta/images/moon-rocket.png) + +Photo by Ivan Diaz on Unsplash + + +## Get help +If you hit an issue with the tutorial: + +* Check the [spaceflights tutorial FAQ](spaceflights_tutorial_faqs.md) to see if we have answered the question already. +* Use [Kedro-Viz](../visualisation/kedro-viz_visualisation) to visualise your project to better understand how the datasets, nodes and pipelines fit together. +* Use the [#questions channel](https://slack.kedro.org/) on our Slack channel to ask the community for help. +* Search the [searchable archive of Slack discussions](https://www.linen.dev/s/kedro). + +## Terminology + +We explain any Kedro-specific terminology as we introduce it, and further information can be found in the [glossary](../resources/glossary.md). Some additional terminology may not be familiar to some readers, such as the concepts below. + +### Project root directory +Also known as the "root directory", this is the parent folder for the entire project. It is the top-level folder that contains all other files and directories associated with the project. + +### Dependencies +These are Python packages or libraries that an individual project depends upon to complete a task. For example, the Spaceflights tutorial project depends on the [scikit-learn](https://scikit-learn.org/stable/) library. + +### Standard development workflow +When you build a Kedro project, you will typically follow a standard development workflow: + +1. Set up the project template + + * Create a new project and install project dependencies. + * Configure credentials and any other sensitive/personal content, and logging + +2. 
Set up the data + + * Add data to the `data` folder + * Reference all datasets for the project + +3. Create the pipeline + + * Construct nodes to make up the pipeline + * Choose how to run the pipeline: sequentially or in parallel + +4. Package the project + * Build the project documentation + * Package the project for distribution diff --git a/docs/source/tutorial/spaceflights_tutorial_faqs.md b/docs/source/tutorial/spaceflights_tutorial_faqs.md new file mode 100644 index 0000000000..92d873dcb9 --- /dev/null +++ b/docs/source/tutorial/spaceflights_tutorial_faqs.md @@ -0,0 +1,76 @@ +# Spaceflights tutorial FAQs + +```{note} +If you can't find the answer you need here, [ask the Kedro community for help](https://slack.kedro.org)! +``` + +## How do I resolve these common errors? + +### DataSet errors +#### DataSetError: Failed while loading data from data set +You're [testing whether Kedro can load the raw test data](./set_up_data.md#test-that-kedro-can-load-the-data) and see the following: + +```python +DataSetError: Failed while loading data from data set +CSVDataSet(filepath=...). +[Errno 2] No such file or directory: '.../companies.csv' +``` + +or a similar error for the `shuttles` or `reviews` data. + +Are the [three sample data files](./set_up_data.md#project-datasets) stored in the `data/01_raw` folder? + +#### DataSetNotFoundError: DataSet not found in the catalog + +You see an error such as the following: + +```python +DataSetNotFoundError: DataSet 'companies' not found in the catalog +``` + +Has something changed in your `catalog.yml` from the version generated by the spaceflights starter? Take a look at the [data specification](./set_up_data.md#dataset-registration) to ensure it is valid. + + +Call `exit()` within the IPython session and restart `kedro ipython` (or type `%reload_kedro` into the IPython console to reload Kedro into the session without restarting). Then try again. + + +#### DataSetError: An exception occurred when parsing config for DataSet + +Are you seeing a message saying that an exception occurred? + +```bash +DataSetError: An exception occurred when parsing config for DataSet +'data_processing.preprocessed_companies': +Object 'ParquetDataSet' cannot be loaded from 'kedro_datasets.pandas'. Please see the +documentation on how to install relevant dependencies for kedro_datasets.pandas.ParquetDataSet: +https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html +``` + +The Kedro Data Catalog is missing [dependencies needed to parse the data](../kedro_project_setup/dependencies.md#install-dependencies-related-to-the-data-catalog). Check that you have [added all the project dependencies to `requirements.txt`](./tutorial_template.md#install-project-dependencies) and then call `pip install -r src/requirements.txt` to install them. + +### Pipeline run + +To successfully run the pipeline, all required input datasets must already exist, otherwise you may get an error similar to this: + + +```bash +kedro run --pipeline=data_science + +2019-10-04 12:36:12,158 - kedro.io.data_catalog - INFO - Loading data from `model_input_table` (CSVDataSet)... +2019-10-04 12:36:12,158 - kedro.runner.sequential_runner - WARNING - There are 3 nodes that have not run. +You can resume the pipeline run with the following command: +kedro run +Traceback (most recent call last): + ...
+ File "pandas/_libs/parsers.pyx", line 382, in pandas._libs.parsers.TextReader.__cinit__ + File "pandas/_libs/parsers.pyx", line 689, in pandas._libs.parsers.TextReader._setup_parser_source +FileNotFoundError: [Errno 2] File b'data/03_primary/model_input_table.csv' does not exist: b'data/03_primary/model_input_table.csv' + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + ... + raise DataSetError(message) from exc +kedro.io.core.DataSetError: Failed while loading data from data set CSVDataSet(filepath=data/03_primary/model_input_table.csv, save_args={'index': False}). +[Errno 2] File b'data/03_primary/model_input_table.csv' does not exist: b'data/03_primary/model_input_table.csv' +``` diff --git a/docs/source/tutorial/tutorial_template.md b/docs/source/tutorial/tutorial_template.md new file mode 100644 index 0000000000..af23f15cfd --- /dev/null +++ b/docs/source/tutorial/tutorial_template.md @@ -0,0 +1,83 @@ +# Set up the spaceflights project + +This section shows how to create a new project (with `kedro new` using the [Kedro spaceflights starter](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights)) and install project dependencies (with `pip install -r src/requirements.txt`). + +## Create a new project + +[Set up Kedro](../get_started/install.md) if you have not already done so. + +```{important} +We recommend that you use the same version of Kedro that was most recently used to test this tutorial (0.18.6). To check the version installed, type `kedro -V` in your terminal window. +``` + +In your terminal, navigate to the folder you want to store the project. Type the following to generate the project from the [Kedro spaceflights starter](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights). The project will be populated with a complete set of working example code: + +```bash +kedro new --starter=spaceflights +``` + +When prompted for a project name, you should accept the default choice (`Spaceflights`) as the rest of this tutorial assumes that project name. + +When Kedro has created the project, navigate to the [project root directory](./spaceflights_tutorial.md#project-root-directory): + +```bash +cd spaceflights +``` + +## Install project dependencies + +Kedro projects have a `requirements.txt` file to specify their dependencies and enable sharable projects by ensuring consistency across Python packages and versions. 
+ +The spaceflights project dependencies are stored in `src/requirements.txt`(you may find that the versions differ slightly depending on the version of Kedro): + +```text +# code quality packages +black==22.0 +flake8>=3.7.9, <5.0 +ipython>=7.31.1, <8.0 +isort~=5.0 +nbstripout~=0.4 + +# notebook tooling +jupyter~=1.0 +jupyterlab~=3.0 +jupyterlab_server>=2.11.1, <2.16.0 + +# Pytest + useful extensions +pytest-cov~=3.0 +pytest-mock>=1.7.1, <2.0 +pytest~=7.2 + +# Kedro dependencies and datasets to work with different data formats (including CSV, Excel, and Parquet) +kedro~=0.18.10 +kedro-datasets[pandas.CSVDataSet, pandas.ExcelDataSet, pandas.ParquetDataSet]~=1.1 +kedro-telemetry~=0.2.0 +kedro-viz~=6.0 # Visualise pipelines + +# For modelling in the data science pipeline +scikit-learn~=1.0 +``` + +### Install the dependencies + +To install all the project-specific dependencies, run the following from the project root directory: + +```bash +pip install -r src/requirements.txt +``` + +## Optional: logging and configuration + +You might want to [set up logging](../logging/index.md) at this stage of the workflow, but we do not use it in this tutorial. + +You may also want to store credentials such as usernames and passwords if they are needed for specific data sources used by the project. + +To do this, add them to `conf/local/credentials.yml` (some examples are included in that file for illustration). + +### Configuration best practice to avoid leaking confidential data + +* Do not commit data to version control. +* Do not commit notebook output cells (data can easily sneak into notebooks when you don't delete output cells). +* Do not commit credentials in `conf/`. Use only the `conf/local/` folder for sensitive information like access credentials. + +You can find additional information in the [documentation on configuration](../configuration/configuration_basics.md). diff --git a/docs/source/visualisation/index.md b/docs/source/visualisation/index.md new file mode 100644 index 0000000000..fe0e197e22 --- /dev/null +++ b/docs/source/visualisation/index.md @@ -0,0 +1,18 @@ +# Visualisation with Kedro-Viz + + +[Kedro-Viz](https://github.com/kedro-org/kedro-viz) is a key part of Kedro. It visualises the pipelines in a Kedro project by showing data, nodes, and the connections between them. + +The Kedro-Viz package needs to be installed separately as it is not part of the standard Kedro installation: + +```bash +pip install kedro-viz +``` + +```{toctree} +:maxdepth: 1 + +kedro-viz_visualisation +preview_datasets +visualise_charts_with_plotly +``` diff --git a/docs/source/visualisation/kedro-viz_visualisation.md b/docs/source/visualisation/kedro-viz_visualisation.md new file mode 100644 index 0000000000..0f6e207508 --- /dev/null +++ b/docs/source/visualisation/kedro-viz_visualisation.md @@ -0,0 +1,166 @@ +# Visualise the spaceflights project + + +This section assumes you are familiar with the basic Kedro concepts described in the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md). If you have not yet worked through the tutorial, you can still follow this example. + +If you haven't installed Kedro [follow the documentation to get set up](../get_started/install.md). + +Then, in your terminal window, navigate to the folder you want to store the project. 
+ +Generate a copy of the spaceflights tutorial project with all the code in place by using the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): + +```bash +kedro new --starter=spaceflights +``` + +When prompted for a project name, you can enter anything, but we will assume `Spaceflights` throughout. + +When your project is ready, navigate to the root directory of the project and install the dependencies for the project, which include Kedro-Viz: + +```bash +pip install -r src/requirements.txt +``` + +The next step is optional, but useful to check that all is working. Run the full set of pipelines for the tutorial project: + +```bash +kedro run +``` + +To start Kedro-Viz, type the following into your terminal from the project directory: + +```bash +kedro viz +``` + +The command opens a browser tab to serve the visualisation at `http://127.0.0.1:4141/`. + +You should see the following: + +![](../meta/images/pipeline_visualisation.png) + +If a visualisation panel opens up and a pipeline is not visible, refresh the view, and check that your tutorial project code is complete if you've not generated it from the starter template. If you still don't see the visualisation, the Kedro community can help: + +* use the [#questions channel](https://slack.kedro.org/) on our Slack channel to ask the community for help +* search the [searchable archive of Slack discussions](https://www.linen.dev/s/kedro) + +To exit the visualisation, close the browser tab. To regain control of the terminal, enter `^+c` on Mac or `Ctrl+c` on Windows or Linux machines. + +## Automatic visualisation updates + +You can use the `--autoreload` flag to autoreload Kedro-Viz when a `Python` or `YAML` file changes in the project. Add the flag to the command you use to start Kedro-Viz: + +```bash +kedro viz --autoreload +``` + +![](../meta/images/kedro_viz_autoreload.gif) + +The `autoreload` flag reflects changes to the project as they happen. For example, commenting out `create_model_input_table_node` in `pipeline.py` will trigger a re-render of the pipeline: + +![autoreload](../meta/images/autoreload.gif) + +## Visualise layers + +By convention, a [pipeline can be defined as having different layers](../resources/glossary.md#layers-data-engineering-convention) according to how data is processed, which makes it easier to collaborate. + +For example, the [data engineering convention](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71) labels datasets according to the stage of the pipeline (e.g. whether the data has been cleaned). + +In Kedro version 0.18.9 we changed the way layers are defined in the Data Catalog. The definition is now included under the `metadata` key for `kedro-viz` (previously it was an attribute specified within a dataset's definition). + +Here's an example of how to use the Kedro-Viz metadata to define layers: + +```yaml +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv + metadata: + kedro-viz: + layer: raw +``` + +In earlier versions of Kedro, layers were specified within a dataset's definition in the Data Catalog, but this will **no longer be supported** from Kedro version 0.19.0. From that version onwards, your `catalog.yml` must specify layers as metadata. 
+ +```diff +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv +- layer: raw ++ metadata: ++ kedro-viz: ++ layer: raw +``` + +Open `catalog.yml` for the completed spaceflights tutorial and define layers in the following way: + +```yaml +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv + metadata: + kedro-viz: + layer: raw + +reviews: + type: pandas.CSVDataSet + filepath: data/01_raw/reviews.csv + metadata: + kedro-viz: + layer: raw + +shuttles: + type: pandas.ExcelDataSet + filepath: data/01_raw/shuttles.xlsx + metadata: + kedro-viz: + layer: raw + +preprocessed_companies: + type: pandas.ParquetDataSet + filepath: data/02_intermediate/preprocessed_companies.pq + metadata: + kedro-viz: + layer: intermediate + +preprocessed_shuttles: + type: pandas.ParquetDataSet + filepath: data/02_intermediate/preprocessed_shuttles.pq + metadata: + kedro-viz: + layer: intermediate + +model_input_table: + type: pandas.ParquetDataSet + filepath: data/03_primary/model_input_table.pq + metadata: + kedro-viz: + layer: primary + +regressor: + type: pickle.PickleDataSet + filepath: data/06_models/regressor.pickle + versioned: true + metadata: + kedro-viz: + layer: models +``` + +The visualisation now includes the layers: + +![](../meta/images/pipeline_visualisation_with_layers.png) + +## Share a pipeline visualisation + +You can share a the pipeline structure within a Kedro-Viz visualisation as a JSON file from the terminal: + +```bash +kedro viz --save-file=my_shareable_pipeline.json +``` + +This command will save a visualisation of the `__default__` pipeline as a JSON file called `my_shareable_pipeline.json`. It doesn't share data, such as that in the code panel, nor can you share images or charts. + +To visualise the shared file, type the following to load it from the terminal: + +```bash +kedro viz --load-file=my_shareable_pipeline.json +``` diff --git a/docs/source/visualisation/preview_datasets.md b/docs/source/visualisation/preview_datasets.md new file mode 100644 index 0000000000..f201bd7e17 --- /dev/null +++ b/docs/source/visualisation/preview_datasets.md @@ -0,0 +1,81 @@ +# Preview data in Kedro-Viz + +This page describes how to preview data from different datasets in a Kedro project with Kedro-Viz. Dataset preview was introduced in Kedro-Viz version 6.3.0, which offers preview for `CSVDatasets` and `ExcelDatasets`. + +We use the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) to demonstrate how to add data preview for the `customer`, `shuttle` and `reviews` datasets. Even if you have not yet worked through the tutorial, you can still follow this example; you'll need to use the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights) to generate a copy of the project with working code in place. + +If you haven't installed Kedro [follow the documentation to get set up](../get_started/install.md). + +Then, in your terminal window, navigate to the folder you want to store the project. + +Generate a copy of the spaceflights tutorial project with all the code in place by using the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): + +```bash +kedro new --starter=spaceflights +``` + +When prompted for a project name, you can enter anything, but we will assume `Spaceflights` throughout. + +When your project is ready, navigate to the root directory of the project. 
+ +## Configure the Data Catalog + +Kedro-Viz version 6.3.0 currently supports preview of two types of datasets: + +* `pandas.CSVDataset` +* `pandas.ExcelDataset` + + +To enable dataset preview, add the `preview_args` attribute to the kedro-viz configuration under the `metadata` section in the Data Catalog. Within preview_args, specify `nrows` as the number of rows to preview for the dataset. + +```yaml +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv + metadata: + kedro-viz: + layer: raw + preview_args: + nrows: 5 + +reviews: + type: pandas.CSVDataSet + filepath: data/01_raw/reviews.csv + metadata: + kedro-viz: + layer: raw + preview_args: + nrows: 10 + +shuttles: + type: pandas.ExcelDataSet + filepath: data/01_raw/shuttles.xlsx + metadata: + kedro-viz: + layer: raw + preview_args: + nrows: 15 +``` + + + +## Previewing Data on Kedro-viz + +After you've configured the Data Catalog, you can preview the datasets on Kedro-Viz. Start Kedro-Viz by running the following command in your terminal: + +```bash +kedro viz +``` + +The previews are shown as follows: + +Click on each dataset node to see a small preview in the metadata panel: + + +![](../meta/images/preview_datasets_metadata.png) + + +View the larger preview of the dataset by clicking the `Expand Preview Table` button on the bottom of the metadata panel. + + +![](../meta/images/preview_datasets_expanded.png) diff --git a/docs/source/visualisation/visualise_charts_with_plotly.md b/docs/source/visualisation/visualise_charts_with_plotly.md new file mode 100644 index 0000000000..5b14d2c635 --- /dev/null +++ b/docs/source/visualisation/visualise_charts_with_plotly.md @@ -0,0 +1,252 @@ +# Visualise charts in Kedro-Viz + +This page describes how to make interactive visualisations of a Kedro project with Kedro-Viz, which supports integration with [Plotly](https://plotly.com/python/) and [Matplotlib](https://matplotlib.org/). + +## Visualisation with Plotly + +We use the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) and add a reporting pipeline that uses Plotly. Even if you have not yet worked through the tutorial, you can still follow this example; you'll need to use the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights) to generate a copy of the project with working code in place. + +If you haven't installed Kedro [follow the documentation to get set up](../get_started/install.md). + +Then, in your terminal window, navigate to the folder you want to store the project. + +Generate a copy of the spaceflights tutorial project with all the code in place by using the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): + +```bash +kedro new --starter=spaceflights +``` + +When prompted for a project name, you can enter anything, but we will assume `Spaceflights` throughout. + +When your project is ready, navigate to the root directory of the project. 
+ +### Update the dependencies + +There are two types of Plotly datasets supported by Kedro: + +* `plotly.PlotlyDataSet` which only supports [Plotly Express](https://plotly.com/python/plotly-express) +* `plotly.JSONDataSet` which supports Plotly Express and [Plotly Graph Objects](https://plotly.com/python/graph-objects/) + +To use the Plotly datasets, you must update the `requirements.txt` file in the `src` folder of the Kedro project to add the following dependencies: + + +```text +kedro-datasets[pandas.CSVDataSet, pandas.ExcelDataSet, pandas.ParquetDataSet]~=1.1 +kedro-datasets[plotly.PlotlyDataSet, plotly.JSONDataSet]~=1.1 +``` + +Navigate to the root directory of the project in your terminal and install the dependencies for the tutorial project: + +```bash +pip install -r src/requirements.txt +``` + +### Configure the Data Catalog + +To use the datasets, add them to the Data Catalog by updating `conf/base/catalog.yml`: + +```yaml +shuttle_passenger_capacity_plot_exp: + type: plotly.PlotlyDataSet + filepath: data/08_reporting/shuttle_passenger_capacity_plot_exp.json + versioned: true + plotly_args: + type: bar + fig: + x: shuttle_type + y: passenger_capacity + orientation: h + layout: + xaxis_title: Shuttles + yaxis_title: Average passenger capacity + title: Shuttle Passenger capacity + +shuttle_passenger_capacity_plot_go: + type: plotly.JSONDataSet + filepath: data/08_reporting/shuttle_passenger_capacity_plot_go.json + versioned: true +``` + + +### Create the template reporting pipeline + +In the terminal, run the following command to generate a template for the reporting pipeline: + +```bash +kedro pipeline create reporting +``` + +### Add the Plotly reporting nodes + +Add the following to `src/spaceflights/pipelines/reporting/nodes.py`: + +```python +import plotly.express as px +import plotly.graph_objs as go +import pandas as pd + +# This function uses plotly.express +def compare_passenger_capacity_exp(preprocessed_shuttles: pd.DataFrame): + return ( + preprocessed_shuttles.groupby(["shuttle_type"]) + .mean(numeric_only=True) + .reset_index() + ) + + +# This function uses plotly.graph_objects +def compare_passenger_capacity_go(preprocessed_shuttles: pd.DataFrame): + + data_frame = ( + preprocessed_shuttles.groupby(["shuttle_type"]) + .mean(numeric_only=True) + .reset_index() + ) + fig = go.Figure( + [ + go.Bar( + x=data_frame["shuttle_type"], + y=data_frame["passenger_capacity"], + ) + ] + ) + + return fig +``` + +### Update the reporting pipeline code + +Update `src/spaceflights/pipelines/reporting/pipeline.py` to replace the existing code with the following: + +```python +from kedro.pipeline import Pipeline, node, pipeline +from .nodes import compare_passenger_capacity_exp, compare_passenger_capacity_go + + +def create_pipeline(**kwargs) -> Pipeline: + """This is a simple pipeline which generates a pair of plots""" + return pipeline( + [ + node( + func=compare_passenger_capacity_exp, + inputs="preprocessed_shuttles", + outputs="shuttle_passenger_capacity_plot_exp", + ), + node( + func=compare_passenger_capacity_go, + inputs="preprocessed_shuttles", + outputs="shuttle_passenger_capacity_plot_go", + ), + ] + ) +``` + + +### Run the pipeline + +Now run the pipelines: + +```bash +kedro run +``` + +Then visualise with `kedro viz` + +The generated charts are shown as follows: + +![](../meta/images/chart-icon.png). 
+ +Click on each of see a small preview in the metadata panel: + +![](../meta/images/pipeline_visualisation_plotly_1.png) + +View the larger visualisation of the chart by clicking the 'Expand Plotly Visualisation' button on the bottom of the metadata panel. + +![](../meta/images/pipeline_visualisation_plotly_expand_1.png) + + +## Visualisation with Matplotlib + +Integrating Matplotlib into Kedro-Viz allows you to output charts as part of pipeline visualisation. + +```{note} +The MatplotlibWriter dataset converts Matplotlib objects to image files. This means that Matplotlib charts within Kedro-Viz are static and not interactive, unlike the Plotly charts seen above. +``` + +You can view Matplotlib charts in Kedro-Viz when you use the [Kedro MatplotLibWriter dataset](/kedro_datasets.matplotlib.MatplotlibWriter). + +### Update the dependencies + +You must update the `src/requirements.txt` file in the Kedro project by adding the following dataset to enable Matplotlib for the project: + +```bash +kedro-datasets[matplotlib.MatplotlibWriter]~=1.1 +seaborn~=0.12.1 +``` + +### Configure the Data Catalog +You must also specify the output type in the `catalog.yml` file for the Data Catalog: + +```yaml +dummy_confusion_matrix: + type: matplotlib.MatplotlibWriter + filepath: data/08_reporting/dummy_confusion_matrix.png + versioned: true +``` + +### Add another node +Add the following to `src/spaceflights/pipelines/reporting/nodes.py`: + +```python +import matplotlib.pyplot as plt +import seaborn as sn + +... + + +def create_confusion_matrix(companies: pd.DataFrame): + actuals = [0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1] + predicted = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1] + data = {"y_Actual": actuals, "y_Predicted": predicted} + df = pd.DataFrame(data, columns=["y_Actual", "y_Predicted"]) + confusion_matrix = pd.crosstab( + df["y_Actual"], df["y_Predicted"], rownames=["Actual"], colnames=["Predicted"] + ) + sn.heatmap(confusion_matrix, annot=True) + return plt +``` + +### Update the pipeline + +Update `src/spaceflights/pipelines/reporting/pipeline.py` to add the following to `create_pipeline`: + +```python +from .nodes import create_confusion_matrix + +... + + +def create_pipeline(**kwargs) -> Pipeline: + """This is a simple pipeline which generates a plot""" + return pipeline( + [ + node( + func=create_confusion_matrix, + inputs="companies", + outputs="dummy_confusion_matrix", + ), + ] + ) +``` + +### Run the pipeline + +Run the pipelines with `kedro run` and then visualise the result with `kedro viz`. + +Click to see a small preview of the Matplotlib image in the metadata panel. + +![](../meta/images/pipeline_visualisation_matplotlib.png) + +View the larger visualisation of the chart by clicking the 'Expand Matplotlib Image' button on the bottom of the metadata panel. + +![](../meta/images/pipeline_visualisation_matplotlib_expand.png) diff --git a/features/activate_nbstripout.feature b/features/activate_nbstripout.feature index 0b1ad7fcd9..fa221417d4 100644 --- a/features/activate_nbstripout.feature +++ b/features/activate_nbstripout.feature @@ -1,36 +1,8 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -# Feature: Activate_nbstripout target in new project Scenario: Check nbstripout git post commit hook functionality Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" And I have added a test jupyter notebook And I have initialized a git repository And I have added the project directory to staging diff --git a/features/build_docs.feature b/features/build_docs.feature index 8c526b9e6d..c9f9307ef1 100644 --- a/features/build_docs.feature +++ b/features/build_docs.feature @@ -1,40 +1,11 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - Feature: build-docs target in new project @fresh_venv Scenario: Execute build-docs target Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" And I have updated kedro requirements - And I have executed the kedro command "install" + And I have installed the project dependencies When I execute the kedro command "build-docs" Then I should get a successful exit code And docs should be generated diff --git a/features/build_reqs.feature b/features/build_reqs.feature index 8a09d84730..085cab2242 100644 --- a/features/build_reqs.feature +++ b/features/build_reqs.feature @@ -1,38 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -# - Feature: build-reqs target in new project @fresh_venv Scenario: Execute build-reqs target Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" And I have updated kedro requirements And I have executed the kedro command "build-reqs" When I add scrapy>=1.7.3 to the requirements diff --git a/features/environment.py b/features/environment.py index 5f8463ff7e..172dfd006a 100644 --- a/features/environment.py +++ b/features/environment.py @@ -1,44 +1,16 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """Behave environment setup commands.""" -# pylint: disable=unused-argument +# noqa: unused-argument +from __future__ import annotations import os import shutil -import sys import tempfile import venv from pathlib import Path -from typing import Set from features.steps.sh_run import run -_PATHS_TO_REMOVE = set() # type: Set[Path] +_PATHS_TO_REMOVE: set[Path] = set() FRESH_VENV_TAG = "fresh_venv" @@ -78,23 +50,20 @@ def _setup_context_with_venv(context, venv_dir): # this is because exe resolution in subprocess doesn't respect a passed env if os.name == "posix": bin_dir = context.venv_dir / "bin" - path_sep = ":" else: bin_dir = context.venv_dir / "Scripts" - path_sep = ";" context.bin_dir = bin_dir context.pip = str(bin_dir / "pip") context.python = str(bin_dir / "python") context.kedro = str(bin_dir / "kedro") - context.requirements_path = Path("requirements.txt").resolve() # clone the environment, remove any condas and venvs and insert our venv context.env = os.environ.copy() - path = context.env["PATH"].split(path_sep) + path = context.env["PATH"].split(os.pathsep) path = [p for p in path if not (Path(p).parent / "pyvenv.cfg").is_file()] path = [p for p in path if not (Path(p).parent / "conda-meta").is_dir()] path = [str(bin_dir)] + path - context.env["PATH"] = path_sep.join(path) + context.env["PATH"] = os.pathsep.join(path) # Create an empty pip.conf file and point pip to it pip_conf_path = context.venv_dir / "pip.conf" @@ -134,13 +103,15 @@ def _setup_minimal_env(context): "pip", "install", "-U", - "pip>=20.0", - "setuptools>=38.0", + # pip==23.2 breaks pip-tools<7.0, and pip-tools>=7.0 does not support Python 3.7 + "pip>=21.2,<23.2; python_version < '3.8'", + "pip>=21.2; python_version >= '3.8'", + "setuptools>=65.5.1", "wheel", - ".", ], env=context.env, ) + call([context.python, "-m", "pip", "install", "."], env=context.env) return context @@ -149,18 +120,10 @@ def _install_project_requirements(context): Path( "kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt" ) - .read_text() + .read_text(encoding="utf-8") .splitlines() ) install_reqs = [req for req in install_reqs if "{" not in req] install_reqs.append(".[pandas.CSVDataSet]") - - # JupyterLab indirectly depends on pywin32 on Windows. Newer versions of pywin32 - # (e.g. 3xx, to which jupyterlab~=3.0 defaults) have a bug that prevents - # JupyterLab from running, hence the version is forcefully set to 225. 
- # More details: https://github.com/mhammond/pywin32/issues/1431 - if sys.platform.startswith("win"): - install_reqs.append("pywin32==225") - call([context.pip, "install", *install_reqs], env=context.env) return context diff --git a/features/info.feature b/features/info.feature new file mode 100644 index 0000000000..a4adc3eab3 --- /dev/null +++ b/features/info.feature @@ -0,0 +1,10 @@ +Feature: Run kedro info + Background: + Given I have prepared a config file + And I have run a non-interactive kedro new with starter "default" + + Scenario: Plugins are installed and detected by kedro info + Given I have installed the test plugin + When I execute the kedro command "info" + Then I should get a successful exit code + And I should get a message including "plugin: 0.1" diff --git a/features/install.feature b/features/install.feature deleted file mode 100644 index c5187bff4f..0000000000 --- a/features/install.feature +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -# - -Feature: install target in new project - Background: - Given I have prepared a config file - And I have run a non-interactive kedro new with starter - And I have updated kedro requirements - Then src/requirements.in must not exist - - @fresh_venv - Scenario: Execute install target - When I execute the kedro command "install" - Then I should get a successful exit code - And src/requirements.in file must exist - - @fresh_venv - Scenario: Execute install target without compiled requirements - When I execute the kedro command "install --no-build-reqs" - Then I should get a successful exit code - And src/requirements.in must not exist diff --git a/features/ipython.feature b/features/ipython.feature index 977020f85d..c42dd5daaa 100644 --- a/features/ipython.feature +++ b/features/ipython.feature @@ -1,37 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - Feature: IPython target in new project Scenario: Execute ipython target Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" When I execute the kedro command "ipython" Then I should get a message including "An enhanced Interactive Python" - And I should get a message including "INFO - ** Kedro project project-dummy" - And I should get a message including "INFO - Defined global variable `context`, `session` and `catalog`" + And I should get a message including "Kedro project project-dummy" + And I should get a message including "Defined global variable 'context', 'session', 'catalog' and 'pipelines'" diff --git a/features/jupyter.feature b/features/jupyter.feature index 096e5de2c4..65b5173442 100644 --- a/features/jupyter.feature +++ b/features/jupyter.feature @@ -1,50 +1,27 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
-# -# See the License for the specific language governing permissions and -# limitations under the License. -# - Feature: Jupyter targets in new project Background: Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" + + Scenario: Execute jupyter setup target + When I execute the kedro command "jupyter setup" + Then I should get a message including "The kernel has been created successfully at" - Scenario: Execute jupyter-notebook target + Scenario: Execute jupyter notebook target When I execute the kedro jupyter command "notebook --no-browser" + Then I wait for the jupyter webserver to run for up to "120" seconds Then jupyter notebook should run on port 8888 - Scenario: Execute jupyter-lab target + Scenario: Execute jupyter lab target When I execute the kedro jupyter command "lab --no-browser" + Then I wait for the jupyter webserver to run for up to "120" seconds Then Jupyter Lab should run on port 8888 Scenario: Execute node convert into Python files Given I have added a test jupyter notebook When I execute the test jupyter notebook and save changes And I execute the kedro jupyter command "convert --all" - And Wait until the process is finished + And Wait until the process is finished for up to "120" seconds Then I should get a successful exit code And Code cell with node tag should be converted into kedro node diff --git a/features/load_context.feature b/features/load_context.feature index a239e437b6..7930cf2b8b 100644 --- a/features/load_context.feature +++ b/features/load_context.feature @@ -1,36 +1,7 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- - Feature: Custom Kedro project Background: Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" Scenario: Update the source directory to be nested When I move the package to "src/nested" @@ -44,18 +15,24 @@ Feature: Custom Kedro project And I execute the kedro command "run" Then I should get a successful exit code - Scenario: Hooks from installed plugins are automatically registered + Scenario: Hooks from installed plugins are automatically registered and work with the default runner Given I have installed the test plugin When I execute the kedro command "run" Then I should get a successful exit code - And I should get a message including "Registered hooks from 1 installed plugin(s): test-plugin-0.1" And I should get a message including "Reached after_catalog_created hook" + # The below line has been changed to DEBUG level. Currently, it's not possible to show + # this message anymore because it's logged before `session._setup_logging` is called. + # It is yet to be determined if we should keep it this way, so leaving this here until + # we have more clarity on the necessity of these logging messages. + # And I should get a message including "Registered hooks from 1 installed plugin(s): test-plugin-0.1" - Scenario: Pipelines from installed plugins are added to the project's pipelines + Scenario: Hooks from installed plugins are automatically registered and work with the parallel runner Given I have installed the test plugin - When I execute the kedro command "run --pipeline from_plugin" + When I execute the kedro command "run --runner=ParallelRunner" Then I should get a successful exit code - And I should get a message including "Registered hooks from 1 installed plugin(s): test-plugin-0.1" + And I should get a message including "Reached after_catalog_created hook" + # See explanation in test above. + # And I should get a message including "Registered hooks from 1 installed plugin(s): test-plugin-0.1" Scenario: Disable automatically registered plugin hooks Given I have installed the test plugin @@ -64,4 +41,5 @@ Feature: Custom Kedro project Then I should get a successful exit code And I should not get a message including "Registered hooks from 1 installed plugin(s): test-plugin-0.1" And I should not get a message including "Reached after_catalog_created hook" - And I should get a message including "Hooks are disabled for plugin(s): test-plugin-0.1" + # See explanation in test above. + # And I should get a message including "Hooks are disabled for plugin(s): test-plugin-0.1" diff --git a/features/micropkg.feature b/features/micropkg.feature new file mode 100644 index 0000000000..d8acc45138 --- /dev/null +++ b/features/micropkg.feature @@ -0,0 +1,19 @@ +Feature: Micro-package target in new project + + Background: + Given I have prepared a config file + And I have run a non-interactive kedro new with starter "default" + And I have installed the project dependencies + + @fresh_venv + Scenario: Package a micro-package + When I execute the kedro command "micropkg package pipelines.data_science" + Then I should get a successful exit code + And I should get a message including "'project_dummy.pipelines.data_science' packaged!" 
+ + @fresh_venv + Scenario: Package a micro-package from manifest + Given I have micro-packaging settings in pyproject.toml + When I execute the kedro command "micropkg package --all" + Then I should get a successful exit code + And I should get a message including "Packaged 'pipelines.data_science' micro-package!" diff --git a/features/new.feature b/features/new.feature new file mode 100644 index 0000000000..4dc40f04e0 --- /dev/null +++ b/features/new.feature @@ -0,0 +1,16 @@ +Feature: New Kedro project + Background: + Given I have prepared a config file + + Scenario: Create a new kedro project without example code + When I run a non-interactive kedro new without starter + Then the expected project directories and files should be created + + Scenario: Create a new kedro project with example code + When I run a non-interactive kedro new with starter "default" + Then the expected project directories and files should be created + + Scenario: Plugins are installed and create a new kedro project with custom plugin starter + Given I have installed the test plugin + When I run a non-interactive kedro new with starter "test_plugin_starter" + Then the expected project directories and files should be created diff --git a/features/new_project.feature b/features/new_project.feature deleted file mode 100644 index 1418bf0b25..0000000000 --- a/features/new_project.feature +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -Feature: New Kedro project - - Scenario: Create a new kedro project without example code - Given I have prepared a config file - When I run a non-interactive kedro new without starter - Then the expected project directories and files should be created - And the pipeline should contain no nodes - - Scenario: Create a new kedro project with example code - Given I have prepared a config file - When I run a non-interactive kedro new with starter - Then the expected project directories and files should be created - And the pipeline should contain nodes diff --git a/features/package.feature b/features/package.feature index 5858cae26f..21873e775c 100644 --- a/features/package.feature +++ b/features/package.feature @@ -1,37 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - Feature: Package target in new project Background: Given I have prepared a config file - And I have run a non-interactive kedro new with starter - And I have executed the kedro command "install --no-build-reqs" + And I have run a non-interactive kedro new with starter "default" + And I have installed the project dependencies @fresh_venv Scenario: Install package diff --git a/features/run.feature b/features/run.feature index 72ffc4e0e4..1f07e29ad6 100644 --- a/features/run.feature +++ b/features/run.feature @@ -1,32 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - - Feature: Run Project @@ -35,20 +6,20 @@ Feature: Run Project Local environment should be used by default when no env option is specified. Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" When I execute the kedro command "run" Then I should get a successful exit code - And the console log should show that 4 nodes were run + And the logs should show that 4 nodes were run Scenario: Run parallel runner with default python entry point with example code Given I have prepared a config file - And I have run a non-interactive kedro new with starter - When I execute the kedro command "run --parallel" + And I have run a non-interactive kedro new with starter "default" + When I execute the kedro command "run --runner=ParallelRunner" Then I should get a successful exit code - And the console log should show that "split_data" was run - And the console log should show that "train_model" was run - And the console log should show that "predict" was run - And the console log should show that "report_accuracy" was run + And the logs should show that "split_data" was run + And the logs should show that "train_model" was run + And the logs should show that "predict" was run + And the logs should show that "report_accuracy" was run Scenario: Run default python entry point without example code Given I have prepared a config file @@ -59,23 +30,33 @@ Feature: Run Project Scenario: Run kedro run with config file Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" And I have prepared a run_config file with config options When I execute the kedro command "run --config run_config.yml" Then I should get a successful exit code - And the console log should show that 1 nodes were run + And the logs should show that 1 nodes were run + + Scenario: Run kedro run with config from archive and OmegaConfigLoader + Given I have prepared a config file + And I have run a non-interactive kedro new with starter "default" + And I have set the OmegaConfigLoader in settings + When I execute the kedro command "package" + Then I should get a successful exit code + When I execute the kedro command "run --conf-source dist/conf-project_dummy.tar.gz" + Then I should get a successful exit code + And the logs should show that 4 nodes were run Scenario: Run kedro run with config file and override option Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" And I have prepared a run_config file with config options When I execute the kedro command "run --config run_config.yml --pipeline __default__" Then I should get a successful exit code - And the console 
log should show that 4 nodes were run + And the logs should show that 4 nodes were run Scenario: Run kedro run with extra parameters Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" When I execute the kedro command "run --params extra1:1,extra2:value2" Then I should get a successful exit code - And the console log should show that 4 nodes were run + And the logs should show that 4 nodes were run diff --git a/features/starter.feature b/features/starter.feature new file mode 100644 index 0000000000..2449324846 --- /dev/null +++ b/features/starter.feature @@ -0,0 +1,9 @@ +Feature: List Kedro Starters + + Scenario: List all starters with custom starters from plugin + Given I have prepared a config file + And I have installed the test plugin + And I have run a non-interactive kedro new with starter "default" + When I execute the kedro command "starter list" + Then I should get a successful exit code + And I should get a message including "test_plugin_starter" diff --git a/features/steps/cli_steps.py b/features/steps/cli_steps.py index 0e52c1a1c8..a4b2096fdd 100644 --- a/features/steps/cli_steps.py +++ b/features/steps/cli_steps.py @@ -1,37 +1,8 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- """Behave step definitions for the cli_scenarios feature.""" -import itertools import json -import shlex import shutil +import textwrap from pathlib import Path from time import time @@ -40,6 +11,7 @@ import toml import yaml from behave import given, then, when +from packaging.requirements import Requirement import kedro from features.steps import util @@ -157,7 +129,6 @@ def _check_service_up(context: behave.runner.Context, url: str, string: str): """ response = requests.get(url, timeout=1.0) response.raise_for_status() - data = response.text assert string in data assert context.result.poll() is None @@ -189,12 +160,15 @@ def create_config_file(context): yaml.dump(config, config_file, default_flow_style=False) -@given('I have executed the kedro command "{command}"') -def exec_kedro_target_checked(context, command): - """Execute Kedro command and check the status.""" - cmd = [context.kedro] + command.split() - - res = run(cmd, env=context.env, cwd=str(context.root_project_dir)) +@given("I have installed the project dependencies") +def pip_install_dependencies(context): + """Install project dependencies using pip.""" + reqs_path = "src/requirements.txt" + res = run( + [context.pip, "install", "-r", reqs_path], + env=context.env, + cwd=str(context.root_project_dir), + ) if res.returncode != OK_EXIT_CODE: print(res.stdout) @@ -202,48 +176,6 @@ def exec_kedro_target_checked(context, command): assert False -@given('I have created new environment "{}"') -def create_new_env(context, env_name): - env_path = context.root_project_dir / "conf" / env_name - env_path.mkdir() - - for config_name in ("catalog", "parameters", "credentials"): - path = env_path / f"{config_name}.yml" - with path.open("w") as config_file: - yaml.dump({}, config_file, default_flow_style=False) - - # overwrite the log level for anyconfig from WARNING to INFO - logging_path = env_path / "logging.yml" - logging_json = { - "loggers": { - "anyconfig": { - "level": "INFO", - "handlers": ["console", "info_file_handler", "error_file_handler"], - "propagate": "no", - }, - "kedro.io": { - "level": "INFO", - "handlers": ["console", "info_file_handler", "error_file_handler"], - "propagate": "no", - }, - "kedro.pipeline": { - "level": "INFO", - "handlers": ["console", "info_file_handler", "error_file_handler"], - "propagate": "no", - }, - } - } - with logging_path.open("w") as config_file: - yaml.dump(logging_json, config_file, default_flow_style=False) - - -@given('the python package "{package}" has been uninstalled') -def uninstall_package_via_pip(context, package): - """Uninstall a python package using pip.""" - run([context.pip, "uninstall", "-y", package], env=context.env) - - -@given("I have installed the project's python package") @when("I install the project's python package") def install_project_package_via_pip(context): """Install a python package using pip.""" @@ -271,38 +203,57 @@ def disable_plugin_hooks(context, plugin): settings_file.write(to_add) +@given("I have set the OmegaConfigLoader in settings") +def use_omegaconfigloader(context): + """Set `config_loader_class` in `settings.py`.""" + settings_path = ( + context.root_project_dir / "src" / context.package_name / "settings.py" + ) + to_add = """\nfrom kedro.config import OmegaConfigLoader + \nCONFIG_LOADER_CLASS = OmegaConfigLoader""" + with settings_path.open("a") as settings_file: + settings_file.write(to_add) + + @given("I have initialized a git repository") def init_git_repo(context): """Init git repo""" with util.chdir(context.root_project_dir): check_run("git 
init") check_run("git config user.name 'Tester'") - check_run("git config user.email 'tester.kedro@quantumblack.com'") + check_run("git config user.email 'tester.kedro@kedro.com'") @given("I have added a test jupyter notebook") def add_test_jupyter_nb(context): """Create a test jupyter notebook using TEST_JUPYTER_ORG.""" with open( - str(context.root_project_dir / "notebooks" / "hello_world.ipynb"), "wt" + str(context.root_project_dir / "notebooks" / "hello_world.ipynb"), + "w", + encoding="utf-8", ) as test_nb_fh: test_nb_fh.write(TEST_JUPYTER_ORG) -@given("I have run a non-interactive kedro new with starter") -@when("I run a non-interactive kedro new with starter") -def create_project_with_starter(context): +@given('I have run a non-interactive kedro new with starter "{starter}"') +@when('I run a non-interactive kedro new with starter "{starter}"') +def create_project_with_starter(context, starter): """Behave step to run kedro new given the config I previously created.""" - starter_dir = Path(__file__).parent / "test_starter" + + if starter == "default": + starter = Path(__file__).parent / "test_starter" + + args = [ + context.kedro, + "new", + "-c", + str(context.config_file), + "--starter", + str(starter), + ] + res = run( - [ - context.kedro, - "new", - "-c", - str(context.config_file), - "--starter", - str(starter_dir), - ], + args, env=context.env, cwd=context.temp_dir, ) @@ -324,13 +275,6 @@ def create_project_without_starter(context): telemetry_file.write_text("consent: false", encoding="utf-8") -@given("I have deleted the credentials file") -def delete_credentials_file(context): - """Delete configuration file from project""" - path_to_config_file = context.root_project_dir / "conf" / "base" / "credentials.yml" - path_to_config_file.unlink() - - @given("I have added the project directory to staging") @when("I add the project directory to staging") def add_proj_dir_to_staging(context): @@ -347,6 +291,7 @@ def commit_changes_to_git(context): check_run(f"git commit -m 'Change {time()}'") +@given('I have executed the kedro command "{command}"') @when('I execute the kedro command "{command}"') def exec_kedro_target(context, command): """Execute Kedro target.""" @@ -367,21 +312,6 @@ def exec_project(context): context.result = run(cmd, env=context.env, cwd=str(context.root_project_dir)) -@when('with tags {tags:CSV}, I execute the kedro command "{cmd}"') -def exec_kedro_run_with_tag(context, cmd, tags): - """Execute `kedro run` with tags""" - kedro_args = shlex.split(cmd) - context.logfile_count = util.get_logline_count( - util.get_logfile_path(context.root_project_dir) - ) - - tag_list = [["--tag", t] for t in tags] - tag_args = list(itertools.chain.from_iterable(tag_list)) - run_cmd = [context.kedro] + kedro_args + tag_args - - context.result = run(run_cmd, env=context.env, cwd=str(context.root_project_dir)) - - @when("I ask the CLI for a version") def get_kedro_version(context): """Behave step to run `kedro -V`.""" @@ -407,14 +337,26 @@ def exec_notebook(context, command): # Jupyter notebook forks a child process from a parent process, and # only kills the parent process when it is terminated context.result = ChildTerminatingPopen( - cmd, env=context.env, cwd=str(context.root_project_dir) + cmd, env=context.env, cwd=str(context.root_project_dir), universal_newlines=True ) -@when("Wait until the process is finished") -def wait(context): +@then('I wait for the jupyter webserver to run for up to "{timeout:d}" seconds') +def wait_for_notebook_to_run(context, timeout): + timeout_start = 
time() + while time() < timeout_start + timeout: + stdout = context.result.stdout.readline() + if "http://127.0.0.1:" in stdout: + break + + if time() >= timeout_start + timeout: + raise TimeoutError("Failed to run Jupyter server in time") + + +@when('Wait until the process is finished for up to "{timeout:d}" seconds') +def wait(context, timeout): """Wait for child process to terminate.""" - context.result.wait() + context.result.wait(timeout) @when("I execute the test jupyter notebook and save changes") @@ -423,7 +365,9 @@ def simulate_nb_execution(context): simulate that it was executed and output was saved. """ with open( - str(context.root_project_dir / "notebooks" / "hello_world.ipynb"), "wt" + str(context.root_project_dir / "notebooks" / "hello_world.ipynb"), + "w", + encoding="utf-8", ) as test_nb_fh: test_nb_fh.write(TEST_JUPYTER_AFTER_EXEC) @@ -464,18 +408,16 @@ def update_pyproject_toml(context: behave.runner.Context, new_source_dir): @given("I have updated kedro requirements") def update_kedro_req(context: behave.runner.Context): - """Replace kedro as a standalone requirement with a line - that includes all of kedro's dependencies (-r kedro/requirements.txt) - """ + """Remove kedro as a standalone requirement.""" reqs_path = context.root_project_dir / "src" / "requirements.txt" - kedro_reqs = f"-r {context.requirements_path.as_posix()}" if reqs_path.is_file(): old_reqs = reqs_path.read_text().splitlines() new_reqs = [] for req in old_reqs: - if req.startswith("kedro"): - new_reqs.append(kedro_reqs) + if req.startswith("kedro") and Requirement(req).name.lower() == "kedro": + # Do not include kedro as it's preinstalled in the environment + pass else: new_reqs.append(req) new_reqs = "\n".join(new_reqs) @@ -485,7 +427,7 @@ def update_kedro_req(context: behave.runner.Context): @when("I add {dependency} to the requirements") def add_req(context: behave.runner.Context, dependency: str): - reqs_path = context.root_project_dir / "src" / "requirements.in" + reqs_path = context.root_project_dir / "src" / "requirements.txt" if reqs_path.is_file(): reqs_path.write_text(reqs_path.read_text() + "\n" + str(dependency) + "\n") @@ -509,41 +451,13 @@ def is_created(name): assert is_created(path) -@then("the pipeline should contain no nodes") -def check_empty_pipeline_exists(context): - """Check if the created pipeline in - `pipeline_registry.py` contains no nodes. - """ - pipeline_file = ( - context.root_project_dir - / "src" - / context.project_name.replace("-", "_") - / "pipeline_registry.py" - ) - assert '"__default__": Pipeline([])' in pipeline_file.read_text("utf-8") - - -@then("the pipeline should contain nodes") -def check_pipeline_not_empty(context): - """Check if the created pipeline in - `pipeline_registry.py` contains nodes. 
- """ - pipeline_file = ( - context.root_project_dir - / "src" - / context.project_name.replace("-", "_") - / "pipeline_registry.py" - ) - assert "pipeline = Pipeline([])" not in pipeline_file.read_text("utf-8") - - -@then("the console log should show that {number} nodes were run") +@then("the logs should show that {number} nodes were run") def check_one_node_run(context, number): expected_log_line = f"Completed {number} out of {number} tasks" assert expected_log_line in context.result.stdout -@then('the console log should show that "{node}" was run') +@then('the logs should show that "{node}" was run') def check_correct_nodes_run(context, node): expected_log_line = f"Running node: {node}" stdout = context.result.stdout @@ -578,16 +492,6 @@ def check_failed_status_code(context): assert False, error_msg -@then("the relevant packages should be created") -def check_python_packages_created(context): - """Check that egg and whl files exist in dist dir.""" - dist_dir = context.root_project_dir / "dist" - egg_file = dist_dir.glob("*.egg") - whl_file = dist_dir.glob("*.whl") - assert any(egg_file) - assert any(whl_file) - - @then('I should get a message including "{msg}"') def check_message_printed(context, msg): """Check that specified message is printed to stdout (can be a segment).""" @@ -642,10 +546,11 @@ def check_additional_cell_added(context): coded by TEST_JUPYTER_ORG. """ with open( - str(context.root_project_dir / "notebooks" / "hello_world.ipynb") + str(context.root_project_dir / "notebooks" / "hello_world.ipynb"), + encoding="utf-8", ) as test_nb_fh: context.nb_data = json.load(test_nb_fh) - assert len(context.nb_data["cells"]) == 2 + assert len(context.nb_data["cells"]) == 2 # noqa: PLR2004 @then("the output should be empty in all the cells in the jupyter notebook") @@ -655,7 +560,7 @@ def check_output_cells_empty(context): assert cell["outputs"] == [] -@then("jupyter notebook should run on port {port}") +@then("jupyter notebook should run on port {port:d}") def check_jupyter_nb_proc_on_port(context: behave.runner.Context, port: int): """Check that jupyter notebook service is running on specified port. @@ -664,21 +569,14 @@ def check_jupyter_nb_proc_on_port(context: behave.runner.Context, port: int): port: Port to check """ - url = "http://localhost:%d" % int(port) + url = f"http://localhost:{port}" try: - util.wait_for( - func=_check_service_up, - context=context, - url=url, - string="Jupyter Notebook", - timeout_=15, - print_error=True, - ) + _check_service_up(context, url, "Jupyter Notebook") finally: context.result.terminate() -@then("Jupyter Lab should run on port {port}") +@then("Jupyter Lab should run on port {port:d}") def check_jupyter_lab_proc_on_port(context: behave.runner.Context, port: int): """Check that jupyter lab service is running on specified port. @@ -687,16 +585,9 @@ def check_jupyter_lab_proc_on_port(context: behave.runner.Context, port: int): port: Port to check """ - url = "http://localhost:%d" % int(port) + url = f"http://localhost:{port}" try: - util.wait_for( - func=_check_service_up, - timeout_=20, - context=context, - url=url, - string=' subprocess.CompletedProcess: """Run a shell command. @@ -48,7 +22,7 @@ def run( print_output: If True will print previously captured stdout. Default is False. - kwargs: Extra options to pass to subprocess. + **kwargs: Extra options to pass to subprocess. 
Example: :: @@ -63,9 +37,7 @@ def run( if isinstance(cmd, str) and split: cmd = shlex.split(cmd) # pylint: disable=subprocess-run-check - result = subprocess.run( - cmd, input="", stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs - ) + result = subprocess.run(cmd, input="", capture_output=True, **kwargs) result.stdout = result.stdout.decode("utf-8") result.stderr = result.stderr.decode("utf-8") if print_output: @@ -73,7 +45,7 @@ def run( return result -def check_run(cmd: Union[list, str], print_output: bool = False) -> None: +def check_run(cmd: list | str, print_output: bool = False) -> None: """ Run cmd using subprocess.check_call (throws error if non-zero value returned) @@ -102,7 +74,7 @@ class ChildTerminatingPopen(subprocess.Popen): dies (so-called orphan processes) """ - def __init__(self, cmd: List[str], **kwargs) -> None: + def __init__(self, cmd: list[str], **kwargs) -> None: """ Initializer pipes stderr and stdout. @@ -112,7 +84,7 @@ def __init__(self, cmd: List[str], **kwargs) -> None: """ super().__init__( # type: ignore - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, **kwargs ) def terminate(self) -> None: diff --git a/features/steps/test_plugin/plugin.py b/features/steps/test_plugin/plugin.py index 542cdab7c6..277fb1f18e 100644 --- a/features/steps/test_plugin/plugin.py +++ b/features/steps/test_plugin/plugin.py @@ -1,49 +1,25 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
"""Dummy plugin with simple hook implementations.""" import logging +from pathlib import Path +from kedro.framework.cli.starters import KedroStarterSpec from kedro.framework.hooks import hook_impl -from kedro.pipeline import Pipeline, node + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) class MyPluginHook: @hook_impl - def after_catalog_created( - self, catalog - ): # pylint: disable=unused-argument,no-self-use - logging.info("Reached after_catalog_created hook") + def after_catalog_created(self, catalog): # noqa: unused-argument, no-self-use + logger.info("Reached after_catalog_created hook") - @hook_impl - def register_pipelines(self): # pylint: disable=no-self-use - return { - "from_plugin": Pipeline([node(lambda: "sth", inputs=None, outputs="x")]) - } +starters = [ + KedroStarterSpec( + "test_plugin_starter", + template_path=str((Path(__file__).parents[1] / "test_starter").resolve()), + ) +] hooks = MyPluginHook() diff --git a/features/steps/test_plugin/setup.py b/features/steps/test_plugin/setup.py index 40971dd266..d76b760a78 100644 --- a/features/steps/test_plugin/setup.py +++ b/features/steps/test_plugin/setup.py @@ -1,36 +1,12 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
from setuptools import find_packages, setup setup( name="test_plugin", version="0.1", - description="Dummy plugin with hook implementations", + description="Dummy plugin with hook implementations and custom starters", packages=find_packages(), - entry_points={"kedro.hooks": ["test_plugin = plugin:hooks"]}, + entry_points={ + "kedro.hooks": ["test_plugin = plugin:hooks"], + "kedro.starters": ["starter = plugin:starters"], + }, ) diff --git a/features/steps/test_starter/cookiecutter.json b/features/steps/test_starter/cookiecutter.json index 71f06bbf59..8ced11761f 100644 --- a/features/steps/test_starter/cookiecutter.json +++ b/features/steps/test_starter/cookiecutter.json @@ -1,6 +1,6 @@ { "project_name": "New Kedro Project", "repo_name": "{{ cookiecutter.project_name.replace(' ', '-').lower().strip('-') }}", - "python_package": "{{ cookiecutter.project_name.replace(' ', '_').lower().strip('-') }}", + "python_package": "{{ cookiecutter.project_name.replace(' ', '_').replace('-', '_').lower() }}", "kedro_version": "{{ cookiecutter.kedro_version }}" } diff --git a/features/steps/test_starter/prompts.yml b/features/steps/test_starter/prompts.yml index 01a7da5ac0..7e4bf62f66 100644 --- a/features/steps/test_starter/prompts.yml +++ b/features/steps/test_starter/prompts.yml @@ -1,28 +1,9 @@ project_name: - title: "Project Name:" + title: "Project Name" text: | Please enter a human readable name for your new project. - Spaces and punctuation are allowed. - -repo_name: - title: "Repository Name:" - text: | - Please enter a directory name for your new project repository. - Alphanumeric characters, hyphens and underscores are allowed. - Lowercase is recommended. - regex_validator: "^\\w+(-*\\w+)*$" - error_message: | - It must contain only word symbols and/or hyphens, must also - start and end with alphanumeric symbol." - -python_package: - title: "Python Package Name:" - text: | - Please enter a valid Python package name for your project package. - Alphanumeric characters and underscores are allowed. - Lowercase is recommended. Package name must start with a letter - or underscore. - regex_validator: "^[a-zA-Z_]\\w{1,}$" + Spaces, hyphens, and underscores are allowed. + regex_validator: "^[\\w -]{2,}$" error_message: | - It must start with a letter or underscore, be at least 2 characters long - and contain only letters, digits, and/or underscores. + It must contain only alphanumeric symbols, spaces, underscores and hyphens and + be at least 2 characters long. 
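As an aside on the prompt change above: the new `regex_validator` for `project_name` is `^[\w -]{2,}$`. Below is a minimal sketch of how names pass or fail that pattern, using the regex copied verbatim from prompts.yml; the helper name `is_valid_project_name` is purely illustrative and is not Kedro's actual validation code.

```python
import re

# Pattern copied verbatim from the prompts.yml shown above:
# two or more characters, each a word character, space, or hyphen.
PROJECT_NAME_PATTERN = re.compile(r"^[\w -]{2,}$")


def is_valid_project_name(name: str) -> bool:
    """Return True if the name satisfies the starter's project_name validator."""
    return bool(PROJECT_NAME_PATTERN.match(name))


assert is_valid_project_name("New Kedro Project")
assert is_valid_project_name("project-dummy")
assert not is_valid_project_name("x")           # shorter than 2 characters
assert not is_valid_project_name("bad!name")    # '!' is outside the allowed set
```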
diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/.coveragerc b/features/steps/test_starter/{{ cookiecutter.repo_name }}/.coveragerc deleted file mode 100644 index 003c131200..0000000000 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/.coveragerc +++ /dev/null @@ -1,6 +0,0 @@ -[report] -fail_under=0 -show_missing=True -exclude_lines = - pragma: no cover - raise NotImplementedError diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/.ipython/profile_default/ipython_config.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/.ipython/profile_default/ipython_config.py deleted file mode 100644 index a76e260386..0000000000 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/.ipython/profile_default/ipython_config.py +++ /dev/null @@ -1 +0,0 @@ -c.InteractiveShellApp.extensions.append("kedro.extras.extensions.ipython") diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/README.md b/features/steps/test_starter/{{ cookiecutter.repo_name }}/README.md index 96f144ca62..8041d41dd9 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/README.md +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/README.md @@ -11,7 +11,7 @@ Take a look at the [Kedro documentation](https://kedro.readthedocs.io) to get st In order to get the best out of the template: * Don't remove any lines from the `.gitignore` file we provide -* Make sure your results can be reproduced by following a [data engineering convention](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention) +* Make sure your results can be reproduced by following a [data engineering convention](https://docs.kedro.org/en/stable/resources/glossary.html#layers-data-engineering-convention) * Don't commit data to your repository * Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/` @@ -22,7 +22,7 @@ Declare any dependencies in `src/requirements.txt` for `pip` installation and `s To install them, run: ``` -kedro install +pip install -r src/requirements.txt ``` ## How to run Kedro @@ -52,11 +52,11 @@ To generate or update the dependency requirements for your project: kedro build-reqs ``` -This will copy the contents of `src/requirements.txt` into a new file `src/requirements.in` which will be used as the source for `pip-compile`. You can see the output of the resolution by opening `src/requirements.txt`. +This will `pip-compile` the contents of `src/requirements.txt` into a new file `src/requirements.lock`. You can see the output of the resolution by opening `src/requirements.lock`. -After this, if you'd like to update your project requirements, please update `src/requirements.in` and re-run `kedro build-reqs`. +After this, if you'd like to update your project requirements, please update `src/requirements.txt` and re-run `kedro build-reqs`. 
-[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/04_kedro_project_setup/01_dependencies.html#project-specific-dependencies) +[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) ## How to work with Kedro and notebooks @@ -118,4 +118,4 @@ To automatically strip out all output cell contents before committing to `git`, ## Package your Kedro project -[Further information about building project documentation and packaging your project](https://kedro.readthedocs.io/en/stable/03_tutorial/05_package_a_project.html) +[Further information about building project documentation and packaging your project](https://kedro.readthedocs.io/en/stable/tutorial/package_a_project.html) diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index de74c169cf..c0c61a3a2c 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -1,7 +1,7 @@ # Here you can define all your data sets by using simple YAML syntax. # # Documentation for this file format can be found in "The Data Catalog" -# Link: https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html +# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html # # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS # @@ -37,7 +37,7 @@ # # The Data Catalog supports being able to reference the same file using two different DataSet implementations # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: -# https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html +# https://kedro.readthedocs.io/en/stable/data/data_catalog.html # # This is a data set used by the "Hello World" example pipeline provided with the project # template. Please feel free to remove it once you remove the example pipeline. 
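The comments in the catalog.yml above describe declaring datasets in plain YAML and point to the Data Catalog documentation. For illustration only, here is a minimal sketch of loading such a configuration programmatically with `DataCatalog.from_config`; the dataset name and filepath are hypothetical and not part of this starter's shipped catalog, while `DataCatalog` and the `pandas.CSVDataSet` type are the Kedro APIs already referenced elsewhere in this diff.

```python
import yaml
from kedro.io import DataCatalog

# Hypothetical entry written in the YAML format described by the catalog comments.
catalog_yaml = """
example_iris_data:
  type: pandas.CSVDataSet
  filepath: data/01_raw/iris.csv
"""

catalog = DataCatalog.from_config(yaml.safe_load(catalog_yaml))
print(catalog.list())  # ['example_iris_data']
# catalog.load("example_iris_data") would read the CSV once the file exists.
```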
diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/logging.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/logging.yml index 3689418056..984cac5069 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/logging.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/logging.yml @@ -1,66 +1,41 @@ version: 1 + disable_existing_loggers: False + formatters: - simple: - format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - json_formatter: - format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - class: pythonjsonlogger.jsonlogger.JsonFormatter + simple: + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" handlers: - console: - class: logging.StreamHandler - level: INFO - formatter: simple - stream: ext://sys.stdout - - info_file_handler: - class: logging.handlers.RotatingFileHandler - level: INFO - formatter: simple - filename: logs/info.log - maxBytes: 10485760 # 10MB - backupCount: 20 - encoding: utf8 - delay: True - - error_file_handler: - class: logging.handlers.RotatingFileHandler - level: ERROR - formatter: simple - filename: logs/errors.log - maxBytes: 10485760 # 10MB - backupCount: 20 - encoding: utf8 - delay: True + console: + class: logging.StreamHandler + level: INFO + formatter: simple + stream: ext://sys.stdout - journal_file_handler: - class: kedro.versioning.journal.JournalFileHandler - level: INFO - base_dir: logs/journals - formatter: json_formatter + info_file_handler: + class: logging.handlers.RotatingFileHandler + level: INFO + formatter: simple + filename: info.log + maxBytes: 10485760 # 10MB + backupCount: 20 + encoding: utf8 + delay: True + + rich: + class: kedro.logging.RichHandler + rich_tracebacks: True + # Advance options for customisation. + # See https://docs.kedro.org/en/stable/logging/logging.html#project-side-logging-configuration + # tracebacks_show_locals: False loggers: - anyconfig: - level: WARNING - handlers: [console, info_file_handler, error_file_handler] - propagate: no - - kedro.io: - level: INFO - handlers: [console, info_file_handler, error_file_handler] - propagate: no - - kedro.pipeline: - level: INFO - handlers: [console, info_file_handler, error_file_handler] - propagate: no + kedro: + level: INFO - kedro.journal: - level: INFO - handlers: [journal_file_handler] - propagate: no + {{ cookiecutter.python_package }}: + level: INFO root: - level: INFO - handlers: [console, info_file_handler, error_file_handler] + handlers: [rich] diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/docs/source/conf.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/docs/source/conf.py index d6e605c0e0..89d30d8a16 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/docs/source/conf.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/docs/source/conf.py @@ -1,33 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. # {{ cookiecutter.python_package }} documentation build # configuration file, created by sphinx-quickstart. @@ -46,8 +17,6 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # import re - -from recommonmark.transform import AutoStructify from {{cookiecutter.python_package}} import __version__ as release from kedro.framework.cli.utils import find_stylesheets @@ -55,8 +24,7 @@ # -- Project information ----------------------------------------------------- project = "{{ cookiecutter.python_package }}" -copyright = "2021, QuantumBlack Visual Analytics Limited" -author = "QuantumBlack" +author = "Kedro" # The short X.Y version. version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) @@ -82,8 +50,8 @@ "sphinx.ext.viewcode", "sphinx.ext.mathjax", "nbsphinx", - "recommonmark", "sphinx_copybutton", + "myst_parser", ] # enable autosummary plugin (table of contents for modules/classes/class @@ -179,7 +147,7 @@ master_doc, "{{ cookiecutter.python_package }}.tex", "{{ cookiecutter.python_package }} Documentation", - "QuantumBlack", + "Kedro", "manual", ) ] @@ -251,7 +219,4 @@ def setup(app): app.connect("autodoc-skip-member", skip) # add Kedro stylesheets for stylesheet in find_stylesheets(): - app.add_stylesheet(stylesheet) - # enable rendering RST tables in Markdown - app.add_config_value("recommonmark_config", {"enable_eval_rst": True}, True) - app.add_transform(AutoStructify) + app.add_css_file(stylesheet) diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/logs/journals/.gitkeep b/features/steps/test_starter/{{ cookiecutter.repo_name }}/logs/journals/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/pyproject.toml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/pyproject.toml index ab5d8cdbae..ca5524efc1 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/pyproject.toml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/pyproject.toml @@ -4,14 +4,14 @@ project_version = "{{ cookiecutter.kedro_version }}" package_name = "{{ cookiecutter.python_package }}" [tool.isort] -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -use_parentheses = true -line_length = 88 -known_third_party = "kedro" +profile = "black" [tool.pytest.ini_options] addopts = """ --cov-report term-missing \ --cov src/{{ cookiecutter.python_package }} -ra""" + +[tool.coverage.report] +fail_under = 0 +show_missing = true +exclude_lines = ["pragma: no cover", "raise NotImplementedError"] diff --git 
a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt index 445a0645f3..7e6f29ac16 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt @@ -1,14 +1,14 @@ -black==21.5b1 -flake8>=3.7.9, <4.0 -ipython~=7.10 +black~=22.0 +flake8>=3.7.9, <5.0 +ipython>=7.31.1, <8.0; python_version < '3.8' +ipython~=8.10; python_version >= '3.8' isort~=5.0 jupyter~=1.0 -jupyter_client>=5.1, <7.0 -jupyterlab~=3.0 +jupyterlab_server>=2.11.1, <2.16.0 +jupyterlab~=3.0, <3.6.0 kedro[pandas.CSVDataSet]=={{ cookiecutter.kedro_version }} -kedro-telemetry~=0.1.0; python_version < '3.9' +kedro-telemetry~=0.2.0 nbstripout~=0.4 -pytest-cov~=2.5 +pytest-cov~=3.0 pytest-mock>=1.7.1, <2.0 -pytest~=6.2 -wheel>=0.35, <0.37 +pytest~=7.2 diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/setup.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/setup.py index 940262245b..af5b101519 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/setup.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/setup.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- from setuptools import find_packages, setup entry_point = ( @@ -34,14 +6,14 @@ # get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: +with open("requirements.txt", encoding="utf-8") as f: # Make sure we strip all comments and options (e.g "--extra-index-url") # that arise from a modified pip.conf file that configure global options # when running kedro build-reqs requires = [] for line in f: req = line.split("#", 1)[0].strip() - if req and not req.startswith("--"): + if req and not req.startswith("-r"): requires.append(req) setup( @@ -52,14 +24,16 @@ install_requires=requires, extras_require={ "docs": [ + "docutils<0.18.0", "sphinx~=3.4.3", "sphinx_rtd_theme==0.5.1", "nbsphinx==0.8.1", "nbstripout~=0.4", - "recommonmark==0.7.1", "sphinx-autodoc-typehints==1.11.1", "sphinx_copybutton==0.3.1", "ipykernel>=5.3, <7.0", + "Jinja2<3.1.0", + "myst-parser~=0.17.2", ] }, ) diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/tests/test_run.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/tests/test_run.py index 054403d876..ee11dea542 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/tests/test_run.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/tests/test_run.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """ This module contains an example test. 
@@ -39,17 +11,30 @@ from pathlib import Path import pytest +from kedro.config import ConfigLoader from kedro.framework.context import KedroContext +from kedro.framework.hooks import _create_hook_manager +from kedro.framework.project import settings + + +@pytest.fixture +def config_loader(): + return ConfigLoader(conf_source=str(Path.cwd() / settings.CONF_SOURCE)) @pytest.fixture -def project_context(): - return KedroContext(package_name="{{ cookiecutter.python_package }}", project_path=Path.cwd()) +def project_context(config_loader): + return KedroContext( + package_name="{{ cookiecutter.python_package }}", + project_path=Path.cwd(), + config_loader=config_loader, + hook_manager=_create_hook_manager(), + ) # The tests below are here for the demonstration purpose # and should be replaced with the ones testing the project # functionality class TestProjectContext: - def test_package_name(self, project_context): - assert project_context.package_name == "{{ cookiecutter.python_package }}" + def test_project_path(self, project_context): + assert project_context.project_path == Path.cwd() diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py index 6a65ee3d29..177bba98c1 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
"""{{ cookiecutter.project_name }} """ diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py index c018e6878e..9e6750922a 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """{{ cookiecutter.project_name }} file for ensuring the package is executable as `{{ cookiecutter.repo_name }}` and `python -m {{ cookiecutter.python_package }}` """ @@ -47,7 +20,7 @@ def _find_run_command(package_name): if run: # use run command from installed plugin if it exists return run - # use run command from the framework project + # use run command from `kedro.framework.cli.project` from kedro.framework.cli.project import run return run @@ -63,11 +36,11 @@ def _find_run_command_in_plugins(plugins): return group.commands["run"] -def main(): +def main(*args, **kwargs): package_name = Path(__file__).parent.name configure_project(package_name) run = _find_run_command(package_name) - run() + run(*args, **kwargs) if __name__ == "__main__": diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py deleted file mode 100644 index 135fc6b04b..0000000000 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Project hooks.""" -from typing import Any, Dict, Optional - -from kedro.framework.hooks import hook_impl -from kedro.io import DataCatalog -from kedro.versioning import Journal - - -class ProjectHooks: - @hook_impl - def register_catalog( - self, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: - return DataCatalog.from_config( - catalog, credentials, load_versions, save_version, journal - ) diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py index 3262009080..4f2f1eeb89 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py @@ -1,55 +1,19 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Project pipelines.""" -from typing import Dict +from __future__ import annotations +from kedro.framework.project import find_pipelines from kedro.pipeline import Pipeline, pipeline -from {{cookiecutter.python_package}}.pipelines import data_engineering as de -from {{cookiecutter.python_package}}.pipelines import data_science as ds - -def register_pipelines() -> Dict[str, Pipeline]: +def register_pipelines() -> dict[str, Pipeline]: """Register the project's pipelines. Returns: - A mapping from a pipeline name to a ``Pipeline`` object. + A mapping from pipeline names to ``Pipeline`` objects. """ - data_engineering_pipeline = de.create_pipeline() - data_processing_pipeline = pipeline( - de.create_pipeline(), namespace="data_processing" + pipelines = find_pipelines() + pipelines["__default__"] = sum(pipelines.values()) + pipelines["data_processing"] = pipeline( + pipelines["data_engineering"], namespace="data_processing" ) - data_science_pipeline = ds.create_pipeline() - - return { - "de": data_engineering_pipeline, - "ds": data_science_pipeline, - "dp": data_processing_pipeline, - "__default__": data_engineering_pipeline + data_science_pipeline, - } + return pipelines diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/__init__.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/__init__.py index 084f7df03e..a3b6f56ce5 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/__init__.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/__init__.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
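Illustrative note, not part of the diff: the new register_pipelines() relies on find_pipelines() to discover every create_pipeline() under the project's pipelines package, and on the fact that Pipeline objects compose with + (and therefore with sum()), which is how the "__default__" pipeline above is built. A small self-contained sketch of that composition:

from kedro.pipeline import node, pipeline

def identity(x):
    return x

p1 = pipeline([node(identity, "a", "b", name="n1")])
p2 = pipeline([node(identity, "b", "c", name="n2")])

combined = p1 + p2  # same mechanism as sum(pipelines.values())
assert len(combined.nodes) == 2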
-# -# See the License for the specific language governing permissions and -# limitations under the License. """Example code for the nodes in the example pipeline. This code is meant just for illustrating basic Kedro features. diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py index eb286f1198..3f9c8e1337 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py @@ -1,42 +1,16 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """Example code for the nodes in the example pipeline. This code is meant just for illustrating basic Kedro features. PLEASE DELETE THIS FILE ONCE YOU START WORKING ON YOUR OWN PROJECT! """ +from __future__ import annotations -from typing import Any, Dict +from typing import Any import pandas as pd -def split_data(data: pd.DataFrame, example_test_data_ratio: float) -> Dict[str, Any]: +def split_data(data: pd.DataFrame, example_test_data_ratio: float) -> dict[str, Any]: """Node for splitting the classical Iris data set into training and test sets, each split into features and labels. The split ratio parameter is taken from conf/project/parameters.yml. 
diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/pipeline.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/pipeline.py index 1050e0283f..dee78699fd 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/pipeline.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/pipeline.py @@ -1,44 +1,16 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Example code for the nodes in the example pipeline. This code is meant just for illustrating basic Kedro features. Delete this when you start working on your own Kedro project. """ -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node, pipeline from .nodes import split_data def create_pipeline(**kwargs): - return Pipeline( + return pipeline( [ node( split_data, diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/__init__.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/__init__.py index 084f7df03e..a3b6f56ce5 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/__init__.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/__init__.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """Example code for the nodes in the example pipeline. This code is meant just for illustrating basic Kedro features. diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/nodes.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/nodes.py index d163c316b3..9217c2d3b1 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/nodes.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/nodes.py @@ -1,47 +1,20 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Example code for the nodes in the example pipeline. This code is meant just for illustrating basic Kedro features. 
Delete this when you start working on your own Kedro project. """ -# pylint: disable=invalid-name +# noqa: invalid-name +from __future__ import annotations import logging -from typing import Any, Dict +from typing import Any import numpy as np import pandas as pd def train_model( - train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: Dict[str, Any] + train_x: pd.DataFrame, train_y: pd.DataFrame, parameters: dict[str, Any] ) -> np.ndarray: """Node for training a simple multi-class logistic regression model. The number of training iterations as well as the learning rate are taken from diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/pipeline.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/pipeline.py index 9fa4617ab2..a2e44d9424 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/pipeline.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/pipeline.py @@ -1,44 +1,16 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Example code for the nodes in the example pipeline. This code is meant just for illustrating basic Kedro features. Delete this when you start working on your own Kedro project. 
""" -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node, pipeline from .nodes import predict, report_accuracy, train_model def create_pipeline(**kwargs): - return Pipeline( + return pipeline( [ node( train_model, diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py index ac535e301a..86a92b1c80 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py @@ -1,51 +1,41 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. +"""Project settings. There is no need to edit this file unless you want to change values +from the Kedro defaults. For further information, including these default values, see +https://kedro.readthedocs.io/en/stable/kedro_project_setup/settings.html.""" -"""Project settings.""" -from {{cookiecutter.python_package}}.hooks import ProjectHooks +# Instantiated project hooks. +# For example, after creating a hooks.py and defining a ProjectHooks class there, do +# from {{cookiecutter.python_package}}.hooks import ProjectHooks +# HOOKS = (ProjectHooks(),) -# Instantiate and list your project hooks here -HOOKS = (ProjectHooks(),) - -# List the installed plugins for which to disable auto-registry +# Installed plugins for which to disable hook auto-registration. # DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) -# Define where to store data from a KedroSession. Defaults to BaseSessionStore. -# from kedro.framework.session.store import ShelveStore -# SESSION_STORE_CLASS = ShelveStore - -# Define keyword arguments to be passed to `SESSION_STORE_CLASS` constructor +# Class that manages storing KedroSession data. +# from kedro.framework.session.store import BaseSessionStore +# SESSION_STORE_CLASS = BaseSessionStore +# Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor. 
# SESSION_STORE_ARGS = { # "path": "./sessions" # } -# Define custom context class. Defaults to `KedroContext` +# Directory that holds configuration. +# CONF_SOURCE = "conf" + +# Class that manages how configuration is loaded. +# from kedro.config import OmegaConfigLoader +# CONFIG_LOADER_CLASS = OmegaConfigLoader +# Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor. +# CONFIG_LOADER_ARGS = { +# "config_patterns": { +# "spark" : ["spark*/"], +# "parameters": ["parameters*", "parameters*/**", "**/parameters*"], +# } +# } + +# Class that manages Kedro's library components. +# from kedro.framework.context import KedroContext # CONTEXT_CLASS = KedroContext -# Define the configuration folder. Defaults to `conf` -# CONF_SOURCE = "conf" +# Class that manages the Data Catalog. +# from kedro.io import DataCatalog +# DATA_CATALOG_CLASS = DataCatalog diff --git a/features/steps/util.py b/features/steps/util.py index 53c1c3b48e..f65a4adfa3 100644 --- a/features/steps/util.py +++ b/features/steps/util.py @@ -1,67 +1,13 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Common functions for e2e testing. 
""" +from __future__ import annotations import os import re -import tempfile from contextlib import contextmanager from pathlib import Path from time import sleep, time -from typing import Any, Callable, Iterator, List - -import pandas as pd - - -def get_sample_csv_content(): - return """col1, col2, col3 - 1, 2, 3 - 4, 5, 6 - """ - - -def get_sample_data_frame(): - data = {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]} - return pd.DataFrame(data) - - -def create_temp_csv(): - _, csv_file_path = tempfile.mkstemp(suffix=".csv") - return csv_file_path - - -def create_sample_csv(): - csv_file_path = create_temp_csv() - with open(csv_file_path, mode="w") as output_file: - output_file.write(get_sample_csv_content()) - return csv_file_path +from typing import Any, Callable, Iterator @contextmanager @@ -123,60 +69,11 @@ def wait_for( sleep(sleep_for) raise WaitForException( - "func: %s, didn't return within specified timeout: %d" % (func, timeout_) + f"func: {func}, didn't return within specified timeout: {timeout_}" ) -def get_logline_count(logfile: str) -> int: - """Get line count in logfile - - Note: If logfile doesn't exist will return 0 - - Args: - logfile: path to logfile - - Returns: - line count of logfile - """ - try: - with open(logfile) as file_handle: - return sum(1 for i in file_handle) - except FileNotFoundError: - return 0 - - -def get_last_logline(logfile: str) -> str: - """Get last line of logfile - - Args: - logfile: path to logfile - - Returns: - last line of logfile - """ - line = "" - with open(logfile) as file_handle: - for line in file_handle: - pass - - return line - - -def get_logfile_path(proj_dir: Path) -> str: - """ - Helper function to fet full path of `pipeline.log` inside project - - Args: - proj_dir: path to proj_dir - - Returns: - path to `pipeline.log` - """ - log_file = (proj_dir / "logs" / "visualization" / "pipeline.log").absolute() - return str(log_file) - - -def parse_csv(text: str) -> List[str]: +def parse_csv(text: str) -> list[str]: """Parse comma separated **double quoted** strings in behave steps Args: diff --git a/features/test.feature b/features/test.feature index c70121e600..0d42f336e6 100644 --- a/features/test.feature +++ b/features/test.feature @@ -1,37 +1,8 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - - Feature: Test target in new project Background: Given I have prepared a config file - And I have run a non-interactive kedro new with starter + And I have run a non-interactive kedro new with starter "default" Scenario: Execute successful test in new project When I execute the kedro command "test" diff --git a/features/version.feature b/features/version.feature index 520778a15c..42017daad5 100644 --- a/features/version.feature +++ b/features/version.feature @@ -1,32 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - - Feature: Kedro version Scenario: Check kedro version diff --git a/features/windows_reqs.txt b/features/windows_reqs.txt index 32df180278..9d9c461b56 100644 --- a/features/windows_reqs.txt +++ b/features/windows_reqs.txt @@ -1,9 +1,10 @@ -# same versions as `test_requirements` +# same versions as [test] optional requirements # e2e tests on Windows are slow but we don't need to install # everything, so just this subset will be enough for CI behave==1.2.6 -pandas~=1.2 -psutil==5.8.0 +pandas~=1.3 +psutil~=5.8 requests~=2.20 toml~=0.10.1 -PyYAML>=4.2, <6.0 +PyYAML>=4.2, <7.0 +packaging>=20.0 diff --git a/kedro/__init__.py b/kedro/__init__.py index be0bc5d7f6..de25eb79a4 100644 --- a/kedro/__init__.py +++ b/kedro/__init__.py @@ -1,39 +1,28 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Kedro is a framework that makes it easy to build robust and scalable data pipelines by providing uniform project templates, data abstraction, configuration and pipeline assembly. """ -__version__ = "0.17.4" +import sys +import warnings + +__version__ = "0.18.11" + + +class KedroPythonVersionWarning(UserWarning): + """Custom class for warnings about incompatibilities with Python versions.""" + + pass -import logging +if not sys.warnoptions: + warnings.simplefilter("error", KedroPythonVersionWarning) -logging.getLogger(__name__).addHandler(logging.NullHandler()) +if sys.version_info >= (3, 11): + warnings.warn( + """Kedro is not yet fully compatible with this Python version. +To proceed at your own risk and ignore this warning, +run Kedro with `python -W "default:Kedro is not yet fully compatible" -m kedro ...` +or set the PYTHONWARNINGS environment variable accordingly.""", + KedroPythonVersionWarning, + ) diff --git a/kedro/__main__.py b/kedro/__main__.py index f696ab4a24..09ff52f885 100644 --- a/kedro/__main__.py +++ b/kedro/__main__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
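Illustrative note, not part of the diff: the module above turns the new warning into an error unless sys.warnoptions is populated, i.e. unless the interpreter was started with python -W ... or with PYTHONWARNINGS set, as the warning message suggests. A hedged sketch of the environment-variable route from another Python process; it assumes the kedro CLI is on PATH and is executed inside a project:

import os
import subprocess

# PYTHONWARNINGS is read at interpreter start-up of the child process, so the
# child's sys.warnoptions is non-empty and the warning is shown instead of raised.
env = {**os.environ, "PYTHONWARNINGS": "default:Kedro is not yet fully compatible"}
subprocess.run(["kedro", "run"], env=env, check=True)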
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Entry point when invoked with python -m kedro.""" # pragma: no cover if __name__ == "__main__": # pragma: no cover diff --git a/kedro/config/__init__.py b/kedro/config/__init__.py index c8493961e5..1b17cec0ac 100644 --- a/kedro/config/__init__.py +++ b/kedro/config/__init__.py @@ -1,38 +1,21 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.config`` provides functionality for loading Kedro configuration from different file formats. """ +from .abstract_config import ( + AbstractConfigLoader, + BadConfigException, + MissingConfigException, +) +from .config import ConfigLoader +from .omegaconf_config import OmegaConfigLoader +from .templated_config import TemplatedConfigLoader -from .abstract_config import AbstractConfigLoader # NOQA -from .abstract_config import BadConfigException # NOQA -from .abstract_config import MissingConfigException # NOQA -from .config import ConfigLoader # NOQA -from .templated_config import TemplatedConfigLoader # NOQA +__all__ = [ + "AbstractConfigLoader", + "BadConfigException", + "ConfigLoader", + "MissingConfigException", + "TemplatedConfigLoader", + "OmegaConfigLoader", +] diff --git a/kedro/config/abstract_config.py b/kedro/config/abstract_config.py index 5eecd68a29..776ec6c836 100644 --- a/kedro/config/abstract_config.py +++ b/kedro/config/abstract_config.py @@ -1,40 +1,14 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """This module provides ``kedro.abstract_config`` with the baseline class model for a `ConfigLoader` implementation. """ -from abc import ABC, abstractmethod -from typing import Any, Dict +from __future__ import annotations +from collections import UserDict +from typing import Any -class AbstractConfigLoader(ABC): - """ - ``AbstractConfigLoader`` is the abstract base class + +class AbstractConfigLoader(UserDict): + """``AbstractConfigLoader`` is the abstract base class for all `ConfigLoader` implementations. All user-defined `ConfigLoader` implementations should inherit from `AbstractConfigLoader` and implement all relevant abstract methods. @@ -44,17 +18,13 @@ def __init__( self, conf_source: str, env: str = None, - runtime_params: Dict[str, Any] = None, - **kwargs # pylint: disable=unused-argument + runtime_params: dict[str, Any] = None, + **kwargs, ): + super().__init__() self.conf_source = conf_source self.env = env - self.runtime_params = runtime_params - - @abstractmethod # pragma: no cover - def get(self) -> Dict[str, Any]: - """Required method to get all configurations.""" - pass + self.runtime_params = runtime_params or {} class BadConfigException(Exception): diff --git a/kedro/config/common.py b/kedro/config/common.py index 3f8d4c0f4a..0db6b637f8 100644 --- a/kedro/config/common.py +++ b/kedro/config/common.py @@ -1,42 +1,17 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
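Illustrative note, not part of the diff: with AbstractConfigLoader now a UserDict subclass and the abstract get() removed, a user-defined loader only needs to subclass it and decide how keys resolve. A hedged, minimal sketch; the JSON-per-key layout (e.g. conf/catalog.json) is an assumption, not a Kedro convention:

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

from kedro.config import AbstractConfigLoader


class JsonConfigLoader(AbstractConfigLoader):
    """Resolves each key to ``<conf_source>/<key>.json``."""

    def __getitem__(self, key: str) -> dict[str, Any]:
        path = Path(self.conf_source) / f"{key}.json"
        return json.loads(path.read_text(encoding="utf-8"))


# Usage (assuming conf/catalog.json exists):
# catalog_conf = JsonConfigLoader(conf_source="conf")["catalog"]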
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """This module contains methods and facade interfaces for various ConfigLoader implementations. """ +from __future__ import annotations import logging from glob import iglob from pathlib import Path -from typing import AbstractSet, Any, Dict, Iterable, List, Set +from typing import AbstractSet, Any, Iterable from warnings import warn -from kedro.config import BadConfigException, MissingConfigException +from yaml.parser import ParserError + +from kedro.config.abstract_config import BadConfigException, MissingConfigException SUPPORTED_EXTENSIONS = [ ".yml", @@ -54,7 +29,8 @@ def _get_config_from_patterns( conf_paths: Iterable[str], patterns: Iterable[str] = None, ac_template: bool = False, -) -> Dict[str, Any]: + ac_context: dict[str, Any] = None, +) -> dict[str, Any]: """Recursively scan for configuration files, load and merge them, and return them in the form of a config dictionary. @@ -65,6 +41,8 @@ def _get_config_from_patterns( ac_template: Boolean flag to indicate whether to use the `ac_template` argument of the ``anyconfig.load`` method. Used in the context of `_load_config_file` function. + ac_context: anyconfig context to pass to ``anyconfig.load`` method. + Used in the context of `_load_config_file` function. Raises: ValueError: If 2 or more configuration files inside the same @@ -83,12 +61,12 @@ def _get_config_from_patterns( if not patterns: raise ValueError( - "`patterns` must contain at least one glob " + "'patterns' must contain at least one glob " "pattern to match config filenames against." 
) - config = {} # type: Dict[str, Any] - processed_files = set() # type: Set[Path] + config: dict[str, Any] = {} + processed_files: set[Path] = set() for conf_path in conf_paths: if not Path(conf_path).is_dir(): @@ -101,17 +79,19 @@ def _get_config_from_patterns( Path(conf_path), patterns, processed_files, _config_logger ) new_conf = _load_configs( - config_filepaths=config_filepaths, ac_template=ac_template + config_filepaths=config_filepaths, + ac_template=ac_template, + ac_context=ac_context, ) common_keys = config.keys() & new_conf.keys() if common_keys: sorted_keys = ", ".join(sorted(common_keys)) msg = ( - "Config from path `%s` will override the following " - "existing top-level config keys: %s" + "Config from path [magenta]%s[/magenta] will override the " + "following existing top-level config keys: '%s'" ) - _config_logger.info(msg, conf_path, sorted_keys) + _config_logger.info(msg, conf_path, sorted_keys, extra={"markup": True}) config.update(new_conf) processed_files |= set(config_filepaths) @@ -124,37 +104,58 @@ def _get_config_from_patterns( return config -def _load_config_file(config_file: Path, ac_template: bool = False) -> Dict[str, Any]: +def _load_config_file( + config_file: Path, ac_template: bool = False, ac_context: dict[str, Any] = None +) -> dict[str, Any]: """Load an individual config file using `anyconfig` as a backend. Args: config_file: Path to a config file to process. ac_template: Boolean flag to indicate whether to use the `ac_template` argument of the ``anyconfig.load`` method. + ac_context: anyconfig context to pass to ``anyconfig.load`` method. Raises: BadConfigException: If configuration is poorly formatted and cannot be loaded. + ParserError: If file is invalid and cannot be parsed. Returns: Parsed configuration. """ # for performance reasons - import anyconfig # pylint: disable=import-outside-toplevel + import anyconfig # noqa: import-outside-toplevel try: # Default to UTF-8, which is Python 3 default encoding, to decode the file with open(config_file, encoding="utf8") as yml: + _config_logger.debug( + "Loading config file: [bright magenta]%s[/bright magenta]", + config_file, + extra={"markup": True}, + ) return { k: v - for k, v in anyconfig.load(yml, ac_template=ac_template).items() + for k, v in anyconfig.load( + yml, ac_template=ac_template, ac_context=ac_context + ).items() if not k.startswith("_") } except AttributeError as exc: raise BadConfigException(f"Couldn't load config file: {config_file}") from exc + except ParserError as exc: + assert exc.problem_mark is not None + line = exc.problem_mark.line + cursor = exc.problem_mark.column + raise ParserError( + f"Invalid YAML file {config_file}, unable to read line {line}, position {cursor}." + ) from exc -def _load_configs(config_filepaths: List[Path], ac_template: bool) -> Dict[str, Any]: + +def _load_configs( + config_filepaths: list[Path], ac_template: bool, ac_context: dict[str, Any] = None +) -> dict[str, Any]: """Recursively load all configuration files, which satisfy a given list of glob patterns from a specific path. @@ -163,6 +164,8 @@ def _load_configs(config_filepaths: List[Path], ac_template: bool) -> Dict[str, ac_template: Boolean flag to indicate whether to use the `ac_template` argument of the ``anyconfig.load`` method. Used in the context of `_load_config_file` function. + ac_context: anyconfig context to pass to ``anyconfig.load`` method. + Used in the context of `_load_config_file` function. Raises: ValueError: If 2 or more configuration files contain the same key(s). 
@@ -175,10 +178,12 @@ def _load_configs(config_filepaths: List[Path], ac_template: bool) -> Dict[str, """ aggregate_config = {} - seen_file_to_keys = {} # type: Dict[Path, AbstractSet[str]] + seen_file_to_keys: dict[Path, AbstractSet[str]] = {} for config_filepath in config_filepaths: - single_config = _load_config_file(config_filepath, ac_template=ac_template) + single_config = _load_config_file( + config_filepath, ac_template=ac_template, ac_context=ac_context + ) _check_duplicate_keys(seen_file_to_keys, config_filepath, single_config) seen_file_to_keys[config_filepath] = single_config.keys() aggregate_config.update(single_config) @@ -189,9 +194,9 @@ def _load_configs(config_filepaths: List[Path], ac_template: bool) -> Dict[str, def _lookup_config_filepaths( conf_path: Path, patterns: Iterable[str], - processed_files: Set[Path], + processed_files: set[Path], logger: Any, -) -> List[Path]: +) -> list[Path]: config_files = _path_lookup(conf_path, patterns) seen_files = config_files & processed_files @@ -207,7 +212,7 @@ def _lookup_config_filepaths( def _remove_duplicates(items: Iterable[str]): """Remove duplicates while preserving the order.""" - unique_items = [] # type: List[str] + unique_items: list[str] = [] for item in items: if item not in unique_items: unique_items.append(item) @@ -220,7 +225,7 @@ def _remove_duplicates(items: Iterable[str]): def _check_duplicate_keys( - processed_files: Dict[Path, AbstractSet[str]], filepath: Path, conf: Dict[str, Any] + processed_files: dict[Path, AbstractSet[str]], filepath: Path, conf: dict[str, Any] ) -> None: duplicates = [] @@ -229,7 +234,7 @@ def _check_duplicate_keys( if overlapping_keys: sorted_keys = ", ".join(sorted(overlapping_keys)) - if len(sorted_keys) > 100: + if len(sorted_keys) > 100: # noqa: PLR2004 sorted_keys = sorted_keys[:100] + "..." duplicates.append(f"{processed_file}: {sorted_keys}") @@ -238,7 +243,7 @@ def _check_duplicate_keys( raise ValueError(f"Duplicate keys found in {filepath} and:\n- {dup_str}") -def _path_lookup(conf_path: Path, patterns: Iterable[str]) -> Set[Path]: +def _path_lookup(conf_path: Path, patterns: Iterable[str]) -> set[Path]: """Return a set of all configuration files from ``conf_path`` or its subdirectories, which satisfy a given list of glob patterns. diff --git a/kedro/config/config.py b/kedro/config/config.py index 87d51f3e7c..5eb7b93921 100644 --- a/kedro/config/config.py +++ b/kedro/config/config.py @@ -1,37 +1,12 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """This module provides ``kedro.config`` with the functionality to load one or more configuration files from specified paths. """ +from __future__ import annotations + from pathlib import Path -from typing import Any, Dict, Iterable +from typing import Any, Iterable -from kedro.config import AbstractConfigLoader +from kedro.config.abstract_config import AbstractConfigLoader from kedro.config.common import _get_config_from_patterns, _remove_duplicates @@ -78,22 +53,22 @@ class ConfigLoader(AbstractConfigLoader): >>> import logging.config >>> from kedro.config import ConfigLoader + >>> from kedro.framework.project import settings >>> - >>> conf_loader = ConfigLoader('conf', 'local') - >>> - >>> conf_logging = conf_loader.get('logging*') - >>> logging.config.dictConfig(conf_logging) # set logging conf + >>> conf_path = str(project_path / settings.CONF_SOURCE) + >>> conf_loader = ConfigLoader(conf_source=conf_path, env="local") >>> - >>> conf_catalog = conf_loader.get('catalog*', 'catalog*/**') - >>> conf_params = conf_loader.get('**/parameters.yml') + >>> conf_catalog = conf_loader["catalog"] + >>> conf_params = conf_loader["parameters"] """ - def __init__( + def __init__( # noqa: too-many-arguments self, conf_source: str, env: str = None, - runtime_params: Dict[str, Any] = None, + runtime_params: dict[str, Any] = None, + config_patterns: dict[str, list[str]] = None, *, base_env: str = "base", default_run_env: str = "local", @@ -104,25 +79,51 @@ def __init__( conf_source: Path to use as root directory for loading configuration. env: Environment that will take precedence over base. runtime_params: Extra parameters passed to a Kedro run. + config_patterns: Regex patterns that specify the naming convention for configuration + files so they can be loaded. Can be customised by supplying config_patterns as + in `CONFIG_LOADER_ARGS` in `settings.py`. base_env: Name of the base environment. Defaults to `"base"`. This is used in the `conf_paths` property method to construct the configuration paths. - default_run_env: Name of the base environment. Defaults to `"local"`. + default_run_env: Name of the default run environment. Defaults to `"local"`. This is used in the `conf_paths` property method to construct the configuration paths. Can be overriden by supplying the `env` argument. """ - super().__init__( - conf_source=conf_source, env=env, runtime_params=runtime_params - ) self.base_env = base_env self.default_run_env = default_run_env + self.config_patterns = { + "catalog": ["catalog*", "catalog*/**", "**/catalog*"], + "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + "credentials": ["credentials*", "credentials*/**", "**/credentials*"], + } + self.config_patterns.update(config_patterns or {}) + + super().__init__( + conf_source=conf_source, + env=env, + runtime_params=runtime_params, + ) + + def __getitem__(self, key): + # Allow bypassing of loading config from patterns if a key and value have been set + # explicitly on the ``ConfigLoader`` instance. 
+ if key in self: + return super().__getitem__(key) + return self.get(*self.config_patterns[key]) + + def __repr__(self): # pragma: no cover + return ( + f"ConfigLoader(conf_source={self.conf_source}, env={self.env}, " + f"config_patterns={self.config_patterns})" + ) + @property def conf_paths(self): """Property method to return deduplicated configuration paths.""" return _remove_duplicates(self._build_conf_paths()) - def get(self, *patterns: str) -> Dict[str, Any]: + def get(self, *patterns: str) -> dict[str, Any]: # type: ignore return _get_config_from_patterns( conf_paths=self.conf_paths, patterns=list(patterns) ) diff --git a/kedro/config/default_logger.py b/kedro/config/default_logger.py deleted file mode 100644 index 699de92ab0..0000000000 --- a/kedro/config/default_logger.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""This module facilitates the loading of the default ``kedro.config`` -for setting up the logging -""" - -import logging.config -import os - -import yaml - -CURRENT_DIR = os.path.dirname(__file__) - -with open(os.path.join(CURRENT_DIR, "logging.yml"), "rt") as conf_file: - LOGGING_CONFIG = yaml.safe_load(conf_file.read()) - logging.config.dictConfig(LOGGING_CONFIG) diff --git a/kedro/config/logging.yml b/kedro/config/logging.yml deleted file mode 100644 index db2e9916bf..0000000000 --- a/kedro/config/logging.yml +++ /dev/null @@ -1,47 +0,0 @@ -version: 1 -disable_existing_loggers: False -formatters: - simple: - format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - -handlers: - console: - class: logging.StreamHandler - level: INFO - formatter: simple - stream: ext://sys.stdout - - info_file_handler: - class: logging.handlers.RotatingFileHandler - level: INFO - formatter: simple - filename: info.log - maxBytes: 10485760 # 10MB - backupCount: 20 - encoding: utf8 - delay: True - - error_file_handler: - class: logging.handlers.RotatingFileHandler - level: ERROR - formatter: simple - filename: errors.log - maxBytes: 10485760 # 10MB - backupCount: 20 - encoding: utf8 - delay: True - -loggers: - anyconfig: - level: WARNING - handlers: [console] - propagate: no - - kedro.framework.cli: - level: WARNING - handlers: [console] - propagate: no - -root: - level: INFO - handlers: [console, info_file_handler, error_file_handler] diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py new file mode 100644 index 0000000000..de41d4b213 --- /dev/null +++ b/kedro/config/omegaconf_config.py @@ -0,0 +1,342 @@ +"""This module provides ``kedro.config`` with the functionality to load one +or more configuration files of yaml or json type from specified paths through OmegaConf. +""" +from __future__ import annotations + +import io +import logging +import mimetypes +from pathlib import Path +from typing import Any, Iterable + +import fsspec +from omegaconf import OmegaConf +from omegaconf.resolvers import oc +from yaml.parser import ParserError +from yaml.scanner import ScannerError + +from kedro.config.abstract_config import AbstractConfigLoader, MissingConfigException + +_config_logger = logging.getLogger(__name__) + + +class OmegaConfigLoader(AbstractConfigLoader): + """Recursively scan directories (config paths) contained in ``conf_source`` for + configuration files with a ``yaml``, ``yml`` or ``json`` extension, load and merge + them through ``OmegaConf`` (https://omegaconf.readthedocs.io/) + and return them in the form of a config dictionary. + + The first processed config path is the ``base`` directory inside + ``conf_source``. The optional ``env`` argument can be used to specify a + subdirectory of ``conf_source`` to process as a config path after ``base``. + + When the same top-level key appears in any two config files located in + the same (sub)directory, a ``ValueError`` is raised. + + When the same key appears in any two config files located in different + (sub)directories, the last processed config path takes precedence + and overrides this key and any sub-keys. 
+ + You can access the different configurations as follows: + :: + + >>> import logging.config + >>> from kedro.config import OmegaConfigLoader + >>> from kedro.framework.project import settings + >>> + >>> conf_path = str(project_path / settings.CONF_SOURCE) + >>> conf_loader = OmegaConfigLoader(conf_source=conf_path, env="local") + >>> + >>> conf_catalog = conf_loader["catalog"] + >>> conf_params = conf_loader["parameters"] + + ``OmegaConf`` supports variable interpolation in configuration + https://omegaconf.readthedocs.io/en/2.2_branch/usage.html#merging-configurations. It is + recommended to use this instead of yaml anchors with the ``OmegaConfigLoader``. + + This version of the ``OmegaConfigLoader`` does not support any of the built-in ``OmegaConf`` + resolvers. Support for resolvers might be added in future versions. + + To use this class, change the setting for the `CONFIG_LOADER_CLASS` constant + in `settings.py`. + + Example: + :: + + >>> # in settings.py + >>> from kedro.config import OmegaConfigLoader + >>> + >>> CONFIG_LOADER_CLASS = OmegaConfigLoader + + """ + + def __init__( # noqa: too-many-arguments + self, + conf_source: str, + env: str = None, + runtime_params: dict[str, Any] = None, + *, + config_patterns: dict[str, list[str]] = None, + base_env: str = "base", + default_run_env: str = "local", + ): + """Instantiates a ``OmegaConfigLoader``. + + Args: + conf_source: Path to use as root directory for loading configuration. + env: Environment that will take precedence over base. + runtime_params: Extra parameters passed to a Kedro run. + config_patterns: Regex patterns that specify the naming convention for configuration + files so they can be loaded. Can be customised by supplying config_patterns as + in `CONFIG_LOADER_ARGS` in `settings.py`. + base_env: Name of the base environment. Defaults to `"base"`. + This is used in the `conf_paths` property method to construct + the configuration paths. + default_run_env: Name of the default run environment. Defaults to `"local"`. + Can be overridden by supplying the `env` argument. + """ + self.base_env = base_env + self.default_run_env = default_run_env + + self.config_patterns = { + "catalog": ["catalog*", "catalog*/**", "**/catalog*"], + "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + "credentials": ["credentials*", "credentials*/**", "**/credentials*"], + } + self.config_patterns.update(config_patterns or {}) + + # Deactivate oc.env built-in resolver for OmegaConf + OmegaConf.clear_resolver("oc.env") + + file_mimetype, _ = mimetypes.guess_type(conf_source) + if file_mimetype == "application/x-tar": + self._protocol = "tar" + elif file_mimetype in ( + "application/zip", + "application/x-zip-compressed", + "application/zip-compressed", + ): + self._protocol = "zip" + else: + self._protocol = "file" + self._fs = fsspec.filesystem(protocol=self._protocol, fo=conf_source) + + super().__init__( + conf_source=conf_source, + env=env, + runtime_params=runtime_params, + ) + + def __getitem__(self, key) -> dict[str, Any]: + """Get configuration files by key, load and merge them, and + return them in the form of a config dictionary. + + Args: + key: Key of the configuration type to fetch. + + Raises: + KeyError: If key provided isn't present in the config_patterns of this + ``OmegaConfigLoader`` instance. + MissingConfigException: If no configuration files exist matching the patterns + mapped to the provided key. 
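The archive handling set up in ``__init__`` above routes a packaged ``conf_source`` to the matching ``fsspec`` protocol purely from the file name. A standalone sketch of that dispatch, with illustrative paths:
::

    import mimetypes

    for conf_source in ("conf", "conf.tar.gz", "conf.zip"):
        file_mimetype, _ = mimetypes.guess_type(conf_source)
        if file_mimetype == "application/x-tar":
            protocol = "tar"
        elif file_mimetype in (
            "application/zip",
            "application/x-zip-compressed",
            "application/zip-compressed",
        ):
            protocol = "zip"
        else:
            protocol = "file"
        # conf -> file, conf.tar.gz -> tar, conf.zip -> zip
        print(conf_source, "->", protocol)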
+ + Returns: + Dict[str, Any]: A Python dictionary with the combined + configuration from all configuration files. + """ + # Allow bypassing of loading config from patterns if a key and value have been set + # explicitly on the ``OmegaConfigLoader`` instance. + if key in self: + return super().__getitem__(key) + + if key not in self.config_patterns: + raise KeyError( + f"No config patterns were found for '{key}' in your config loader" + ) + patterns = [*self.config_patterns[key]] + + read_environment_variables = key == "credentials" + + processed_files: set[Path] = set() + # Load base env config + if self._protocol == "file": + base_path = str(Path(self.conf_source) / self.base_env) + else: + base_path = str(Path(self._fs.ls("", detail=False)[-1]) / self.base_env) + base_config = self.load_and_merge_dir_config( + base_path, patterns, key, processed_files, read_environment_variables + ) + config = base_config + + # Load chosen env config + run_env = self.env or self.default_run_env + if self._protocol == "file": + env_path = str(Path(self.conf_source) / run_env) + else: + env_path = str(Path(self._fs.ls("", detail=False)[-1]) / run_env) + env_config = self.load_and_merge_dir_config( + env_path, patterns, key, processed_files, read_environment_variables + ) + # Destructively merge the two env dirs. The chosen env will override base. + common_keys = config.keys() & env_config.keys() + if common_keys: + sorted_keys = ", ".join(sorted(common_keys)) + msg = ( + "Config from path '%s' will override the following " + "existing top-level config keys: %s" + ) + _config_logger.debug(msg, env_path, sorted_keys) + + config.update(env_config) + + if not processed_files: + raise MissingConfigException( + f"No files of YAML or JSON format found in {base_path} or {env_path} matching" + f" the glob pattern(s): {[*self.config_patterns[key]]}" + ) + return config + + def __repr__(self): # pragma: no cover + return ( + f"OmegaConfigLoader(conf_source={self.conf_source}, env={self.env}, " + f"config_patterns={self.config_patterns})" + ) + + def load_and_merge_dir_config( # noqa: too-many-arguments + self, + conf_path: str, + patterns: Iterable[str], + key: str, + processed_files: set, + read_environment_variables: bool | None = False, + ) -> dict[str, Any]: + """Recursively load and merge all configuration files in a directory using OmegaConf, + which satisfy a given list of glob patterns from a specific path. + + Args: + conf_path: Path to configuration directory. + patterns: List of glob patterns to match the filenames against. + key: Key of the configuration type to fetch. + processed_files: Set of files read for a given configuration type. + read_environment_variables: Whether to resolve environment variables. + + Raises: + MissingConfigException: If configuration path doesn't exist or isn't valid. + ValueError: If two or more configuration files contain the same key(s). + ParserError: If config file contains invalid YAML or JSON syntax. + + Returns: + Resulting configuration dictionary. 
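``__getitem__`` raises ``KeyError`` for any key that has no entry in ``config_patterns``, so additional configuration groups are registered by extending the patterns from ``settings.py``. A minimal sketch, in which the ``spark`` group and its glob patterns are hypothetical:
::

    # settings.py
    from kedro.config import OmegaConfigLoader

    CONFIG_LOADER_CLASS = OmegaConfigLoader
    CONFIG_LOADER_ARGS = {
        "config_patterns": {
            "spark": ["spark*", "spark*/**", "**/spark*"],
        }
    }

    # elsewhere in the project, conf_loader["spark"] then loads and merges
    # every file matching those patterns in base/ and the run environment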
+ + """ + # noqa: too-many-locals + + if not self._fs.isdir(Path(conf_path).as_posix()): + raise MissingConfigException( + f"Given configuration path either does not exist " + f"or is not a valid directory: {conf_path}" + ) + + paths = [ + Path(each) + for pattern in patterns + for each in self._fs.glob(Path(f"{str(conf_path)}/{pattern}").as_posix()) + ] + deduplicated_paths = set(paths) + config_files_filtered = [ + path for path in deduplicated_paths if self._is_valid_config_path(path) + ] + + config_per_file = {} + for config_filepath in config_files_filtered: + try: + with self._fs.open(str(config_filepath.as_posix())) as open_config: + # As fsspec doesn't allow the file to be read as StringIO, + # this is a workaround to read it as a binary file and decode it back to utf8. + tmp_fo = io.StringIO(open_config.read().decode("utf8")) + config = OmegaConf.load(tmp_fo) + processed_files.add(config_filepath) + if read_environment_variables: + self._resolve_environment_variables(config) + config_per_file[config_filepath] = config + except (ParserError, ScannerError) as exc: + line = exc.problem_mark.line # type: ignore + cursor = exc.problem_mark.column # type: ignore + raise ParserError( + f"Invalid YAML or JSON file {Path(conf_path, config_filepath.name).as_posix()}," + f" unable to read line {line}, position {cursor}." + ) from exc + + seen_file_to_keys = { + file: set(config.keys()) for file, config in config_per_file.items() + } + aggregate_config = config_per_file.values() + self._check_duplicates(seen_file_to_keys) + + if not aggregate_config: + return {} + + if key == "parameters": + # Merge with runtime parameters only for "parameters" + return OmegaConf.to_container( + OmegaConf.merge(*aggregate_config, self.runtime_params), resolve=True + ) + return { + k: v + for k, v in OmegaConf.to_container( + OmegaConf.merge(*aggregate_config), resolve=True + ).items() + if not k.startswith("_") + } + + def _is_valid_config_path(self, path): + """Check if given path is a file path and file type is yaml or json.""" + posix_path = path.as_posix() + return self._fs.isfile(str(posix_path)) and path.suffix in [ + ".yml", + ".yaml", + ".json", + ] + + @staticmethod + def _check_duplicates(seen_files_to_keys: dict[Path, set[Any]]): + duplicates = [] + + filepaths = list(seen_files_to_keys.keys()) + for i, filepath1 in enumerate(filepaths, 1): + config1 = seen_files_to_keys[filepath1] + for filepath2 in filepaths[i:]: + config2 = seen_files_to_keys[filepath2] + + combined_keys = config1 & config2 + overlapping_keys = { + key for key in combined_keys if not key.startswith("_") + } + + if overlapping_keys: + sorted_keys = ", ".join(sorted(overlapping_keys)) + if len(sorted_keys) > 100: # noqa: PLR2004 + sorted_keys = sorted_keys[:100] + "..." + duplicates.append( + f"Duplicate keys found in {filepath1} and {filepath2}: {sorted_keys}" + ) + + if duplicates: + dup_str = "\n".join(duplicates) + raise ValueError(f"{dup_str}") + + @staticmethod + def _resolve_environment_variables(config: dict[str, Any]) -> None: + """Use the ``oc.env`` resolver to read environment variables and replace + them in-place, clearing the resolver after the operation is complete if + it was not registered beforehand. + + Arguments: + config {Dict[str, Any]} -- The configuration dictionary to resolve. 
+ """ + if not OmegaConf.has_resolver("oc.env"): + OmegaConf.register_new_resolver("oc.env", oc.env) + OmegaConf.resolve(config) + OmegaConf.clear_resolver("oc.env") + else: + OmegaConf.resolve(config) diff --git a/kedro/config/templated_config.py b/kedro/config/templated_config.py index d11c7481dd..615b75fdda 100644 --- a/kedro/config/templated_config.py +++ b/kedro/config/templated_config.py @@ -1,42 +1,17 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """This module provides ``kedro.config`` with the functionality to load one or more configuration files from specified paths, and format template strings with the values from the passed dictionary. """ +from __future__ import annotations + import re from copy import deepcopy from pathlib import Path -from typing import Any, Dict, Iterable, Optional +from typing import Any, Iterable import jmespath -from kedro.config import AbstractConfigLoader +from kedro.config.abstract_config import AbstractConfigLoader from kedro.config.common import _get_config_from_patterns, _remove_duplicates IDENTIFIER_PATTERN = re.compile( @@ -114,16 +89,17 @@ class TemplatedConfigLoader(AbstractConfigLoader): https://github.com/jmespath/jmespath.py and https://jmespath.org/. """ - def __init__( + def __init__( # noqa: too-many-arguments self, conf_source: str, env: str = None, - runtime_params: Dict[str, Any] = None, + runtime_params: dict[str, Any] = None, + config_patterns: dict[str, list[str]] = None, *, base_env: str = "base", default_run_env: str = "local", - globals_pattern: Optional[str] = None, - globals_dict: Optional[Dict[str, Any]] = None, + globals_pattern: str | None = None, + globals_dict: dict[str, Any] | None = None, ): """Instantiates a ``TemplatedConfigLoader``. @@ -131,6 +107,9 @@ def __init__( conf_source: Path to use as root directory for loading configuration. env: Environment that will take precedence over base. runtime_params: Extra parameters passed to a Kedro run. + config_patterns: Regex patterns that specify the naming convention for configuration + files so they can be loaded. 
Can be customised by supplying config_patterns as + in `CONFIG_LOADER_ARGS` in `settings.py`. base_env: default_run_env: globals_pattern: Optional keyword-only argument specifying a glob @@ -141,6 +120,13 @@ def __init__( obtained from the globals_pattern. In case of duplicate keys, the ``globals_dict`` keys take precedence. """ + self.config_patterns = { + "catalog": ["catalog*", "catalog*/**", "**/catalog*"], + "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + "credentials": ["credentials*", "credentials*/**", "**/credentials*"], + } + self.config_patterns.update(config_patterns or {}) + super().__init__( conf_source=conf_source, env=env, runtime_params=runtime_params ) @@ -150,7 +136,7 @@ def __init__( self._config_mapping = ( _get_config_from_patterns( conf_paths=self.conf_paths, - patterns=list(globals_pattern), + patterns=[globals_pattern], ac_template=False, ) if globals_pattern @@ -159,18 +145,31 @@ def __init__( globals_dict = deepcopy(globals_dict) or {} self._config_mapping = {**self._config_mapping, **globals_dict} + def __getitem__(self, key): + # Allow bypassing of loading config from patterns if a key and value have been set + # explicitly on the ``TemplatedConfigLoader`` instance. + if key in self: + return super().__getitem__(key) + return self.get(*self.config_patterns[key]) + + def __repr__(self): # pragma: no cover + return ( + f"TemplatedConfigLoader(conf_source={self.conf_source}, env={self.env}, " + f"config_patterns={self.config_patterns})" + ) + @property def conf_paths(self): """Property method to return deduplicated configuration paths.""" return _remove_duplicates(self._build_conf_paths()) - def get(self, *patterns: str) -> Dict[str, Any]: + def get(self, *patterns: str) -> dict[str, Any]: # type: ignore """Tries to resolve the template variables in the config dictionary provided by the ``ConfigLoader`` (super class) ``get`` method using the dictionary of replacement values obtained in the ``__init__`` method. Args: - patterns: Glob patterns to match. Files, which names match + *patterns: Glob patterns to match. Files, which names match any of the specified patterns, will be processed. Returns: @@ -178,15 +177,13 @@ def get(self, *patterns: str) -> Dict[str, Any]: configuration files. **Note:** any keys that start with `_` will be ignored. String values wrapped in `${...}` will be replaced with the result of the corresponding JMESpath - expression evaluated against globals (see `__init` for more - configuration files. **Note:** any keys that start with `_` - details). + expression evaluated against globals. Raises: ValueError: malformed config found. """ config_raw = _get_config_from_patterns( - conf_paths=self.conf_paths, patterns=list(patterns), ac_template=True + conf_paths=self.conf_paths, patterns=patterns, ac_template=True ) return _format_object(config_raw, self._config_mapping) @@ -198,7 +195,7 @@ def _build_conf_paths(self) -> Iterable[str]: ] -def _format_object(val: Any, format_dict: Dict[str, Any]) -> Any: +def _format_object(val: Any, format_dict: dict[str, Any]) -> Any: """Recursive function that loops through the values of a map. In case another map or a list is encountered, it calls itself. 
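Putting the ``TemplatedConfigLoader`` pieces together: ``globals_pattern`` (now passed through as a single glob, ``[globals_pattern]``) and ``globals_dict`` build the mapping used to substitute ``${...}`` expressions when a config group is requested. A minimal sketch, assuming a project ``conf/`` directory whose ``globals.yml`` defines ``bucket_name``; all names here are illustrative:
::

    from kedro.config import TemplatedConfigLoader

    conf_loader = TemplatedConfigLoader(
        conf_source="conf",
        env="local",
        globals_pattern="*globals.yml",
        globals_dict={"bucket_name": "another_bucket"},  # wins over globals.yml on clashes
    )

    # catalog entries such as `filepath: s3://${bucket_name}/cars.csv` come back resolved
    conf_catalog = conf_loader["catalog"]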
When a string is encountered, it will use the `format_dict` to replace strings that look like `${expr}`, @@ -247,8 +244,8 @@ def _format_string(match): if value is None: if match.group("default") is None: raise ValueError( - "Failed to format pattern '{}': " - "no config value found, no default provided".format(match.group(0)) + f"Failed to format pattern '{match.group(0)}': " + f"no config value found, no default provided" ) return match.group("default") @@ -262,11 +259,11 @@ def _format_string(match): formatted_key = _format_object(key, format_dict) if not isinstance(formatted_key, str): raise ValueError( - "When formatting '{}' key, only string values can be used. " - "'{}' found".format(key, formatted_key) + f"When formatting '{key}' key, only string values can be used. " + f"'{formatted_key}' found" ) - key = formatted_key + key = formatted_key # noqa: PLW2901 new_dict[key] = _format_object(value, format_dict) diff --git a/kedro/extras/__init__.py b/kedro/extras/__init__.py index 46d04812e7..5a7dd9fb59 100644 --- a/kedro/extras/__init__.py +++ b/kedro/extras/__init__.py @@ -1,30 +1,2 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.extras`` provides functionality such as datasets and extensions. """ diff --git a/kedro/extras/datasets/README.md b/kedro/extras/datasets/README.md index e15e47aeff..3058ac4ab2 100644 --- a/kedro/extras/datasets/README.md +++ b/kedro/extras/datasets/README.md @@ -1,5 +1,9 @@ # Datasets +> **Warning** +> `kedro.extras.datasets` is deprecated and will be removed in Kedro 0.19, +> install `kedro-datasets` instead by running `pip install kedro-datasets`. + Welcome to `kedro.extras.datasets`, the home of Kedro's data connectors. Here you will find `AbstractDataSet` implementations created by QuantumBlack and external contributors. ## What `AbstractDataSet` implementations are supported? @@ -8,11 +12,11 @@ We support a range of data descriptions, including CSV, Excel, Parquet, Feather, These data descriptions are supported with the APIs of `pandas`, `spark`, `networkx`, `matplotlib`, `yaml` and more. 
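The deprecation warning added to the README (and, below, to ``kedro.extras.datasets.__init__``) means existing imports keep working until Kedro 0.19 but should move to the standalone package. A minimal sketch of the migration, using ``pandas.CSVDataSet`` as an illustrative example; ``kedro-datasets`` mirrors the old module layout, so the import path is expected to stay parallel:
::

    # pip install kedro-datasets

    # deprecated location, removed in Kedro 0.19:
    # from kedro.extras.datasets.pandas import CSVDataSet

    # replacement import from the standalone package:
    from kedro_datasets.pandas import CSVDataSet

    cars = CSVDataSet(filepath="data/01_raw/cars.csv")  # illustrative path

Datasets removed outright later in this diff, such as ``APIDataSet``, are likewise expected to live on in ``kedro_datasets``.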
-[The Data Catalog](https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html) allows you to work with a range of file formats on local file systems, network file systems, cloud object stores, and Hadoop. +[The Data Catalog](https://kedro.readthedocs.io/en/stable/data/data_catalog.html) allows you to work with a range of file formats on local file systems, network file systems, cloud object stores, and Hadoop. Here is a full list of [supported data descriptions and APIs](https://kedro.readthedocs.io/en/stable/kedro.extras.datasets.html). ## How can I create my own `AbstractDataSet` implementation? -Take a look at our [instructions on how to create your own `AbstractDataSet` implementation](https://kedro.readthedocs.io/en/stable/07_extend_kedro/01_custom_datasets.html). +Take a look at our [instructions on how to create your own `AbstractDataSet` implementation](https://kedro.readthedocs.io/en/stable/extend_kedro/custom_datasets.html). diff --git a/kedro/extras/datasets/__init__.py b/kedro/extras/datasets/__init__.py index e6214db822..5397e3da98 100644 --- a/kedro/extras/datasets/__init__.py +++ b/kedro/extras/datasets/__init__.py @@ -1,31 +1,19 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.extras.datasets`` is where you can find all of Kedro's data connectors. These data connectors are implementations of the ``AbstractDataSet``. + +.. warning:: + + ``kedro.extras.datasets`` is deprecated and will be removed in Kedro 0.19. + Refer to :py:mod:`kedro_datasets` for the documentation, and + install ``kedro-datasets`` to avoid breakage by running ``pip install kedro-datasets``. 
+ """ + +from warnings import warn as _warn + +_warn( + "`kedro.extras.datasets` is deprecated and will be removed in Kedro 0.19, " + "install `kedro-datasets` instead by running `pip install kedro-datasets`.", + DeprecationWarning, + stacklevel=2, +) diff --git a/kedro/extras/datasets/api/__init__.py b/kedro/extras/datasets/api/__init__.py deleted file mode 100644 index 205ec40312..0000000000 --- a/kedro/extras/datasets/api/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``APIDataSet`` loads the data from HTTP(S) APIs -and returns them into either as string or json Dict. -It uses the python requests library: https://requests.readthedocs.io/en/master/ -""" - -__all__ = ["APIDataSet"] - -from contextlib import suppress - -with suppress(ImportError): - from .api_dataset import APIDataSet # NOQA diff --git a/kedro/extras/datasets/api/api_dataset.py b/kedro/extras/datasets/api/api_dataset.py deleted file mode 100644 index 65071b5057..0000000000 --- a/kedro/extras/datasets/api/api_dataset.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``APIDataSet`` loads the data from HTTP(S) APIs. -It uses the python requests library: https://requests.readthedocs.io/en/master/ -""" -import socket -from typing import Any, Dict, List, Tuple, Union - -import requests -from requests.auth import AuthBase - -from kedro.io.core import AbstractDataSet, DataSetError - - -class APIDataSet(AbstractDataSet): - """``APIDataSet`` loads the data from HTTP(S) APIs. - It uses the python requests library: https://requests.readthedocs.io/en/master/ - - Example: - :: - - >>> from kedro.extras.datasets.api import APIDataSet - >>> - >>> - >>> data_set = APIDataSet( - >>> url="https://quickstats.nass.usda.gov" - >>> params={ - >>> "key": "SOME_TOKEN", - >>> "format": "JSON", - >>> "commodity_desc": "CORN", - >>> "statisticcat_des": "YIELD", - >>> "agg_level_desc": "STATE", - >>> "year": 2000 - >>> } - >>> ) - >>> data = data_set.load() - """ - - # pylint: disable=too-many-arguments - def __init__( - self, - url: str, - method: str = "GET", - data: Any = None, - params: Dict[str, Any] = None, - headers: Dict[str, Any] = None, - auth: Union[Tuple[str], AuthBase] = None, - json: Union[List, Dict[str, Any]] = None, - timeout: int = 60, - ) -> None: - """Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint. - - Args: - url: The API URL endpoint. - method: The Method of the request, GET, POST, PUT, DELETE, HEAD, etc... - data: The request payload, used for POST, PUT, etc requests - https://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests - params: The url parameters of the API. - https://requests.readthedocs.io/en/master/user/quickstart/#passing-parameters-in-urls - headers: The HTTP headers. - https://requests.readthedocs.io/en/master/user/quickstart/#custom-headers - auth: Anything ``requests`` accepts. Normally it's either ``('login', 'password')``, - or ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases. - json: The request payload, used for POST, PUT, etc requests, passed in - to the json kwarg in the requests object. - https://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests - timeout: The wait time in seconds for a response, defaults to 1 minute. 
- https://requests.readthedocs.io/en/master/user/quickstart/#timeouts - - """ - super().__init__() - self._request_args: Dict[str, Any] = { - "url": url, - "method": method, - "data": data, - "params": params, - "headers": headers, - "auth": auth, - "json": json, - "timeout": timeout, - } - - def _describe(self) -> Dict[str, Any]: - return dict(**self._request_args) - - def _execute_request(self) -> requests.Response: - try: - response = requests.request(**self._request_args) - response.raise_for_status() - except requests.exceptions.HTTPError as exc: - raise DataSetError("Failed to fetch data", exc) from exc - except socket.error as exc: - raise DataSetError("Failed to connect to the remote server") from exc - - return response - - def _load(self) -> requests.Response: - return self._execute_request() - - def _save(self, data: Any) -> None: - raise DataSetError(f"{self.__class__.__name__} is a read only data set type") - - def _exists(self) -> bool: - response = self._execute_request() - - return response.ok diff --git a/kedro/extras/datasets/biosequence/__init__.py b/kedro/extras/datasets/biosequence/__init__.py index 181335adc4..9f2f1a2a2e 100644 --- a/kedro/extras/datasets/biosequence/__init__.py +++ b/kedro/extras/datasets/biosequence/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``AbstractDataSet`` implementation to read/write from/to a sequence file.""" __all__ = ["BioSequenceDataSet"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .biosequence_dataset import BioSequenceDataSet # NOQA + from .biosequence_dataset import BioSequenceDataSet diff --git a/kedro/extras/datasets/biosequence/biosequence_dataset.py b/kedro/extras/datasets/biosequence/biosequence_dataset.py index 2e2b55086d..4888158774 100644 --- a/kedro/extras/datasets/biosequence/biosequence_dataset.py +++ b/kedro/extras/datasets/biosequence/biosequence_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """BioSequenceDataSet loads and saves data to/from bio-sequence objects to file. """ @@ -38,8 +10,12 @@ from kedro.io.core import AbstractDataSet, get_filepath_str, get_protocol_and_path +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class BioSequenceDataSet(AbstractDataSet): +class BioSequenceDataSet(AbstractDataSet[List, List]): r"""``BioSequenceDataSet`` loads and saves data to a sequence file. Example: @@ -68,8 +44,7 @@ class BioSequenceDataSet(AbstractDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -130,12 +105,12 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + } def _load(self) -> List: load_path = get_filepath_str(self._filepath, self._protocol) diff --git a/kedro/extras/datasets/dask/__init__.py b/kedro/extras/datasets/dask/__init__.py index 895ded5102..d93bf4c63f 100644 --- a/kedro/extras/datasets/dask/__init__.py +++ b/kedro/extras/datasets/dask/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Provides I/O modules using dask dataframe.""" __all__ = ["ParquetDataSet"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .parquet_dataset import ParquetDataSet # NOQA + from .parquet_dataset import ParquetDataSet diff --git a/kedro/extras/datasets/dask/parquet_dataset.py b/kedro/extras/datasets/dask/parquet_dataset.py index 6e8fed3d40..08c93b1d49 100644 --- a/kedro/extras/datasets/dask/parquet_dataset.py +++ b/kedro/extras/datasets/dask/parquet_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``ParquetDataSet`` is a data set used to load and save data to parquet files using Dask dataframe""" @@ -34,47 +6,92 @@ import dask.dataframe as dd import fsspec +import triad from kedro.io.core import AbstractDataSet, get_protocol_and_path +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class ParquetDataSet(AbstractDataSet): +class ParquetDataSet(AbstractDataSet[dd.DataFrame, dd.DataFrame]): """``ParquetDataSet`` loads and saves data to parquet file(s). 
It uses Dask remote data services to handle the corresponding load and save operations: - https://docs.dask.org/en/latest/remote-data-services.html - - Example (AWS S3): - :: - - >>> from kedro.extras.datasets.dask import ParquetDataSet - >>> import pandas as pd - >>> import dask.dataframe as dd - >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - >>> 'col3': [5, 6]}) - >>> ddf = dd.from_pandas(data, npartitions=2) - >>> - >>> data_set = ParquetDataSet( - >>> filepath="s3://bucket_name/path/to/folder", - >>> credentials={ - >>> 'client_kwargs':{ - >>> 'aws_access_key_id': 'YOUR_KEY', - >>> 'aws_secret_access_key': 'YOUR SECRET', - >>> } - >>> }, - >>> save_args={"compression": "GZIP"} - >>> ) - >>> data_set.save(ddf) - >>> reloaded = data_set.load() - >>> - >>> assert ddf.compute().equals(reloaded.compute()) + https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html + + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + cars: + type: dask.ParquetDataSet + filepath: s3://bucket_name/path/to/folder + save_args: + compression: GZIP + credentials: + client_kwargs: + aws_access_key_id: YOUR_KEY + aws_secret_access_key: YOUR_SECRET + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.dask import ParquetDataSet + >>> import pandas as pd + >>> import dask.dataframe as dd + >>> + >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], + >>> 'col3': [[5, 6], [7, 8]]}) + >>> ddf = dd.from_pandas(data, npartitions=2) + >>> + >>> data_set = ParquetDataSet( + >>> filepath="s3://bucket_name/path/to/folder", + >>> credentials={ + >>> 'client_kwargs':{ + >>> 'aws_access_key_id': 'YOUR_KEY', + >>> 'aws_secret_access_key': 'YOUR SECRET', + >>> } + >>> }, + >>> save_args={"compression": "GZIP"} + >>> ) + >>> data_set.save(ddf) + >>> reloaded = data_set.load() + >>> + >>> assert ddf.compute().equals(reloaded.compute()) + + The output schema can also be explicitly specified using + `Triad `_. + This is processed to map specific columns to + `PyArrow field types `_ or schema. For instance: + + .. code-block:: yaml + + parquet_dataset: + type: dask.ParquetDataSet + filepath: "s3://bucket_name/path/to/folder" + credentials: + client_kwargs: + aws_access_key_id: YOUR_KEY + aws_secret_access_key: "YOUR SECRET" + save_args: + compression: GZIP + schema: + col1: [int32] + col2: [int32] + col3: [[int32]] """ DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"write_index": False} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -95,7 +112,7 @@ def __init__( credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. 
fs_args: Optional parameters to the backend file system driver: - https://docs.dask.org/en/latest/remote-data-services.html#optional-parameters + https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html#optional-parameters """ self._filepath = filepath self._fs_args = deepcopy(fs_args) or {} @@ -121,11 +138,11 @@ def fs_args(self) -> Dict[str, Any]: return fs_args def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - load_args=self._load_args, - save_args=self._save_args, - ) + return { + "filepath": self._filepath, + "load_args": self._load_args, + "save_args": self._save_args, + } def _load(self) -> dd.DataFrame: return dd.read_parquet( @@ -133,8 +150,59 @@ def _load(self) -> dd.DataFrame: ) def _save(self, data: dd.DataFrame) -> None: + self._process_schema() data.to_parquet(self._filepath, storage_options=self.fs_args, **self._save_args) + def _process_schema(self) -> None: + """This method processes the schema in the catalog.yml or the API, if provided. + This assumes that the schema is specified using Triad's grammar for + schema definition. + + When the value of the `schema` variable is a string, it is assumed that + it corresponds to the full schema specification for the data. + + Alternatively, if the `schema` is specified as a dictionary, then only the + columns that are specified will be strictly mapped to a field type. The other + unspecified columns, if present, will be inferred from the data. + + This method converts the Triad-parsed schema into a pyarrow schema. + The output directly supports Dask's specifications for providing a schema + when saving to a parquet file. + + Note that if a `pa.Schema` object is passed directly in the `schema` argument, no + processing will be done. Additionally, the behavior when passing a `pa.Schema` + object is assumed to be consistent with how Dask sees it. That is, it should fully + define the schema for all fields. + """ + schema = self._save_args.get("schema") + + if isinstance(schema, dict): + # The schema may contain values of different types, e.g., pa.DataType, Python types, + # strings, etc. The latter requires a transformation, then we use triad handle all + # other value types. + + # Create a schema from values that triad can handle directly + triad_schema = triad.Schema( + {k: v for k, v in schema.items() if not isinstance(v, str)} + ) + + # Handle the schema keys that are represented as string and add them to the triad schema + triad_schema.update( + triad.Schema( + ",".join( + [f"{k}:{v}" for k, v in schema.items() if isinstance(v, str)] + ) + ) + ) + + # Update the schema argument with the normalized schema + self._save_args["schema"].update( + {col: field.type for col, field in triad_schema.items()} + ) + + elif isinstance(schema, str): + self._save_args["schema"] = triad.Schema(schema).pyarrow_schema + def _exists(self) -> bool: protocol = get_protocol_and_path(self._filepath)[0] file_system = fsspec.filesystem(protocol=protocol, **self.fs_args) diff --git a/kedro/extras/datasets/email/__init__.py b/kedro/extras/datasets/email/__init__.py index c5d1338fdb..97aa7a3455 100644 --- a/kedro/extras/datasets/email/__init__.py +++ b/kedro/extras/datasets/email/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
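The ``schema`` handling added in ``_process_schema`` above is also usable from the Python API: a single Triad schema expression (or a per-column mapping) is normalised to PyArrow types before Dask writes the file. A minimal sketch, with an illustrative path and columns:
::

    from kedro.extras.datasets.dask import ParquetDataSet

    data_set = ParquetDataSet(
        filepath="s3://bucket_name/path/to/folder",
        credentials={
            "client_kwargs": {
                "aws_access_key_id": "YOUR_KEY",
                "aws_secret_access_key": "YOUR SECRET",
            }
        },
        save_args={
            "compression": "GZIP",
            # full-schema form: parsed via triad.Schema(...).pyarrow_schema on save
            "schema": "col1:int32,col2:int32,col3:[int32]",
        },
    )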
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``AbstractDataSet`` implementations for managing email messages.""" __all__ = ["EmailMessageDataSet"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .message_dataset import EmailMessageDataSet # NOQA + from .message_dataset import EmailMessageDataSet diff --git a/kedro/extras/datasets/email/message_dataset.py b/kedro/extras/datasets/email/message_dataset.py index 332d034ac4..8a725540c2 100644 --- a/kedro/extras/datasets/email/message_dataset.py +++ b/kedro/extras/datasets/email/message_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``EmailMessageDataSet`` loads/saves an email message from/to a file using an underlying filesystem (e.g.: local, S3, GCS). It uses the ``email`` package in the standard library to manage email messages. 
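A pattern that recurs throughout this diff is parametrising ``AbstractDataSet`` and ``AbstractVersionedDataSet`` with explicit load and save types (``AbstractDataSet[List, List]``, ``AbstractVersionedDataSet[Message, Message]``, and so on), so type checkers know what ``_load`` returns and what ``_save`` accepts. A minimal, hypothetical custom dataset showing the shape of that pattern:
::

    from typing import Any, Dict

    import pandas as pd

    from kedro.io.core import AbstractDataSet


    class InMemoryFrameDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]):
        """Illustrative only: keeps a DataFrame in memory."""

        def __init__(self) -> None:
            self._data = pd.DataFrame()

        def _load(self) -> pd.DataFrame:
            return self._data.copy()

        def _save(self, data: pd.DataFrame) -> None:
            self._data = data.copy()

        def _describe(self) -> Dict[str, Any]:
            return {"rows": len(self._data)}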
@@ -42,15 +14,19 @@ from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, ) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + class EmailMessageDataSet( - AbstractVersionedDataSet + AbstractVersionedDataSet[Message, Message] ): # pylint: disable=too-many-instance-attributes """``EmailMessageDataSet`` loads/saves an email message from/to a file using an underlying filesystem (e.g.: local, S3, GCS). It uses the @@ -74,7 +50,6 @@ class EmailMessageDataSet( >>> msg["From"] = '"sin studly17"' >>> msg["To"] = '"strong bad"' >>> - >>> # data_set = EmailMessageDataSet(filepath="gcs://bucket/test") >>> data_set = EmailMessageDataSet(filepath="test") >>> data_set.save(msg) >>> reloaded = data_set.load() @@ -85,8 +60,7 @@ class EmailMessageDataSet( DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -171,15 +145,15 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - parser_args=self._parser_args, - save_args=self._save_args, - generator_args=self._generator_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "parser_args": self._parser_args, + "save_args": self._save_args, + "generator_args": self._generator_args, + "version": self._version, + } def _load(self) -> Message: load_path = get_filepath_str(self._get_load_path(), self._protocol) @@ -198,7 +172,7 @@ def _save(self, data: Message) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/geopandas/__init__.py b/kedro/extras/datasets/geopandas/__init__.py index b0e7de3b10..966577fc37 100644 --- a/kedro/extras/datasets/geopandas/__init__.py +++ b/kedro/extras/datasets/geopandas/__init__.py @@ -1,35 +1,8 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -"""``GeoJSONLocalDataset`` is an ``AbstractVersionedDataSet`` to save and load GeoJSON files. +"""``GeoJSONDataSet`` is an ``AbstractVersionedDataSet`` to save and load GeoJSON files. """ __all__ = ["GeoJSONDataSet"] from contextlib import suppress with suppress(ImportError): - from .geojson_dataset import GeoJSONDataSet # NOQA + from .geojson_dataset import GeoJSONDataSet diff --git a/kedro/extras/datasets/geopandas/geojson_dataset.py b/kedro/extras/datasets/geopandas/geojson_dataset.py index 04f0c88735..88cce18dee 100644 --- a/kedro/extras/datasets/geopandas/geojson_dataset.py +++ b/kedro/extras/datasets/geopandas/geojson_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """GeoJSONDataSet loads and saves data to a local geojson file. The underlying functionality is supported by geopandas, so it supports all allowed geopandas (pandas) options for loading and saving geosjon files. @@ -39,14 +11,22 @@ from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, ) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class GeoJSONDataSet(AbstractVersionedDataSet): +class GeoJSONDataSet( + AbstractVersionedDataSet[ + gpd.GeoDataFrame, Union[gpd.GeoDataFrame, Dict[str, gpd.GeoDataFrame]] + ] +): """``GeoJSONDataSet`` loads/saves data to a GeoJSON file using an underlying filesystem (eg: local, S3, GCS). 
The underlying functionality is supported by geopandas, so it supports all @@ -61,10 +41,7 @@ class GeoJSONDataSet(AbstractVersionedDataSet): >>> >>> data = gpd.GeoDataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}, geometry=[Point(1,1), Point(2,4)]) - >>> # data_set = GeoJSONDataSet(filepath="gcs://bucket/test.geojson", - >>> save_args=None) - >>> data_set = GeoJSONDataSet(filepath="test.geojson", - >>> save_args=None) + >>> data_set = GeoJSONDataSet(filepath="test.geojson", save_args=None) >>> data_set.save(data) >>> reloaded = data_set.load() >>> @@ -75,8 +52,7 @@ class GeoJSONDataSet(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"driver": "GeoJSON"} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -96,10 +72,10 @@ def __init__( Note: `http(s)` doesn't support versioning. load_args: GeoPandas options for loading GeoJSON files. Here you can find all available arguments: - https://geopandas.org/docs/reference/api/geopandas.read_file.html#geopandas.read_file + https://geopandas.org/en/stable/docs/reference/api/geopandas.read_file.html save_args: GeoPandas options for saving geojson files. Here you can find all available arguments: - https://geopandas.org/docs/reference/api/geopandas.GeoDataFrame.to_file.html + https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_file.html The default_save_arg driver is 'GeoJSON', all others preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is @@ -158,18 +134,17 @@ def _save(self, data: gpd.GeoDataFrame) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _release(self) -> None: self.invalidate_cache() diff --git a/kedro/extras/datasets/holoviews/__init__.py b/kedro/extras/datasets/holoviews/__init__.py index 91c50ce1fd..c97bd72a6d 100644 --- a/kedro/extras/datasets/holoviews/__init__.py +++ b/kedro/extras/datasets/holoviews/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack.
The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``AbstractDataSet`` implementation to save Holoviews objects as image files.""" __all__ = ["HoloviewsWriter"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .holoviews_writer import HoloviewsWriter # NOQA + from .holoviews_writer import HoloviewsWriter diff --git a/kedro/extras/datasets/holoviews/holoviews_writer.py b/kedro/extras/datasets/holoviews/holoviews_writer.py index a4ee6facd5..2ed30f7156 100644 --- a/kedro/extras/datasets/holoviews/holoviews_writer.py +++ b/kedro/extras/datasets/holoviews/holoviews_writer.py @@ -1,54 +1,31 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """``HoloviewsWriter`` saves Holoviews objects as image file(s) to an underlying filesystem (e.g. local, S3, GCS).""" import io from copy import deepcopy from pathlib import PurePosixPath -from typing import Any, Dict, TypeVar +from typing import Any, Dict, NoReturn, TypeVar import fsspec import holoviews as hv from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, ) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + # HoloViews to be passed in `hv.save()` HoloViews = TypeVar("HoloViews") -class HoloviewsWriter(AbstractVersionedDataSet): +class HoloviewsWriter(AbstractVersionedDataSet[HoloViews, NoReturn]): """``HoloviewsWriter`` saves Holoviews objects to image file(s) in an underlying filesystem (e.g. local, S3, GCS). 
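``HoloviewsWriter`` is a save-only dataset: as the hunk below shows, ``_load`` raises a ``DatasetError``. A minimal usage sketch, assuming ``holoviews`` with the matplotlib backend is installed and the local output folder is writable::

    import holoviews as hv
    from kedro.extras.datasets.holoviews import HoloviewsWriter

    hv.extension("matplotlib")  # a rendering backend is needed for hv.save()

    curve = hv.Curve([1, 2, 3])
    writer = HoloviewsWriter(filepath="data/08_reporting/curve.png")
    writer.save(curve)  # rendered with the configured save_args (default fmt: png)
    # writer.load() is not supported and raises DatasetError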
@@ -67,8 +44,7 @@ class HoloviewsWriter(AbstractVersionedDataSet): DEFAULT_SAVE_ARGS = {"fmt": "png"} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, fs_args: Dict[str, Any] = None, @@ -93,7 +69,7 @@ def __init__( E.g. for ``S3FileSystem`` it should look like: `{'key': '', 'secret': ''}}` save_args: Extra save args passed to `holoviews.save()`. See - http://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save + https://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -126,15 +102,15 @@ def __init__( self._save_args.update(save_args) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) - - def _load(self) -> str: - raise DataSetError(f"Loading not supported for `{self.__class__.__name__}`") + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } + + def _load(self) -> NoReturn: + raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") def _save(self, data: HoloViews) -> None: bytes_buffer = io.BytesIO() diff --git a/kedro/extras/datasets/json/__init__.py b/kedro/extras/datasets/json/__init__.py index 6e5352ec59..5f023b35f4 100644 --- a/kedro/extras/datasets/json/__init__.py +++ b/kedro/extras/datasets/json/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- """``AbstractDataSet`` implementation to load/save data from/to a JSON file.""" __all__ = ["JSONDataSet"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .json_dataset import JSONDataSet # NOQA + from .json_dataset import JSONDataSet diff --git a/kedro/extras/datasets/json/json_dataset.py b/kedro/extras/datasets/json/json_dataset.py index b65d280b4f..17cc2cf69e 100644 --- a/kedro/extras/datasets/json/json_dataset.py +++ b/kedro/extras/datasets/json/json_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. """ @@ -38,25 +10,43 @@ from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, ) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class JSONDataSet(AbstractVersionedDataSet): +class JSONDataSet(AbstractVersionedDataSet[Any, Any]): """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. - Example: + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + cars: + type: json.JSONDataSet + filepath: gcs://your_bucket/cars.json + fs_args: + project: my-project + credentials: my_gcp_credentials + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.json import JSONDataSet >>> >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} >>> - >>> # data_set = JSONDataSet(filepath="gcs://bucket/test.json") >>> data_set = JSONDataSet(filepath="test.json") >>> data_set.save(data) >>> reloaded = data_set.load() @@ -66,8 +56,7 @@ class JSONDataSet(AbstractVersionedDataSet): DEFAULT_SAVE_ARGS = {"indent": 2} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, save_args: Dict[str, Any] = None, @@ -131,20 +120,20 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) - - def _load(self) -> Dict: + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } + + def _load(self) -> Any: load_path = get_filepath_str(self._get_load_path(), self._protocol) with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: return json.load(fs_file) - def _save(self, data: Dict) -> None: + def _save(self, data: Any) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: @@ -155,7 +144,7 @@ def _save(self, data: Dict) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/matplotlib/__init__.py b/kedro/extras/datasets/matplotlib/__init__.py index fe2899227d..ee2bc06466 100644 --- a/kedro/extras/datasets/matplotlib/__init__.py +++ b/kedro/extras/datasets/matplotlib/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
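With ``_load`` and ``_save`` now typed as ``Any``, the ``json.JSONDataSet`` shown above round-trips any JSON-serialisable object, not just dictionaries. A minimal sketch, assuming a writable local path::

    from kedro.extras.datasets.json import JSONDataSet

    records = [{"id": 1, "name": "alpha"}, {"id": 2, "name": "beta"}]

    data_set = JSONDataSet(filepath="records.json")
    data_set.save(records)  # serialised with the configured save_args (default indent=2)
    assert data_set.load() == records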
- """``AbstractDataSet`` implementation to save matplotlib objects as image files.""" __all__ = ["MatplotlibWriter"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .matplotlib_writer import MatplotlibWriter # NOQA + from .matplotlib_writer import MatplotlibWriter diff --git a/kedro/extras/datasets/matplotlib/matplotlib_writer.py b/kedro/extras/datasets/matplotlib/matplotlib_writer.py index aff9d557ea..00a365f2ec 100644 --- a/kedro/extras/datasets/matplotlib/matplotlib_writer.py +++ b/kedro/extras/datasets/matplotlib/matplotlib_writer.py @@ -1,99 +1,109 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - - -"""``MatplotlibWriter`` saves matplotlib objects as image file(s) to an underlying -filesystem (e.g. local, S3, GCS).""" +"""``MatplotlibWriter`` saves one or more Matplotlib objects as image +files to an underlying filesystem (e.g. local, S3, GCS).""" import io from copy import deepcopy from pathlib import PurePosixPath -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, NoReturn, Union +from warnings import warn import fsspec import matplotlib.pyplot as plt from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, ) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) -class MatplotlibWriter(AbstractVersionedDataSet): - """``MatplotlibWriter`` saves matplotlib objects to image file(s) in an underlying - filesystem (e.g. local, S3, GCS). - Example: +class MatplotlibWriter( + AbstractVersionedDataSet[ + Union[plt.figure, List[plt.figure], Dict[str, plt.figure]], NoReturn + ] +): + """``MatplotlibWriter`` saves one or more Matplotlib objects as + image files to an underlying filesystem (e.g. local, S3, GCS). + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + output_plot: + type: matplotlib.MatplotlibWriter + filepath: data/08_reporting/output_plot.png + save_args: + format: png + + Example usage for the + `Python API `_: :: >>> import matplotlib.pyplot as plt >>> from kedro.extras.datasets.matplotlib import MatplotlibWriter >>> - >>> # Saving single plot - >>> plt.plot([1, 2, 3], [4, 5, 6]) - >>> single_plot_writer = MatplotlibWriter( - >>> filepath="matplot_lib_single_plot.png" + >>> fig = plt.figure() + >>> plt.plot([1, 2, 3]) + >>> plot_writer = MatplotlibWriter( + >>> filepath="data/08_reporting/output_plot.png" >>> ) >>> plt.close() - >>> single_plot_writer.save(plt) + >>> plot_writer.save(fig) + + Example saving a plot as a PDF file: + :: + + >>> import matplotlib.pyplot as plt + >>> from kedro.extras.datasets.matplotlib import MatplotlibWriter >>> - >>> # MatplotlibWriter can output other formats as well, such as PDF files. - >>> # For this, we need to specify the format: - >>> plt.plot([1, 2, 3], [4, 5, 6]) - >>> single_plot_writer = MatplotlibWriter( - >>> filepath="matplot_lib_single_plot.pdf", + >>> fig = plt.figure() + >>> plt.plot([1, 2, 3]) + >>> pdf_plot_writer = MatplotlibWriter( + >>> filepath="data/08_reporting/output_plot.pdf", >>> save_args={"format": "pdf"}, >>> ) >>> plt.close() - >>> single_plot_writer.save(plt) + >>> pdf_plot_writer.save(fig) + + Example saving multiple plots in a folder, using a dictionary: + :: + + >>> import matplotlib.pyplot as plt + >>> from kedro.extras.datasets.matplotlib import MatplotlibWriter >>> - >>> # Saving dictionary of plots - >>> plots_dict = dict() + >>> plots_dict = {} >>> for colour in ["blue", "green", "red"]: - >>> plots_dict[colour] = plt.figure() - >>> plt.plot([1, 2, 3], [4, 5, 6], color=colour) + >>> plots_dict[f"{colour}.png"] = plt.figure() + >>> plt.plot([1, 2, 3], color=colour) + >>> >>> plt.close("all") >>> dict_plot_writer = MatplotlibWriter( - >>> filepath="matplotlib_dict" + >>> filepath="data/08_reporting/plots" >>> ) >>> dict_plot_writer.save(plots_dict) + + Example saving multiple plots in a folder, using a list: + :: + + >>> import matplotlib.pyplot as plt + >>> from kedro.extras.datasets.matplotlib import MatplotlibWriter >>> - >>> # Saving list of plots >>> plots_list = [] - >>> for index in range(5): + >>> for i in range(5): >>> plots_list.append(plt.figure()) - >>> plt.plot([1,2,3],[4,5,6]) + >>> plt.plot([i, i + 1, i + 2]) >>> plt.close("all") >>> list_plot_writer = MatplotlibWriter( - >>> filepath="matplotlib_list" + >>> filepath="data/08_reporting/plots" >>> ) >>> list_plot_writer.save(plots_list) @@ -101,21 +111,21 @@ class MatplotlibWriter(AbstractVersionedDataSet): DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, fs_args: Dict[str, Any] = None, credentials: Dict[str, Any] = None, save_args: Dict[str, Any] = None, version: Version = None, + overwrite: bool = False, ) -> None: """Creates a new instance of ``MatplotlibWriter``. Args: - filepath: Filepath in POSIX format to a matplot object file(s) prefixed with a protocol - like `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be - used. The prefix should be any protocol supported by ``fsspec``. + filepath: Filepath in POSIX format to save Matplotlib objects to, prefixed with a + protocol like `s3://`. If prefix is not provided, `file` protocol (local filesystem) + will be used. The prefix should be any protocol supported by ``fsspec``. 
fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as to pass to the filesystem's `open` method through nested key `open_args_save`. @@ -131,6 +141,9 @@ def __init__( ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` attribute is None, save version will be autogenerated. + overwrite: If True, any existing image files will be removed. + Only relevant when saving multiple Matplotlib objects at + once. """ _credentials = deepcopy(credentials) or {} _fs_args = deepcopy(fs_args) or {} @@ -158,22 +171,34 @@ def __init__( if save_args is not None: self._save_args.update(save_args) + if overwrite and version is not None: + warn( + "Setting 'overwrite=True' is ineffective if versioning " + "is enabled, since the versioned path must not already " + "exist; overriding flag with 'overwrite=False' instead." + ) + overwrite = False + self._overwrite = overwrite + def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } - def _load(self) -> None: - raise DataSetError(f"Loading not supported for `{self.__class__.__name__}`") + def _load(self) -> NoReturn: + raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") def _save( self, data: Union[plt.figure, List[plt.figure], Dict[str, plt.figure]] ) -> None: save_path = self._get_save_path() + if isinstance(data, (list, dict)) and self._overwrite and self._exists(): + self._fs.rm(get_filepath_str(save_path, self._protocol), recursive=True) + if isinstance(data, list): for index, plot in enumerate(data): full_key_path = get_filepath_str( diff --git a/kedro/extras/datasets/networkx/__init__.py b/kedro/extras/datasets/networkx/__init__.py index 377692cc6c..73674c81fe 100644 --- a/kedro/extras/datasets/networkx/__init__.py +++ b/kedro/extras/datasets/networkx/__init__.py @@ -1,37 +1,15 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
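The new ``overwrite`` flag on ``MatplotlibWriter`` above only matters when a list or dictionary of figures is written into a folder, and it is ignored when versioning is enabled (see the warning added in ``__init__``). A small sketch of the intended use, assuming a local ``data/08_reporting/plots`` target::

    import matplotlib.pyplot as plt
    from kedro.extras.datasets.matplotlib import MatplotlibWriter

    plots = {}
    for colour in ["blue", "green", "red"]:
        plots[f"{colour}.png"] = plt.figure()
        plt.plot([1, 2, 3], color=colour)
    plt.close("all")

    # Files already present under the target folder are removed before saving,
    # because overwrite=True and the dataset is not versioned.
    writer = MatplotlibWriter(filepath="data/08_reporting/plots", overwrite=True)
    writer.save(plots)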
-# -# See the License for the specific language governing permissions and -# limitations under the License. - """``AbstractDataSet`` implementation to save and load NetworkX graphs in JSON -format using ``NetworkX``.""" +, GraphML and GML formats using ``NetworkX``.""" -__all__ = ["NetworkXDataSet"] +__all__ = ["GMLDataSet", "GraphMLDataSet", "JSONDataSet"] from contextlib import suppress with suppress(ImportError): - from .networkx_dataset import NetworkXDataSet # NOQA + from .gml_dataset import GMLDataSet + +with suppress(ImportError): + from .graphml_dataset import GraphMLDataSet + +with suppress(ImportError): + from .json_dataset import JSONDataSet diff --git a/kedro/extras/datasets/networkx/gml_dataset.py b/kedro/extras/datasets/networkx/gml_dataset.py new file mode 100644 index 0000000000..d48f7d37e2 --- /dev/null +++ b/kedro/extras/datasets/networkx/gml_dataset.py @@ -0,0 +1,143 @@ +"""NetworkX ``GMLDataSet`` loads and saves graphs to a graph modelling language (GML) +file using an underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to +create GML data. +""" + +from copy import deepcopy +from pathlib import PurePosixPath +from typing import Any, Dict + +import fsspec +import networkx + +from kedro.io.core import ( + AbstractVersionedDataSet, + Version, + get_filepath_str, + get_protocol_and_path, +) + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + + +class GMLDataSet(AbstractVersionedDataSet[networkx.Graph, networkx.Graph]): + """``GMLDataSet`` loads and saves graphs to a GML file using an + underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to + create GML data. + See https://networkx.org/documentation/stable/tutorial.html for details. + + Example: + :: + + >>> from kedro.extras.datasets.networkx import GMLDataSet + >>> import networkx as nx + >>> graph = nx.complete_graph(100) + >>> graph_dataset = GMLDataSet(filepath="test.gml") + >>> graph_dataset.save(graph) + >>> reloaded = graph_dataset.load() + >>> assert nx.is_isomorphic(graph, reloaded) + + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + def __init__( # noqa: too-many-arguments + self, + filepath: str, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + version: Version = None, + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``GMLDataSet``. + + Args: + filepath: Filepath in POSIX format to the NetworkX GML file. + load_args: Arguments passed on to ``networkx.read_gml``. + See the details in + https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.gml.read_gml.html + save_args: Arguments passed on to ``networkx.write_gml``. + See the details in + https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.gml.write_gml.html + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. 
`{"project": "my-project"}` for ``GCSFileSystem``), as well as + to pass to the filesystem's `open` method through nested keys + `open_args_load` and `open_args_save`. + Here you can find all available arguments for `open`: + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open + All defaults are preserved, except `mode`, which is set to `r` when loading + and to `w` when saving. + """ + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath, version) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + _fs_open_args_load.setdefault("mode", "rb") + _fs_open_args_save.setdefault("mode", "wb") + self._fs_open_args_load = _fs_open_args_load + self._fs_open_args_save = _fs_open_args_save + + def _load(self) -> networkx.Graph: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: + data = networkx.read_gml(fs_file, **self._load_args) + return data + + def _save(self, data: networkx.Graph) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + networkx.write_gml(data, fs_file, **self._save_args) + self._invalidate_cache() + + def _exists(self) -> bool: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + return self._fs.exists(load_path) + + def _describe(self) -> Dict[str, Any]: + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + """Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) diff --git a/kedro/extras/datasets/networkx/graphml_dataset.py b/kedro/extras/datasets/networkx/graphml_dataset.py new file mode 100644 index 0000000000..54f5d496f7 --- /dev/null +++ b/kedro/extras/datasets/networkx/graphml_dataset.py @@ -0,0 +1,141 @@ +"""NetworkX ``GraphMLDataSet`` loads and saves graphs to a GraphML file using an underlying +filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to create GraphML data. +""" + +from copy import deepcopy +from pathlib import PurePosixPath +from typing import Any, Dict + +import fsspec +import networkx + +from kedro.io.core import ( + AbstractVersionedDataSet, + Version, + get_filepath_str, + get_protocol_and_path, +) + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0.
+# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + + +class GraphMLDataSet(AbstractVersionedDataSet[networkx.Graph, networkx.Graph]): + """``GraphMLDataSet`` loads and saves graphs to a GraphML file using an + underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to + create GraphML data. + See https://networkx.org/documentation/stable/tutorial.html for details. + + Example: + :: + + >>> from kedro.extras.datasets.networkx import GraphMLDataSet + >>> import networkx as nx + >>> graph = nx.complete_graph(100) + >>> graph_dataset = GraphMLDataSet(filepath="test.graphml") + >>> graph_dataset.save(graph) + >>> reloaded = graph_dataset.load() + >>> assert nx.is_isomorphic(graph, reloaded) + + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + def __init__( # noqa: too-many-arguments + self, + filepath: str, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + version: Version = None, + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``GraphMLDataSet``. + + Args: + filepath: Filepath in POSIX format to the NetworkX GraphML file. + load_args: Arguments passed on to ``networkx.read_graphml``. + See the details in + https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.graphml.read_graphml.html + save_args: Arguments passed on to ``networkx.write_graphml``. + See the details in + https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.graphml.write_graphml.html + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as + to pass to the filesystem's `open` method through nested keys + `open_args_load` and `open_args_save`. + Here you can find all available arguments for `open`: + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open + All defaults are preserved, except `mode`, which is set to `r` when loading + and to `w` when saving. 
+ """ + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath, version) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + _fs_open_args_load.setdefault("mode", "rb") + _fs_open_args_save.setdefault("mode", "wb") + self._fs_open_args_load = _fs_open_args_load + self._fs_open_args_save = _fs_open_args_save + + def _load(self) -> networkx.Graph: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: + return networkx.read_graphml(fs_file, **self._load_args) + + def _save(self, data: networkx.Graph) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + networkx.write_graphml(data, fs_file, **self._save_args) + self._invalidate_cache() + + def _exists(self) -> bool: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + return self._fs.exists(load_path) + + def _describe(self) -> Dict[str, Any]: + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + """Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) diff --git a/kedro/extras/datasets/networkx/networkx_dataset.py b/kedro/extras/datasets/networkx/json_dataset.py similarity index 68% rename from kedro/extras/datasets/networkx/networkx_dataset.py rename to kedro/extras/datasets/networkx/json_dataset.py index 4940821470..4ae9940601 100644 --- a/kedro/extras/datasets/networkx/networkx_dataset.py +++ b/kedro/extras/datasets/networkx/json_dataset.py @@ -1,33 +1,4 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack.
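Both NetworkX datasets added above forward ``credentials`` and ``fs_args`` straight to ``fsspec.filesystem``, so remote storage works the same way as for the other datasets in this patch. A sketch with a hypothetical S3 location and placeholder credentials (bucket name and keys are illustrative only)::

    import networkx as nx
    from kedro.extras.datasets.networkx import GraphMLDataSet

    graph_dataset = GraphMLDataSet(
        filepath="s3://my-bucket/graphs/network.graphml",  # hypothetical bucket
        credentials={"key": "<aws-access-key-id>", "secret": "<aws-secret-access-key>"},
        fs_args={"open_args_save": {"mode": "wb"}},  # "wb" is already the default
    )
    graph_dataset.save(nx.complete_graph(10))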
The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - - -"""``NetworkXDataSet`` loads and saves graphs to a JSON file using an underlying +"""``JSONDataSet`` loads and saves graphs to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to create JSON data. """ @@ -46,9 +17,13 @@ get_protocol_and_path, ) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) -class NetworkXDataSet(AbstractVersionedDataSet): - """``NetworkXDataSet`` loads and saves graphs to a JSON file using an + +class JSONDataSet(AbstractVersionedDataSet[networkx.Graph, networkx.Graph]): + """NetworkX ``JSONDataSet`` loads and saves graphs to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to create JSON data. See https://networkx.org/documentation/stable/tutorial.html for details. @@ -56,10 +31,10 @@ class NetworkXDataSet(AbstractVersionedDataSet): Example: :: - >>> from kedro.extras.datasets.networkx import NetworkXDataSet + >>> from kedro.extras.datasets.networkx import JSONDataSet >>> import networkx as nx >>> graph = nx.complete_graph(100) - >>> graph_dataset = NetworkXDataSet(filepath="test.json") + >>> graph_dataset = JSONDataSet(filepath="test.json") >>> graph_dataset.save(graph) >>> reloaded = graph_dataset.load() >>> assert nx.is_isomorphic(graph, reloaded) @@ -69,8 +44,7 @@ class NetworkXDataSet(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -79,14 +53,14 @@ def __init__( credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, ) -> None: - """Creates a new instance of ``NetworkXDataSet``. + """Creates a new instance of ``JSONDataSet``. Args: filepath: Filepath in POSIX format to the NetworkX graph JSON file. - load_args: Arguments passed on to ```networkx.node_link_graph``. + load_args: Arguments passed on to ``networkx.node_link_graph``. See the details in https://networkx.org/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html - save_args: Arguments passed on to ```networkx.node_link_data``. + save_args: Arguments passed on to ``networkx.node_link_data``. 
See the details in + https://networkx.org/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html version: If specified, should be an instance of @@ -157,13 +131,12 @@ def _exists(self) -> bool: return self._fs.exists(load_path) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _release(self) -> None: super()._release() diff --git a/kedro/extras/datasets/pandas/__init__.py b/kedro/extras/datasets/pandas/__init__.py index 18a1b3bd73..b84015d1d9 100644 --- a/kedro/extras/datasets/pandas/__init__.py +++ b/kedro/extras/datasets/pandas/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``AbstractDataSet`` implementations that produce pandas DataFrames.""" __all__ = [ @@ -33,29 +5,35 @@ "ExcelDataSet", "FeatherDataSet", "GBQTableDataSet", - "ExcelDataSet", + "GBQQueryDataSet", "HDFDataSet", "JSONDataSet", "ParquetDataSet", "SQLQueryDataSet", "SQLTableDataSet", + "XMLDataSet", + "GenericDataSet", ] from contextlib import suppress with suppress(ImportError): - from .csv_dataset import CSVDataSet # NOQA + from .csv_dataset import CSVDataSet +with suppress(ImportError): + from .excel_dataset import ExcelDataSet +with suppress(ImportError): + from .feather_dataset import FeatherDataSet with suppress(ImportError): - from .excel_dataset import ExcelDataSet # NOQA + from .gbq_dataset import GBQQueryDataSet, GBQTableDataSet with suppress(ImportError): - from .feather_dataset import FeatherDataSet # NOQA + from .hdf_dataset import HDFDataSet with suppress(ImportError): - from .gbq_dataset import GBQTableDataSet # NOQA + from .json_dataset import JSONDataSet with suppress(ImportError): - from .hdf_dataset import HDFDataSet # NOQA + from .parquet_dataset import ParquetDataSet with suppress(ImportError): - from .json_dataset import JSONDataSet # NOQA + from .sql_dataset import SQLQueryDataSet, SQLTableDataSet with suppress(ImportError): - from .parquet_dataset import ParquetDataSet # NOQA + from .xml_dataset import XMLDataSet with suppress(ImportError): - from .sql_dataset import SQLQueryDataSet, SQLTableDataSet # NOQA + from .generic_dataset import GenericDataSet diff --git a/kedro/extras/datasets/pandas/csv_dataset.py b/kedro/extras/datasets/pandas/csv_dataset.py index 37fe58f69d..597d03ecf9 100644 --- a/kedro/extras/datasets/pandas/csv_dataset.py +++ b/kedro/extras/datasets/pandas/csv_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``CSVDataSet`` loads/saves data from/to a CSV file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the CSV file. """ @@ -41,7 +13,7 @@ from kedro.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, @@ -49,12 +21,40 @@ logger = logging.getLogger(__name__) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class CSVDataSet(AbstractVersionedDataSet): +class CSVDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``CSVDataSet`` loads/saves data from/to a CSV file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the CSV file. - Example: + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + cars: + type: pandas.CSVDataSet + filepath: data/01_raw/company/cars.csv + load_args: + sep: "," + na_values: ["#NA", NA] + save_args: + index: False + date_format: "%Y-%m-%d %H:%M" + decimal: . + + motorbikes: + type: pandas.CSVDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.csv + credentials: dev_s3 + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pandas import CSVDataSet @@ -63,7 +63,6 @@ class CSVDataSet(AbstractVersionedDataSet): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = CSVDataSet(filepath="gcs://bucket/test.csv") >>> data_set = CSVDataSet(filepath="test.csv") >>> data_set.save(data) >>> reloaded = data_set.load() @@ -74,8 +73,7 @@ class CSVDataSet(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"index": False} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -137,21 +135,20 @@ def __init__( if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( - "Dropping `storage_options` for %s, " - "please specify them under `fs_args` or `credentials`.", + "Dropping 'storage_options' for %s, " + "please specify them under 'fs_args' or 'credentials'.", self._filepath, ) self._save_args.pop("storage_options", None) self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = str(self._get_load_path()) @@ -181,7 +178,7 @@ def _save(self, data: pd.DataFrame) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py index bad2a674a7..05c1144721 100644 --- a/kedro/extras/datasets/pandas/excel_dataset.py +++ b/kedro/extras/datasets/pandas/excel_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``ExcelDataSet`` loads/saves data from/to a Excel file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Excel file. """ @@ -33,7 +5,7 @@ from copy import deepcopy from io import BytesIO from pathlib import PurePosixPath -from typing import Any, Dict +from typing import Any, Dict, Union import fsspec import pandas as pd @@ -41,7 +13,7 @@ from kedro.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, @@ -49,12 +21,44 @@ logger = logging.getLogger(__name__) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class ExcelDataSet(AbstractVersionedDataSet): +class ExcelDataSet( + AbstractVersionedDataSet[ + Union[pd.DataFrame, Dict[str, pd.DataFrame]], + Union[pd.DataFrame, Dict[str, pd.DataFrame]], + ] +): """``ExcelDataSet`` loads/saves data from/to a Excel file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Excel file. - Example: + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + rockets: + type: pandas.ExcelDataSet + filepath: gcs://your_bucket/rockets.xlsx + fs_args: + project: my-project + credentials: my_gcp_credentials + save_args: + sheet_name: Sheet1 + load_args: + sheet_name: Sheet1 + + shuttles: + type: pandas.ExcelDataSet + filepath: data/01_raw/shuttles.xlsx + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pandas import ExcelDataSet @@ -63,19 +67,53 @@ class ExcelDataSet(AbstractVersionedDataSet): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = ExcelDataSet(filepath="gcs://bucket/test.xlsx") >>> data_set = ExcelDataSet(filepath="test.xlsx") >>> data_set.save(data) >>> reloaded = data_set.load() >>> assert data.equals(reloaded) + To save a multi-sheet Excel file, no special ``save_args`` are required. + Instead, return a dictionary of ``Dict[str, pd.DataFrame]`` where the string + keys are your sheet names. + + Example usage for the + `YAML API `_ + for a multi-sheet Excel file: + + .. 
code-block:: yaml + + trains: + type: pandas.ExcelDataSet + filepath: data/02_intermediate/company/trains.xlsx + load_args: + sheet_name: [Sheet1, Sheet2, Sheet3] + + Example usage for the + `Python API `_ + for a multi-sheet Excel file: + :: + + >>> from kedro.extras.datasets.pandas import ExcelDataSet + >>> import pandas as pd + >>> + >>> dataframe = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], + >>> 'col3': [5, 6]}) + >>> another_dataframe = pd.DataFrame({"x": [10, 20], "y": ["hello", "world"]}) + >>> multiframe = {"Sheet1": dataframe, "Sheet2": another_dataframe} + >>> data_set = ExcelDataSet(filepath="test.xlsx", load_args = {"sheet_name": None}) + >>> data_set.save(multiframe) + >>> reloaded = data_set.load() + >>> assert multiframe["Sheet1"].equals(reloaded["Sheet1"]) + >>> assert multiframe["Sheet2"].equals(reloaded["Sheet2"]) + """ DEFAULT_LOAD_ARGS = {"engine": "openpyxl"} DEFAULT_SAVE_ARGS = {"index": False} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, engine: str = "openpyxl", @@ -93,12 +131,13 @@ def __init__( `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used. The prefix should be any protocol supported by ``fsspec``. Note: `http(s)` doesn't support versioning. - engine: The engine used to write to excel files. The default + engine: The engine used to write to Excel files. The default engine is 'openpyxl'. load_args: Pandas options for loading Excel files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html All defaults are preserved, but "engine", which is set to "openpyxl". + Supports multi-sheet Excel files (include `sheet_name = None` in `load_args`). save_args: Pandas options for saving Excel files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html @@ -117,7 +156,7 @@ def __init__( (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). Raises: - DataSetError: If versioning is enabled while in append mode. + DatasetError: If versioning is enabled while in append mode. """ _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -150,30 +189,30 @@ def __init__( self._writer_args.setdefault("engine", engine or "openpyxl") # type: ignore if version and self._writer_args.get("mode") == "a": # type: ignore - raise DataSetError( - "`ExcelDataSet` doesn't support versioning in append mode." + raise DatasetError( + "'ExcelDataSet' doesn't support versioning in append mode." 
) if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( - "Dropping `storage_options` for %s, " - "please specify them under `fs_args` or `credentials`.", + "Dropping 'storage_options' for %s, " + "please specify them under 'fs_args' or 'credentials'.", self._filepath, ) self._save_args.pop("storage_options", None) self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - writer_args=self._writer_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "writer_args": self._writer_args, + "version": self._version, + } - def _load(self) -> pd.DataFrame: + def _load(self) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: load_path = str(self._get_load_path()) if self._protocol == "file": # file:// protocol seems to misbehave on Windows @@ -187,13 +226,19 @@ def _load(self) -> pd.DataFrame: load_path, storage_options=self._storage_options, **self._load_args ) - def _save(self, data: pd.DataFrame) -> None: + def _save(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]) -> None: output = BytesIO() save_path = get_filepath_str(self._get_save_path(), self._protocol) # pylint: disable=abstract-class-instantiated with pd.ExcelWriter(output, **self._writer_args) as writer: - data.to_excel(writer, **self._save_args) + if isinstance(data, dict): + for sheet_name, sheet_data in data.items(): + sheet_data.to_excel( + writer, sheet_name=sheet_name, **self._save_args + ) + else: + data.to_excel(writer, **self._save_args) with self._fs.open(save_path, mode="wb") as fs_file: fs_file.write(output.getvalue()) @@ -203,7 +248,7 @@ def _save(self, data: pd.DataFrame) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/pandas/feather_dataset.py b/kedro/extras/datasets/pandas/feather_dataset.py index ad7ea0f81a..534d84d9bf 100644 --- a/kedro/extras/datasets/pandas/feather_dataset.py +++ b/kedro/extras/datasets/pandas/feather_dataset.py @@ -1,32 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - - """``FeatherDataSet`` is a data set used to load and save data to feather files using an underlying filesystem (e.g.: local, S3, GCS). The underlying functionality is supported by pandas, so it supports all operations the pandas supports. @@ -50,14 +21,38 @@ logger = logging.getLogger(__name__) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) -class FeatherDataSet(AbstractVersionedDataSet): + +class FeatherDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``FeatherDataSet`` loads and saves data to a feather file using an underlying filesystem (e.g.: local, S3, GCS). The underlying functionality is supported by pandas, so it supports all allowed pandas options for loading and saving csv files. - Example: + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + cars: + type: pandas.FeatherDataSet + filepath: data/01_raw/company/cars.feather + load_args: + columns: ['col1', 'col2', 'col3'] + use_threads: True + + motorbikes: + type: pandas.FeatherDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.feather + credentials: dev_s3 + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pandas import FeatherDataSet @@ -66,7 +61,6 @@ class FeatherDataSet(AbstractVersionedDataSet): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = FeatherDataSet(filepath="gcs://bucket/test.feather") >>> data_set = FeatherDataSet(filepath="test.feather") >>> >>> data_set.save(data) @@ -79,8 +73,7 @@ class FeatherDataSet(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -142,20 +135,20 @@ def __init__( if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( - "Dropping `storage_options` for %s, " - "please specify them under `fs_args` or `credentials`.", + "Dropping 'storage_options' for %s, " + "please specify them under 'fs_args' or 'credentials'.", self._filepath, ) self._save_args.pop("storage_options", None) self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = str(self._get_load_path()) diff --git a/kedro/extras/datasets/pandas/gbq_dataset.py b/kedro/extras/datasets/pandas/gbq_dataset.py index 36e045f71c..dda5cf9d35 100644 --- a/kedro/extras/datasets/pandas/gbq_dataset.py +++ b/kedro/extras/datasets/pandas/gbq_dataset.py @@ -1,51 +1,54 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the 
"License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``GBQTableDataSet`` loads and saves data from/to Google BigQuery. It uses pandas-gbq to read and write from/to BigQuery table. """ import copy -from typing import Any, Dict, Union +from pathlib import PurePosixPath +from typing import Any, Dict, NoReturn, Union +import fsspec import pandas as pd from google.cloud import bigquery from google.cloud.exceptions import NotFound from google.oauth2.credentials import Credentials -from kedro.io.core import AbstractDataSet, DataSetError, validate_on_forbidden_chars +from kedro.io.core import ( + AbstractDataSet, + DatasetError, + get_filepath_str, + get_protocol_and_path, + validate_on_forbidden_chars, +) + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) -class GBQTableDataSet(AbstractDataSet): +class GBQTableDataSet(AbstractDataSet[None, pd.DataFrame]): """``GBQTableDataSet`` loads and saves data from/to Google BigQuery. It uses pandas-gbq to read and write from/to BigQuery table. - Example: + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + vehicles: + type: pandas.GBQTableDataSet + dataset: big_query_dataset + table_name: big_query_table + project: my-project + credentials: gbq-creds + load_args: + reauth: True + save_args: + chunk_size: 100 + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pandas import GBQTableDataSet @@ -67,8 +70,7 @@ class GBQTableDataSet(AbstractDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"progress_bar": False} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, dataset: str, table_name: str, @@ -100,7 +102,7 @@ def __init__( All defaults are preserved, but "progress_bar", which is set to False. Raises: - DataSetError: When ``load_args['location']`` and ``save_args['location']`` + DatasetError: When ``load_args['location']`` and ``save_args['location']`` are different. 
""" # Handle default load and save arguments @@ -128,12 +130,12 @@ def __init__( ) def _describe(self) -> Dict[str, Any]: - return dict( - dataset=self._dataset, - table_name=self._table_name, - load_args=self._load_args, - save_args=self._save_args, - ) + return { + "dataset": self._dataset, + "table_name": self._table_name, + "load_args": self._load_args, + "save_args": self._save_args, + } def _load(self) -> pd.DataFrame: sql = f"select * from {self._dataset}.{self._table_name}" # nosec @@ -165,9 +167,147 @@ def _validate_location(self): load_location = self._load_args.get("location") if save_location != load_location: - raise DataSetError( - "`load_args['location']` is different from `save_args['location']`. " - "The `location` defines where BigQuery data is stored, therefore has " + raise DatasetError( + """"load_args['location']" is different from "save_args['location']". """ + "The 'location' defines where BigQuery data is stored, therefore has " "to be the same for save and load args. " "Details: https://cloud.google.com/bigquery/docs/locations" ) + + +class GBQQueryDataSet(AbstractDataSet[None, pd.DataFrame]): + """``GBQQueryDataSet`` loads data from a provided SQL query from Google + BigQuery. It uses ``pandas.read_gbq`` which itself uses ``pandas-gbq`` + internally to read from BigQuery table. Therefore it supports all allowed + pandas options on ``read_gbq``. + + Example adding a catalog entry with the ``YAML API``: + + .. code-block:: yaml + + >>> vehicles: + >>> type: pandas.GBQQueryDataSet + >>> sql: "select shuttle, shuttle_id from spaceflights.shuttles;" + >>> project: my-project + >>> credentials: gbq-creds + >>> load_args: + >>> reauth: True + + + Example using Python API: + :: + + >>> from kedro.extras.datasets.pandas import GBQQueryDataSet + >>> + >>> sql = "SELECT * FROM dataset_1.table_a" + >>> + >>> data_set = GBQQueryDataSet(sql, project='my-project') + >>> + >>> sql_data = data_set.load() + >>> + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + + def __init__( # noqa: too-many-arguments + self, + sql: str = None, + project: str = None, + credentials: Union[Dict[str, Any], Credentials] = None, + load_args: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + filepath: str = None, + ) -> None: + """Creates a new instance of ``GBQQueryDataSet``. + + Args: + sql: The sql query statement. + project: Google BigQuery Account project ID. + Optional when available from the environment. + https://cloud.google.com/resource-manager/docs/creating-managing-projects + credentials: Credentials for accessing Google APIs. + Either ``google.auth.credentials.Credentials`` object or dictionary with + parameters required to instantiate ``google.oauth2.credentials.Credentials``. + Here you can find all the arguments: + https://google-auth.readthedocs.io/en/latest/reference/google.oauth2.credentials.html + load_args: Pandas options for loading BigQuery table into DataFrame. + Here you can find all available arguments: + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html + All defaults are preserved. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``) used for reading the + SQL query from filepath. + filepath: A path to a file with a sql query statement. + + Raises: + DatasetError: When ``sql`` and ``filepath`` parameters are either both empty + or both provided, as well as when the `save()` method is invoked. 
+ """ + if sql and filepath: + raise DatasetError( + "'sql' and 'filepath' arguments cannot both be provided." + "Please only provide one." + ) + + if not (sql or filepath): + raise DatasetError( + "'sql' and 'filepath' arguments cannot both be empty." + "Please provide a sql query or path to a sql query file." + ) + + # Handle default load arguments + self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + + self._project_id = project + + if isinstance(credentials, dict): + credentials = Credentials(**credentials) + + self._credentials = credentials + self._client = bigquery.Client( + project=self._project_id, + credentials=self._credentials, + location=self._load_args.get("location"), + ) + + # load sql query from arg or from file + if sql: + self._load_args["query"] = sql + self._filepath = None + else: + # filesystem for loading sql file + _fs_args = copy.deepcopy(fs_args) or {} + _fs_credentials = _fs_args.pop("credentials", {}) + protocol, path = get_protocol_and_path(str(filepath)) + + self._protocol = protocol + self._fs = fsspec.filesystem(self._protocol, **_fs_credentials, **_fs_args) + self._filepath = path + + def _describe(self) -> Dict[str, Any]: + load_args = copy.deepcopy(self._load_args) + desc = {} + desc["sql"] = str(load_args.pop("query", None)) + desc["filepath"] = str(self._filepath) + desc["load_args"] = str(load_args) + + return desc + + def _load(self) -> pd.DataFrame: + load_args = copy.deepcopy(self._load_args) + + if self._filepath: + load_path = get_filepath_str(PurePosixPath(self._filepath), self._protocol) + with self._fs.open(load_path, mode="r") as fs_file: + load_args["query"] = fs_file.read() + + return pd.read_gbq( + project_id=self._project_id, + credentials=self._credentials, + **load_args, + ) + + def _save(self, data: None) -> NoReturn: + raise DatasetError("'save' is not supported on GBQQueryDataSet") diff --git a/kedro/extras/datasets/pandas/generic_dataset.py b/kedro/extras/datasets/pandas/generic_dataset.py new file mode 100644 index 0000000000..bf44694a26 --- /dev/null +++ b/kedro/extras/datasets/pandas/generic_dataset.py @@ -0,0 +1,247 @@ +"""``GenericDataSet`` loads/saves data from/to a data file using an underlying +filesystem (e.g.: local, S3, GCS). It uses pandas to handle the +type of read/write target. +""" +from copy import deepcopy +from pathlib import PurePosixPath +from typing import Any, Dict + +import fsspec +import pandas as pd + +from kedro.io.core import ( + AbstractVersionedDataSet, + DatasetError, + Version, + get_filepath_str, + get_protocol_and_path, +) + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + + +NON_FILE_SYSTEM_TARGETS = [ + "clipboard", + "numpy", + "sql", + "period", + "records", + "timestamp", + "xarray", + "sql_table", +] + + +class GenericDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): + """`pandas.GenericDataSet` loads/saves data from/to a data file using an underlying + filesystem (e.g.: local, S3, GCS). It uses pandas to dynamically select the + appropriate type of read/write target on a best effort basis. + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + cars: + type: pandas.GenericDataSet + file_format: csv + filepath: s3://data/01_raw/company/cars.csv + load_args: + sep: "," + na_values: ["#NA", NA] + save_args: + index: False + date_format: "%Y-%m-%d" + + This second example is able to load a SAS7BDAT file via the ``pd.read_sas`` method. + Trying to save this dataset will raise a ``DatasetError`` since pandas does not provide an + equivalent ``pd.DataFrame.to_sas`` write method. + + .. code-block:: yaml + + flights: + type: pandas.GenericDataSet + file_format: sas + filepath: data/01_raw/airplanes.sas7bdat + load_args: + format: sas7bdat + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.pandas import GenericDataSet + >>> import pandas as pd + >>> + >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], + >>> 'col3': [5, 6]}) + >>> + >>> data_set = GenericDataSet(filepath="test.csv", file_format='csv') + >>> data_set.save(data) + >>> reloaded = data_set.load() + >>> assert data.equals(reloaded) + + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + def __init__( # noqa: too-many-arguments + self, + filepath: str, + file_format: str, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + version: Version = None, + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ): + """Creates a new instance of ``GenericDataSet`` pointing to a concrete data file + on a specific filesystem. The appropriate pandas load/save methods are + dynamically identified by string matching on a best effort basis. + + Args: + filepath: Filepath in POSIX format to a file prefixed with a protocol like `s3://`. + If prefix is not provided, `file` protocol (local filesystem) will be used. + The prefix should be any protocol supported by ``fsspec``. + Key assumption: The first argument of either load/save method points to a + filepath/buffer/io type location. There are some read/write targets such + as 'clipboard' or 'records' that will fail since they do not take a + filepath like argument. + file_format: String which is used to match the appropriate load/save method on a best + effort basis. For example if 'csv' is passed in the `pandas.read_csv` and + `pandas.DataFrame.to_csv` will be identified. An error will be raised unless + at least one matching `read_{file_format}` or `to_{file_format}` method is + identified. + load_args: Pandas options for loading files. + Here you can find all available arguments: + https://pandas.pydata.org/pandas-docs/stable/reference/io.html + All defaults are preserved. + save_args: Pandas options for saving files. + Here you can find all available arguments: + https://pandas.pydata.org/pandas-docs/stable/reference/io.html + All defaults are preserved, but "index", which is set to False. + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as + to pass to the filesystem's `open` method through nested keys + `open_args_load` and `open_args_save`. 
+ Here you can find all available arguments for `open`: + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open + All defaults are preserved, except `mode`, which is set to `r` when loading + and to `w` when saving. + + Raises: + DatasetError: Will be raised if at least less than one appropriate + read or write methods are identified. + """ + + self._file_format = file_format.lower() + + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + _fs_open_args_save.setdefault("mode", "w") + self._fs_open_args_load = _fs_open_args_load + self._fs_open_args_save = _fs_open_args_save + + def _ensure_file_system_target(self) -> None: + # Fail fast if provided a known non-filesystem target + if self._file_format in NON_FILE_SYSTEM_TARGETS: + raise DatasetError( + f"Cannot create a dataset of file_format '{self._file_format}' as it " + f"does not support a filepath target/source." + ) + + def _load(self) -> pd.DataFrame: + + self._ensure_file_system_target() + + load_path = get_filepath_str(self._get_load_path(), self._protocol) + load_method = getattr(pd, f"read_{self._file_format}", None) + if load_method: + with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: + return load_method(fs_file, **self._load_args) + raise DatasetError( + f"Unable to retrieve 'pandas.read_{self._file_format}' method, please ensure that your " + "'file_format' parameter has been defined correctly as per the Pandas API " + "https://pandas.pydata.org/docs/reference/io.html" + ) + + def _save(self, data: pd.DataFrame) -> None: + + self._ensure_file_system_target() + + save_path = get_filepath_str(self._get_save_path(), self._protocol) + save_method = getattr(data, f"to_{self._file_format}", None) + if save_method: + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + # KEY ASSUMPTION - first argument is path/buffer/io + save_method(fs_file, **self._save_args) + self._invalidate_cache() + else: + raise DatasetError( + f"Unable to retrieve 'pandas.DataFrame.to_{self._file_format}' method, please " + "ensure that your 'file_format' parameter has been defined correctly as " + "per the Pandas API " + "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html" + ) + + def _exists(self) -> bool: + try: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + except DatasetError: + return False + + return self._fs.exists(load_path) + + def _describe(self) -> Dict[str, Any]: + return { + "file_format": self._file_format, + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + 
"""Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) diff --git a/kedro/extras/datasets/pandas/hdf_dataset.py b/kedro/extras/datasets/pandas/hdf_dataset.py index 2c0d983aef..d60161d095 100644 --- a/kedro/extras/datasets/pandas/hdf_dataset.py +++ b/kedro/extras/datasets/pandas/hdf_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``HDFDataSet`` loads/saves data from/to a hdf file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas.HDFStore to handle the hdf file. """ @@ -39,7 +11,7 @@ from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, @@ -47,12 +19,30 @@ HDFSTORE_DRIVER = "H5FD_CORE" +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class HDFDataSet(AbstractVersionedDataSet): +class HDFDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``HDFDataSet`` loads/saves data from/to a hdf file using an underlying filesystem (e.g. local, S3, GCS). It uses pandas.HDFStore to handle the hdf file. - Example: + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + hdf_dataset: + type: pandas.HDFDataSet + filepath: s3://my_bucket/raw/sensor_reading.h5 + credentials: aws_s3_creds + key: data + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pandas import HDFDataSet @@ -61,7 +51,6 @@ class HDFDataSet(AbstractVersionedDataSet): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = HDFDataSet(filepath="gcs://bucket/test.hdf", key='data') >>> data_set = HDFDataSet(filepath="test.h5", key='data') >>> data_set.save(data) >>> reloaded = data_set.load() @@ -75,8 +64,7 @@ class HDFDataSet(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, key: str, @@ -151,14 +139,14 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - key=self._key, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "key": self._key, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = get_filepath_str(self._get_load_path(), self._protocol) @@ -202,7 +190,7 @@ def _save(self, data: pd.DataFrame) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/pandas/json_dataset.py b/kedro/extras/datasets/pandas/json_dataset.py index c01d1c38bf..1d5e3cb2d1 100644 --- a/kedro/extras/datasets/pandas/json_dataset.py +++ b/kedro/extras/datasets/pandas/json_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). 
It uses pandas to handle the JSON file. """ @@ -41,7 +13,7 @@ from kedro.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, @@ -49,12 +21,35 @@ logger = logging.getLogger(__name__) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class JSONDataSet(AbstractVersionedDataSet): +class JSONDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the json file. - Example: + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + clickstream_dataset: + type: pandas.JSONDataSet + filepath: abfs://landing_area/primary/click_stream.json + credentials: abfs_creds + + json_dataset: + type: pandas.JSONDataSet + filepath: data/01_raw/Video_Games.json + load_args: + lines: True + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pandas import JSONDataSet @@ -63,7 +58,6 @@ class JSONDataSet(AbstractVersionedDataSet): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = JSONDataSet(filepath="gcs://bucket/test.json") >>> data_set = JSONDataSet(filepath="test.json") >>> data_set.save(data) >>> reloaded = data_set.load() @@ -74,8 +68,7 @@ class JSONDataSet(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -136,23 +129,22 @@ def __init__( if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( - "Dropping `storage_options` for %s, " - "please specify them under `fs_args` or `credentials`.", + "Dropping 'storage_options' for %s, " + "please specify them under 'fs_args' or 'credentials'.", self._filepath, ) self._save_args.pop("storage_options", None) self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) - - def _load(self) -> Any: + return { + "filepath": self._filepath, + "protocol": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _load(self) -> pd.DataFrame: load_path = str(self._get_load_path()) if self._protocol == "file": # file:// protocol seems to misbehave on Windows @@ -180,7 +172,7 @@ def _save(self, data: pd.DataFrame) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/pandas/parquet_dataset.py b/kedro/extras/datasets/pandas/parquet_dataset.py index 4a05e9a36b..bf03f97ccd 100644 --- a/kedro/extras/datasets/pandas/parquet_dataset.py +++ b/kedro/extras/datasets/pandas/parquet_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``ParquetDataSet`` loads/saves data from/to a Parquet file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file. """ @@ -42,7 +14,7 @@ from kedro.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, @@ -50,12 +22,46 @@ logger = logging.getLogger(__name__) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class ParquetDataSet(AbstractVersionedDataSet): +class ParquetDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``ParquetDataSet`` loads/saves data from/to a Parquet file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file. - Example: + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + boats: + type: pandas.ParquetDataSet + filepath: data/01_raw/boats.parquet + load_args: + engine: pyarrow + use_nullable_dtypes: True + save_args: + file_scheme: hive + has_nulls: False + engine: pyarrow + + trucks: + type: pandas.ParquetDataSet + filepath: abfs://container/02_intermediate/trucks.parquet + credentials: dev_abs + load_args: + columns: [name, gear, disp, wt] + index: name + save_args: + compression: GZIP + partition_on: [name] + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pandas import ParquetDataSet @@ -64,7 +70,6 @@ class ParquetDataSet(AbstractVersionedDataSet): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = ParquetDataSet(filepath="gcs://bucket/test.parquet") >>> data_set = ParquetDataSet(filepath="test.parquet") >>> data_set.save(data) >>> reloaded = data_set.load() @@ -75,8 +80,7 @@ class ParquetDataSet(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -141,21 +145,20 @@ def __init__( if "storage_options" in self._save_args or "storage_options" in self._load_args: logger.warning( - "Dropping `storage_options` for %s, " - "please specify them under `fs_args` or `credentials`.", + "Dropping 'storage_options' for %s, " + "please specify them under 'fs_args' or 'credentials'.", self._filepath, ) self._save_args.pop("storage_options", None) self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = get_filepath_str(self._get_load_path(), self._protocol) @@ -191,14 +194,14 @@ def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) if Path(save_path).is_dir(): - raise DataSetError( + raise DatasetError( f"Saving {self.__class__.__name__} to a directory is not supported." ) if "partition_cols" in self._save_args: - raise DataSetError( + raise DatasetError( f"{self.__class__.__name__} does not support save argument " - f"`partition_cols`. Please use `kedro.io.PartitionedDataSet` instead." + f"'partition_cols'. Please use 'kedro.io.PartitionedDataSet' instead." ) bytes_buffer = BytesIO() @@ -212,7 +215,7 @@ def _save(self, data: pd.DataFrame) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/pandas/sql_dataset.py b/kedro/extras/datasets/pandas/sql_dataset.py index 5744fb5b56..7c084cb82e 100644 --- a/kedro/extras/datasets/pandas/sql_dataset.py +++ b/kedro/extras/datasets/pandas/sql_dataset.py @@ -1,41 +1,26 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """``SQLDataSet`` to load and save data to a SQL backend.""" import copy import re -from typing import Any, Dict, Optional +from pathlib import PurePosixPath +from typing import Any, Dict, NoReturn, Optional +import fsspec import pandas as pd from sqlalchemy import create_engine from sqlalchemy.exc import NoSuchModuleError -from kedro.io.core import AbstractDataSet, DataSetError +from kedro.io.core import ( + AbstractDataSet, + DatasetError, + get_filepath_str, + get_protocol_and_path, +) + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + __all__ = ["SQLTableDataSet", "SQLQueryDataSet"] @@ -80,26 +65,26 @@ def _find_known_drivers(module_import_error: ImportError) -> Optional[str]: if KNOWN_PIP_INSTALL.get(missing_module): return ( - "You can also try installing missing driver with\n" - "\npip install {}".format(KNOWN_PIP_INSTALL.get(missing_module)) + f"You can also try installing missing driver with\n" + f"\npip install {KNOWN_PIP_INSTALL.get(missing_module)}" ) return None -def _get_missing_module_error(import_error: ImportError) -> DataSetError: +def _get_missing_module_error(import_error: ImportError) -> DatasetError: missing_module_instruction = _find_known_drivers(import_error) if missing_module_instruction is None: - return DataSetError( + return DatasetError( f"{DRIVER_ERROR_MESSAGE}Loading failed with error:\n\n{str(import_error)}" ) - return DataSetError(f"{DRIVER_ERROR_MESSAGE}{missing_module_instruction}") + return DatasetError(f"{DRIVER_ERROR_MESSAGE}{missing_module_instruction}") -def _get_sql_alchemy_missing_error() -> DataSetError: - return DataSetError( +def _get_sql_alchemy_missing_error() -> DatasetError: + return DatasetError( "The SQL dialect in your connection is not supported by " "SQLAlchemy. Please refer to " "https://docs.sqlalchemy.org/en/13/core/engines.html#supported-databases " @@ -107,7 +92,7 @@ def _get_sql_alchemy_missing_error() -> DataSetError: ) -class SQLTableDataSet(AbstractDataSet): +class SQLTableDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]): """``SQLTableDataSet`` loads data from a SQL table and saves a pandas dataframe to a table. 
It uses ``pandas.DataFrame`` internally, so it supports all allowed pandas options on ``read_sql_table`` and @@ -122,8 +107,32 @@ class SQLTableDataSet(AbstractDataSet): the data with no index. This is designed to make load and save methods symmetric. + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + shuttles_table_dataset: + type: pandas.SQLTableDataSet + credentials: db_credentials + table_name: shuttles + load_args: + schema: dwschema + save_args: + schema: dwschema + if_exists: replace + + Sample database credentials entry in ``credentials.yml``: + + .. code-block:: yaml + + db_credentials: + con: postgresql://scott:tiger@localhost/test - Example: + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pandas import SQLTableDataSet @@ -145,8 +154,11 @@ class SQLTableDataSet(AbstractDataSet): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {"index": False} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {"index": False} + # using Any because of Sphinx but it should be + # sqlalchemy.engine.Engine or sqlalchemy.engine.base.Engine + engines: Dict[str, Any] = {} def __init__( self, @@ -182,15 +194,15 @@ def __init__( It has ``index=False`` in the default parameters. Raises: - DataSetError: When either ``table_name`` or ``con`` is empty. + DatasetError: When either ``table_name`` or ``con`` is empty. """ if not table_name: - raise DataSetError("`table_name` argument cannot be empty.") + raise DatasetError("'table_name' argument cannot be empty.") if not (credentials and "con" in credentials and credentials["con"]): - raise DataSetError( - "`con` argument cannot be empty. Please " + raise DatasetError( + "'con' argument cannot be empty. Please " "provide a SQLAlchemy connection string." ) @@ -205,46 +217,54 @@ def __init__( self._load_args["table_name"] = table_name self._save_args["name"] = table_name - self._load_args["con"] = self._save_args["con"] = credentials["con"] + self._connection_str = credentials["con"] + self.create_connection(self._connection_str) - def _describe(self) -> Dict[str, Any]: - load_args = self._load_args.copy() - save_args = self._save_args.copy() - del load_args["table_name"] - del load_args["con"] - del save_args["name"] - del save_args["con"] - return dict( - table_name=self._load_args["table_name"], - load_args=load_args, - save_args=save_args, - ) + @classmethod + def create_connection(cls, connection_str: str) -> None: + """Given a connection string, create singleton connection + to be used across all instances of `SQLTableDataSet` that + need to connect to the same source. 
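A minimal sketch of the engine caching described here (the SQLite connection string and table names are hypothetical, and it assumes no other SQL datasets have been created in the session):
::

    >>> from kedro.extras.datasets.pandas import SQLTableDataSet
    >>>
    >>> credentials = {"con": "sqlite:///kedro.db"}
    >>> cars = SQLTableDataSet(table_name="cars", credentials=credentials)
    >>> trucks = SQLTableDataSet(table_name="trucks", credentials=credentials)
    >>> # both instances reuse one cached SQLAlchemy engine, keyed by the connection string
    >>> assert len(SQLTableDataSet.engines) == 1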
+ """ + if connection_str in cls.engines: + return - def _load(self) -> pd.DataFrame: try: - return pd.read_sql_table(**self._load_args) + engine = create_engine(connection_str) except ImportError as import_error: raise _get_missing_module_error(import_error) from import_error except NoSuchModuleError as exc: raise _get_sql_alchemy_missing_error() from exc + cls.engines[connection_str] = engine + + def _describe(self) -> Dict[str, Any]: + load_args = copy.deepcopy(self._load_args) + save_args = copy.deepcopy(self._save_args) + del load_args["table_name"] + del save_args["name"] + return { + "table_name": self._load_args["table_name"], + "load_args": load_args, + "save_args": save_args, + } + + def _load(self) -> pd.DataFrame: + engine = self.engines[self._connection_str] # type:ignore + return pd.read_sql_table(con=engine, **self._load_args) + def _save(self, data: pd.DataFrame) -> None: - try: - data.to_sql(**self._save_args) - except ImportError as import_error: - raise _get_missing_module_error(import_error) from import_error - except NoSuchModuleError as exc: - raise _get_sql_alchemy_missing_error() from exc + engine = self.engines[self._connection_str] # type: ignore + data.to_sql(con=engine, **self._save_args) def _exists(self) -> bool: - eng = create_engine(self._load_args["con"]) + eng = self.engines[self._connection_str] # type: ignore schema = self._load_args.get("schema", None) exists = self._load_args["table_name"] in eng.table_names(schema) - eng.dispose() return exists -class SQLQueryDataSet(AbstractDataSet): +class SQLQueryDataSet(AbstractDataSet[None, pd.DataFrame]): """``SQLQueryDataSet`` loads data from a provided SQL query. It uses ``pandas.DataFrame`` internally, so it supports all allowed pandas options on ``read_sql_query``. Since Pandas uses SQLAlchemy behind @@ -258,7 +278,40 @@ class SQLQueryDataSet(AbstractDataSet): To save data to a SQL server use ``SQLTableDataSet``. - Example: + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + shuttle_id_dataset: + type: pandas.SQLQueryDataSet + sql: "select shuttle, shuttle_id from spaceflights.shuttles;" + credentials: db_credentials + + Advanced example using the ``stream_results`` and ``chunksize`` options to reduce memory usage: + + .. code-block:: yaml + + shuttle_id_dataset: + type: pandas.SQLQueryDataSet + sql: "select shuttle, shuttle_id from spaceflights.shuttles;" + credentials: db_credentials + execution_options: + stream_results: true + load_args: + chunksize: 1000 + + Sample database credentials entry in ``credentials.yml``: + + .. code-block:: yaml + + db_credentials: + con: postgresql://scott:tiger@localhost/test + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pandas import SQLQueryDataSet @@ -274,12 +327,21 @@ class SQLQueryDataSet(AbstractDataSet): >>> credentials=credentials) >>> >>> sql_data = data_set.load() - >>> """ - def __init__( - self, sql: str, credentials: Dict[str, Any], load_args: Dict[str, Any] = None + # using Any because of Sphinx but it should be + # sqlalchemy.engine.Engine or sqlalchemy.engine.base.Engine + engines: Dict[str, Any] = {} + + def __init__( # noqa: too-many-arguments + self, + sql: str = None, + credentials: Dict[str, Any] = None, + load_args: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + filepath: str = None, + execution_options: Optional[Dict[str, Any]] = None, ) -> None: """Creates a new ``SQLQueryDataSet``. 
@@ -297,19 +359,39 @@ def __init__( https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html To find all supported connection string formats, see here: https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as + to pass to the filesystem's `open` method through nested keys + `open_args_load` and `open_args_save`. + Here you can find all available arguments for `open`: + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open + All defaults are preserved, except `mode`, which is set to `r` when loading. + filepath: A path to a file with a sql query statement. + execution_options: A dictionary with non-SQL advanced options for the connection to + be applied to the underlying engine. To find all supported execution + options, see here: + https://docs.sqlalchemy.org/en/12/core/connections.html#sqlalchemy.engine.Connection.execution_options + Note that this is not a standard argument supported by pandas API, but could be + useful for handling large datasets. Raises: - DataSetError: When either ``sql`` or ``con`` parameters is emtpy. + DatasetError: When either ``sql`` or ``con`` parameters is empty. """ + if sql and filepath: + raise DatasetError( + "'sql' and 'filepath' arguments cannot both be provided." + "Please only provide one." + ) - if not sql: - raise DataSetError( - "`sql` argument cannot be empty. Please provide a sql query" + if not (sql or filepath): + raise DatasetError( + "'sql' and 'filepath' arguments cannot both be empty." + "Please provide a sql query or path to a sql query file." ) if not (credentials and "con" in credentials and credentials["con"]): - raise DataSetError( - "`con` argument cannot be empty. Please " + raise DatasetError( + "'con' argument cannot be empty. Please " "provide a SQLAlchemy connection string." ) @@ -321,22 +403,62 @@ def __init__( else default_load_args ) - self._load_args["sql"] = sql - self._load_args["con"] = credentials["con"] - - def _describe(self) -> Dict[str, Any]: - load_args = self._load_args.copy() - del load_args["sql"] - del load_args["con"] - return dict(sql=self._load_args["sql"], load_args=load_args) + # load sql query from file + if sql: + self._load_args["sql"] = sql + self._filepath = None + else: + # filesystem for loading sql file + _fs_args = copy.deepcopy(fs_args) or {} + _fs_credentials = _fs_args.pop("credentials", {}) + protocol, path = get_protocol_and_path(str(filepath)) + + self._protocol = protocol + self._fs = fsspec.filesystem(self._protocol, **_fs_credentials, **_fs_args) + self._filepath = path + self._connection_str = credentials["con"] + self._execution_options = execution_options or {} + self.create_connection(self._connection_str) + + @classmethod + def create_connection(cls, connection_str: str) -> None: + """Given a connection string, create singleton connection + to be used across all instances of `SQLQueryDataSet` that + need to connect to the same source. 
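For completeness, a minimal sketch of the file-based variant (the path is hypothetical; exactly one of ``sql`` and ``filepath`` must be provided):
::

    >>> from kedro.extras.datasets.pandas import SQLQueryDataSet
    >>>
    >>> data_set = SQLQueryDataSet(
    >>>     filepath="queries/shuttles.sql",
    >>>     credentials={"con": "postgresql://scott:tiger@localhost/test"},
    >>> )
    >>> sql_data = data_set.load()  # reads the query file via fsspec, then runs it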
+ """ + if connection_str in cls.engines: + return - def _load(self) -> pd.DataFrame: try: - return pd.read_sql_query(**self._load_args) + engine = create_engine(connection_str) except ImportError as import_error: raise _get_missing_module_error(import_error) from import_error except NoSuchModuleError as exc: raise _get_sql_alchemy_missing_error() from exc - def _save(self, data: pd.DataFrame) -> None: - raise DataSetError("`save` is not supported on SQLQueryDataSet") + cls.engines[connection_str] = engine + + def _describe(self) -> Dict[str, Any]: + load_args = copy.deepcopy(self._load_args) + return { + "sql": str(load_args.pop("sql", None)), + "filepath": str(self._filepath), + "load_args": str(load_args), + "execution_options": str(self._execution_options), + } + + def _load(self) -> pd.DataFrame: + load_args = copy.deepcopy(self._load_args) + engine = self.engines[self._connection_str].execution_options( + **self._execution_options + ) # type: ignore + + if self._filepath: + load_path = get_filepath_str(PurePosixPath(self._filepath), self._protocol) + with self._fs.open(load_path, mode="r") as fs_file: + load_args["sql"] = fs_file.read() + + return pd.read_sql_query(con=engine, **load_args) + + def _save(self, data: None) -> NoReturn: + raise DatasetError("'save' is not supported on SQLQueryDataSet") diff --git a/kedro/extras/datasets/pandas/xml_dataset.py b/kedro/extras/datasets/pandas/xml_dataset.py new file mode 100644 index 0000000000..9433ae238d --- /dev/null +++ b/kedro/extras/datasets/pandas/xml_dataset.py @@ -0,0 +1,171 @@ +"""``XMLDataSet`` loads/saves data from/to a XML file using an underlying +filesystem (e.g.: local, S3, GCS). It uses pandas to handle the XML file. +""" +import logging +from copy import deepcopy +from io import BytesIO +from pathlib import PurePosixPath +from typing import Any, Dict + +import fsspec +import pandas as pd + +from kedro.io.core import ( + PROTOCOL_DELIMITER, + AbstractVersionedDataSet, + DatasetError, + Version, + get_filepath_str, + get_protocol_and_path, +) + +logger = logging.getLogger(__name__) + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + + +class XMLDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): + """``XMLDataSet`` loads/saves data from/to a XML file using an underlying + filesystem (e.g.: local, S3, GCS). It uses pandas to handle the XML file. + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.pandas import XMLDataSet + >>> import pandas as pd + >>> + >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], + >>> 'col3': [5, 6]}) + >>> + >>> data_set = XMLDataSet(filepath="test.xml") + >>> data_set.save(data) + >>> reloaded = data_set.load() + >>> assert data.equals(reloaded) + + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {"index": False} # type: Dict[str, Any] + + def __init__( # noqa: too-many-arguments + self, + filepath: str, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + version: Version = None, + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``XMLDataSet`` pointing to a concrete XML file + on a specific filesystem. + + Args: + filepath: Filepath in POSIX format to a XML file prefixed with a protocol like `s3://`. + If prefix is not provided, `file` protocol (local filesystem) will be used. 
+ The prefix should be any protocol supported by ``fsspec``. + Note: `http(s)` doesn't support versioning. + load_args: Pandas options for loading XML files. + Here you can find all available arguments: + https://pandas.pydata.org/docs/reference/api/pandas.read_xml.html + All defaults are preserved. + save_args: Pandas options for saving XML files. + Here you can find all available arguments: + https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_xml.html + All defaults are preserved, but "index", which is set to False. + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + """ + _fs_args = deepcopy(fs_args) or {} + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath, version) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._storage_options = {**_credentials, **_fs_args} + self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + if "storage_options" in self._save_args or "storage_options" in self._load_args: + logger.warning( + "Dropping 'storage_options' for %s, " + "please specify them under 'fs_args' or 'credentials'.", + self._filepath, + ) + self._save_args.pop("storage_options", None) + self._load_args.pop("storage_options", None) + + def _describe(self) -> Dict[str, Any]: + return { + "filepath": self._filepath, + "protocol": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _load(self) -> pd.DataFrame: + load_path = str(self._get_load_path()) + if self._protocol == "file": + # file:// protocol seems to misbehave on Windows + # (), + # so we don't join that back to the filepath; + # storage_options also don't work with local paths + return pd.read_xml(load_path, **self._load_args) + + load_path = f"{self._protocol}{PROTOCOL_DELIMITER}{load_path}" + return pd.read_xml( + load_path, storage_options=self._storage_options, **self._load_args + ) + + def _save(self, data: pd.DataFrame) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + + buf = BytesIO() + data.to_xml(path_or_buffer=buf, **self._save_args) + + with self._fs.open(save_path, mode="wb") as fs_file: + fs_file.write(buf.getvalue()) + + self._invalidate_cache() + + def _exists(self) -> bool: + try: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + except DatasetError: + return False + + return self._fs.exists(load_path) + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + """Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + 
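# A minimal sketch of the save pattern implemented in `_save` above: pandas serialises
# the XML into an in-memory buffer, and the bytes are then streamed through fsspec so
# any protocol (local, S3, GCS, ...) works. The target path is an assumption.
from io import BytesIO

import fsspec
import pandas as pd

def write_xml(df: pd.DataFrame, path: str, **to_xml_args) -> None:
    buf = BytesIO()
    df.to_xml(path_or_buffer=buf, **to_xml_args)   # serialise fully in memory
    with fsspec.open(path, mode="wb") as f:        # then write to the target filesystem
        f.write(buf.getvalue())

write_xml(pd.DataFrame({"col1": [1, 2]}), "data/01_raw/test.xml", index=False)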
self._fs.invalidate_cache(filepath) diff --git a/kedro/extras/datasets/pickle/__init__.py b/kedro/extras/datasets/pickle/__init__.py index b876689bd0..8e6707d450 100644 --- a/kedro/extras/datasets/pickle/__init__.py +++ b/kedro/extras/datasets/pickle/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``AbstractDataSet`` implementation to load/save data from/to a Pickle file.""" __all__ = ["PickleDataSet"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .pickle_dataset import PickleDataSet # NOQA + from .pickle_dataset import PickleDataSet diff --git a/kedro/extras/datasets/pickle/pickle_dataset.py b/kedro/extras/datasets/pickle/pickle_dataset.py index 7973241729..eb9fb55594 100644 --- a/kedro/extras/datasets/pickle/pickle_dataset.py +++ b/kedro/extras/datasets/pickle/pickle_dataset.py @@ -1,37 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``PickleDataSet`` loads/saves data from/to a Pickle file using an underlying filesystem (e.g.: local, S3, GCS). The underlying functionality is supported by -the ``pickle``, ``joblib``, and ``compress_pickle`` libraries, so it supports -all allowed options for loading and saving pickle files. +the specified backend library passed in (defaults to the ``pickle`` library), so it +supports all allowed options for loading and saving pickle files. """ -import pickle +import importlib from copy import deepcopy from pathlib import PurePosixPath from typing import Any, Dict @@ -40,30 +12,45 @@ from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, ) -try: - import joblib -except ImportError: # pragma: no cover - joblib = None - -try: - import compress_pickle -except ImportError: # pragma: no cover - compress_pickle = None +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) -class PickleDataSet(AbstractVersionedDataSet): +class PickleDataSet(AbstractVersionedDataSet[Any, Any]): """``PickleDataSet`` loads/saves data from/to a Pickle file using an underlying filesystem (e.g.: local, S3, GCS). The underlying functionality is supported by - the ``pickle`` and ``joblib`` libraries, so it supports all allowed options for - loading and saving pickle files. - - Example: + the specified backend library passed in (defaults to the ``pickle`` library), so it + supports all allowed options for loading and saving pickle files. + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + test_model: # simple example without compression + type: pickle.PickleDataSet + filepath: data/07_model_output/test_model.pkl + backend: pickle + + final_model: # example with load and save args + type: pickle.PickleDataSet + filepath: s3://your_bucket/final_model.pkl.lz4 + backend: joblib + credentials: s3_credentials + save_args: + compress: lz4 + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pickle import PickleDataSet @@ -72,13 +59,11 @@ class PickleDataSet(AbstractVersionedDataSet): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = PickleDataSet(filepath="gcs://bucket/test.pkl") >>> data_set = PickleDataSet(filepath="test.pkl", backend="pickle") >>> data_set.save(data) >>> reloaded = data_set.load() >>> assert data.equals(reloaded) - - >>> # Add "compress_pickle[lz4]" to requirements.txt + >>> >>> data_set = PickleDataSet(filepath="test.pickle.lz4", >>> backend="compress_pickle", >>> load_args={"compression":"lz4"}, @@ -90,10 +75,8 @@ class PickleDataSet(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - BACKENDS = {"pickle": pickle, "joblib": joblib, "compress_pickle": compress_pickle} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments,too-many-locals self, filepath: str, backend: str = "pickle", @@ -104,24 +87,41 @@ def __init__( fs_args: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``PickleDataSet`` pointing to a concrete Pickle - file on a specific filesystem. ``PickleDataSet`` supports two backends to - serialize/deserialize objects: `pickle` and `joblib`. + file on a specific filesystem. ``PickleDataSet`` supports custom backends to + serialise/deserialise objects. + + Example backends that are compatible (non-exhaustive): + * `pickle` + * `joblib` + * `dill` + * `compress_pickle` + + Example backends that are incompatible: + * `torch` Args: filepath: Filepath in POSIX format to a Pickle file prefixed with a protocol like `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used. The prefix should be any protocol supported by ``fsspec``. Note: `http(s)` doesn't support versioning. - backend: Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'. + backend: Backend to use, must be an import path to a module which satisfies the + ``pickle`` interface. That is, contains a `load` and `dump` function. + Defaults to 'pickle'. load_args: Pickle options for loading pickle files. - Here you can find all available arguments for different backends: + You can pass in arguments that the backend load function specified accepts, e.g: pickle.load: https://docs.python.org/3/library/pickle.html#pickle.load joblib.load: https://joblib.readthedocs.io/en/latest/generated/joblib.load.html + dill.load: https://dill.readthedocs.io/en/latest/index.html#dill.load + compress_pickle.load: + https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.load All defaults are preserved. save_args: Pickle options for saving pickle files. 
- Here you can find all available arguments for different backends: + You can pass in arguments that the backend dump function specified accepts, e.g: pickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump joblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html + dill.dump: https://dill.readthedocs.io/en/latest/index.html#dill.dump + compress_pickle.dump: + https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dump All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is @@ -138,19 +138,28 @@ def __init__( All defaults are preserved, except `mode`, which is set to `wb` when saving. Raises: - ValueError: If ``backend`` is not one of ['pickle', 'joblib']. - ImportError: If ``backend`` library could not be imported. + ValueError: If ``backend`` does not satisfy the `pickle` interface. + ImportError: If the ``backend`` module could not be imported. """ - if backend not in self.BACKENDS: - raise ValueError( - f"'backend' should be one of {list(self.BACKENDS.keys())}, " - f"got '{backend}'." - ) - - if not self.BACKENDS[backend]: + # We do not store `imported_backend` as an attribute to be used in `load`/`save` + # as this would mean the dataset cannot be deepcopied (module objects cannot be + # pickled). The import here is purely to raise any errors as early as possible. + # Repeated imports in the `load` and `save` methods should not be a significant + # performance hit as Python caches imports. + try: + imported_backend = importlib.import_module(backend) + except ImportError as exc: raise ImportError( - f"Selected backend '{backend}' could not be " - "imported. Make sure it is installed." + f"Selected backend '{backend}' could not be imported. " + "Make sure it is installed and importable." + ) from exc + + if not ( + hasattr(imported_backend, "load") and hasattr(imported_backend, "dump") + ): + raise ValueError( + f"Selected backend '{backend}' should satisfy the pickle interface. " + "Missing one of 'load' and 'dump' on the backend." 
) _fs_args = deepcopy(fs_args) or {} @@ -187,32 +196,32 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - backend=self._backend, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "backend": self._backend, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> Any: load_path = get_filepath_str(self._get_load_path(), self._protocol) with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return self.BACKENDS[self._backend].load( - fs_file, **self._load_args - ) # nosec + imported_backend = importlib.import_module(self._backend) + return imported_backend.load(fs_file, **self._load_args) # type: ignore def _save(self, data: Any) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: try: - self.BACKENDS[self._backend].dump(data, fs_file, **self._save_args) + imported_backend = importlib.import_module(self._backend) + imported_backend.dump(data, fs_file, **self._save_args) # type: ignore except Exception as exc: - raise DataSetError( - f"{data.__class__} was not serialized due to: {exc}" + raise DatasetError( + f"{data.__class__} was not serialised due to: {exc}" ) from exc self._invalidate_cache() @@ -220,7 +229,7 @@ def _save(self, data: Any) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/pillow/__init__.py b/kedro/extras/datasets/pillow/__init__.py index 1426ec8290..bd68c032c3 100644 --- a/kedro/extras/datasets/pillow/__init__.py +++ b/kedro/extras/datasets/pillow/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
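# A sketch of the duck-typed backend check that the `PickleDataSet` changes above
# perform: any importable module exposing `load` and `dump` (pickle, dill, joblib,
# compress_pickle, ...) can act as the serialisation backend.
import importlib
from types import ModuleType

def resolve_backend(name: str) -> ModuleType:
    module = importlib.import_module(name)         # e.g. "pickle", "dill", "joblib"
    if not (hasattr(module, "load") and hasattr(module, "dump")):
        raise ValueError(f"'{name}' does not satisfy the pickle interface")
    return module

backend = resolve_backend("pickle")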
- """``AbstractDataSet`` implementation to load/save image data.""" __all__ = ["ImageDataSet"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .image_dataset import ImageDataSet # NOQA + from .image_dataset import ImageDataSet diff --git a/kedro/extras/datasets/pillow/image_dataset.py b/kedro/extras/datasets/pillow/image_dataset.py index 39c19804fb..35c84995f4 100644 --- a/kedro/extras/datasets/pillow/image_dataset.py +++ b/kedro/extras/datasets/pillow/image_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``ImageDataSet`` loads/saves image data as `numpy` from an underlying filesystem (e.g.: local, S3, GCS). It uses Pillow to handle image file. """ @@ -38,23 +10,28 @@ from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, ) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class ImageDataSet(AbstractVersionedDataSet): +class ImageDataSet(AbstractVersionedDataSet[Image.Image, Image.Image]): """``ImageDataSet`` loads/saves image data as `numpy` from an underlying filesystem (e.g.: local, S3, GCS). It uses Pillow to handle image file. 
- Example: + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.pillow import ImageDataSet >>> - >>> # data_set = ImageDataSet(filepath="gcs://bucket/test.png") >>> data_set = ImageDataSet(filepath="test.png") >>> image = data_set.load() >>> image.show() @@ -63,8 +40,7 @@ class ImageDataSet(AbstractVersionedDataSet): DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, save_args: Dict[str, Any] = None, @@ -128,20 +104,20 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) - - def _load(self) -> Image: + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } + + def _load(self) -> Image.Image: load_path = get_filepath_str(self._get_load_path(), self._protocol) with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: return Image.open(fs_file).copy() - def _save(self, data: Image) -> None: + def _save(self, data: Image.Image) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: @@ -152,7 +128,7 @@ def _save(self, data: Image) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/plotly/__init__.py b/kedro/extras/datasets/plotly/__init__.py index 229220e0a6..f864ea6dbe 100644 --- a/kedro/extras/datasets/plotly/__init__.py +++ b/kedro/extras/datasets/plotly/__init__.py @@ -1,36 +1,11 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
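# Sketch of the Pillow load pattern used by `ImageDataSet._load` above: `Image.open`
# is lazy, so `.copy()` forces the pixel data to be read while the fsspec file handle
# is still open. The path is an assumption.
import fsspec
from PIL import Image

with fsspec.open("data/01_raw/test.png", mode="rb") as f:
    image = Image.open(f).copy()   # still usable after the file handle is closed

image.show()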
+"""``AbstractDataSet`` implementations to load/save a plotly figure from/to a JSON +file.""" -"""``PlotlyDataSet`` implementation to load/save plotly data.""" - -__all__ = ["PlotlyDataSet"] +__all__ = ["PlotlyDataSet", "JSONDataSet"] from contextlib import suppress with suppress(ImportError): - from .plotly_dataset import PlotlyDataSet # NOQA + from .plotly_dataset import PlotlyDataSet +with suppress(ImportError): + from .json_dataset import JSONDataSet diff --git a/kedro/extras/datasets/plotly/json_dataset.py b/kedro/extras/datasets/plotly/json_dataset.py new file mode 100644 index 0000000000..a03ee5b812 --- /dev/null +++ b/kedro/extras/datasets/plotly/json_dataset.py @@ -0,0 +1,166 @@ +"""``JSONDataSet`` loads/saves a plotly figure from/to a JSON file using an underlying +filesystem (e.g.: local, S3, GCS). +""" +from copy import deepcopy +from pathlib import PurePosixPath +from typing import Any, Dict, Union + +import fsspec +import plotly.io as pio +from plotly import graph_objects as go + +from kedro.io.core import ( + AbstractVersionedDataSet, + Version, + get_filepath_str, + get_protocol_and_path, +) + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + + +class JSONDataSet( + AbstractVersionedDataSet[go.Figure, Union[go.Figure, go.FigureWidget]] +): + """``JSONDataSet`` loads/saves a plotly figure from/to a JSON file using an + underlying filesystem (e.g.: local, S3, GCS). + + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + scatter_plot: + type: plotly.JSONDataSet + filepath: data/08_reporting/scatter_plot.json + save_args: + engine: auto + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.plotly import JSONDataSet + >>> import plotly.express as px + >>> + >>> fig = px.bar(x=["a", "b", "c"], y=[1, 3, 2]) + >>> data_set = JSONDataSet(filepath="test.json") + >>> data_set.save(fig) + >>> reloaded = data_set.load() + >>> assert fig == reloaded + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + def __init__( # noqa: too-many-arguments + self, + filepath: str, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + version: Version = None, + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file + on a specific filesystem. + + Args: + filepath: Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`. + If prefix is not provided `file` protocol (local filesystem) will be used. + The prefix should be any protocol supported by ``fsspec``. + Note: `http(s)` doesn't support versioning. + load_args: Plotly options for loading JSON files. + Here you can find all available arguments: + https://plotly.com/python-api-reference/generated/plotly.io.from_json.html#plotly.io.from_json + All defaults are preserved. + save_args: Plotly options for saving JSON files. + Here you can find all available arguments: + https://plotly.com/python-api-reference/generated/plotly.io.write_json.html + All defaults are preserved. + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. 
+ credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{'token': None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as + to pass to the filesystem's `open` method through nested keys + `open_args_load` and `open_args_save`. + Here you can find all available arguments for `open`: + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open + All defaults are preserved, except `mode`, which is set to `w` when + saving. + """ + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath, version) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + _fs_open_args_save.setdefault("mode", "w") + self._fs_open_args_load = _fs_open_args_load + self._fs_open_args_save = _fs_open_args_save + + def _describe(self) -> Dict[str, Any]: + return { + "filepath": self._filepath, + "protocol": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _load(self) -> Union[go.Figure, go.FigureWidget]: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + + with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: + # read_json doesn't work correctly with file handler, so we have to read + # the file, decode it manually and pass to the low-level from_json instead. + return pio.from_json(str(fs_file.read(), "utf-8"), **self._load_args) + + def _save(self, data: go.Figure) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + data.write_json(fs_file, **self._save_args) + + self._invalidate_cache() + + def _exists(self) -> bool: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + + return self._fs.exists(load_path) + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) diff --git a/kedro/extras/datasets/plotly/plotly_dataset.py b/kedro/extras/datasets/plotly/plotly_dataset.py index 038a0cb402..68dc27b012 100644 --- a/kedro/extras/datasets/plotly/plotly_dataset.py +++ b/kedro/extras/datasets/plotly/plotly_dataset.py @@ -1,76 +1,76 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
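# Sketch of the load path implemented in `JSONDataSet._load` above: because plotly's
# `read_json` does not cope well with file handles, the bytes are read, decoded and
# passed to the low-level `plotly.io.from_json`. The path is an assumption.
import fsspec
import plotly.io as pio

with fsspec.open("data/08_reporting/scatter_plot.json", mode="rb") as f:
    fig = pio.from_json(f.read().decode("utf-8"))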
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``PlotlyDataSet`` saves plotly objects to a JSON file and loads JSON plotly figures -into plotly.graph_objects.Figure objects. +"""``PlotlyDataSet`` generates a plot from a pandas DataFrame and saves it to a JSON +file using an underlying filesystem (e.g.: local, S3, GCS). It loads the JSON into a +plotly figure. """ from copy import deepcopy from typing import Any, Dict import pandas as pd import plotly.express as px -import plotly.io as pio -from plotly import graph_objects +from plotly import graph_objects as go -from kedro.extras.datasets.pandas import JSONDataSet -from kedro.io.core import Version, get_filepath_str +from kedro.io.core import Version +from .json_dataset import JSONDataSet -class PlotlyDataSet(JSONDataSet): - """``PlotlyDataSet`` saves a pandas DataFrame to a plotly JSON file. +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) - The plotly JSON file can be saved to any underlying filesystem - supported by fsspec (e.g. local, S3, GCS). - Warning: This DataSet is not symmetric and doesn't load back - into pandas DataFrames, but into plotly.graph_objects.Figure. - Example configuration for a PlotlyDataSet in the catalog: +class PlotlyDataSet(JSONDataSet): + """``PlotlyDataSet`` generates a plot from a pandas DataFrame and saves it to a JSON + file using an underlying filesystem (e.g.: local, S3, GCS). It loads the JSON into a + plotly figure. + + ``PlotlyDataSet`` is a convenience wrapper for ``plotly.JSONDataSet``. It generates + the JSON file directly from a pandas DataFrame through ``plotly_args``. + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + bar_plot: + type: plotly.PlotlyDataSet + filepath: data/08_reporting/bar_plot.json + plotly_args: + type: bar + fig: + x: features + y: importance + orientation: h + layout: + xaxis_title: x + yaxis_title: y + title: Title + + Example usage for the + `Python API `_: :: - >>> bar_plot: - >>> type: plotly.PlotlyDataSet - >>> filepath: data/08_reporting/bar_plot.json - >>> plotly_args: - >>> type: bar - >>> fig: - >>> x: features - >>> y: importance - >>> orientation: 'h' - >>> layout: - >>> xaxis_title: 'x' - >>> yaxis_title: 'y' - >>> title: 'Test' - """ + >>> from kedro.extras.datasets.plotly import PlotlyDataSet + >>> import plotly.express as px + >>> import pandas as pd + >>> + >>> df_data = pd.DataFrame([[0, 1], [1, 0]], columns=('x1', 'x2')) + >>> + >>> data_set = PlotlyDataSet( + >>> filepath='scatter_plot.json', + >>> plotly_args={ + >>> 'type': 'scatter', + >>> 'fig': {'x': 'x1', 'y': 'x2'}, + >>> } + >>> ) + >>> data_set.save(df_data) + >>> reloaded = data_set.load() + >>> assert px.scatter(df_data, x='x1', y='x2') == reloaded - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + """ - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, plotly_args: Dict[str, Any], @@ -80,16 +80,18 @@ def __init__( credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, ) -> None: - """Creates a new instance of ``PlotlyDataSet`` pointing to a plotly.graph_objects.Figure - saved as a concrete JSON file on a specific filesystem. + """Creates a new instance of ``PlotlyDataSet`` pointing to a concrete JSON file + on a specific filesystem. Args: filepath: Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`. If prefix is not provided `file` protocol (local filesystem) will be used. The prefix should be any protocol supported by ``fsspec``. Note: `http(s)` doesn't support versioning. - plotly_args: Plotly configuration for generating a plotly graph object Figure - representing the plotted data. + plotly_args: Plotly configuration for generating a plotly figure from the + dataframe. Keys are `type` (plotly express function, e.g. bar, + line, scatter), `fig` (kwargs passed to the plotting function), theme + (defaults to `plotly`), `layout`. load_args: Plotly options for loading JSON files. Here you can find all available arguments: https://plotly.com/python-api-reference/generated/plotly.io.from_json.html#plotly.io.from_json @@ -97,7 +99,7 @@ def __init__( save_args: Plotly options for saving JSON files. Here you can find all available arguments: https://plotly.com/python-api-reference/generated/plotly.io.write_json.html - All defaults are preserved, but "index", which is set to False. + All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. 
If its ``save`` @@ -127,40 +129,13 @@ def _describe(self) -> Dict[str, Any]: return {**super()._describe(), "plotly_args": self._plotly_args} def _save(self, data: pd.DataFrame) -> None: - plot_data = _plotly_express_wrapper(data, self._plotly_args) - - full_key_path = get_filepath_str(self._get_save_path(), self._protocol) - with self._fs.open(full_key_path, **self._fs_open_args_save) as fs_file: - plot_data.write_json(fs_file, **self._save_args) - - self._invalidate_cache() - - def _load(self) -> graph_objects.Figure: - load_path = get_filepath_str(self._get_load_path(), self._protocol) - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - # read_json doesn't work correctly with file handler, so we have to read the file, - # decode it manually and pass to the low-level from_json instead. - return pio.from_json(str(fs_file.read(), "utf-8"), **self._load_args) - - -def _plotly_express_wrapper( - data: pd.DataFrame, plotly_config: Dict[str, Any] -) -> graph_objects.Figure: - """Generates plotly graph object Figure based on the type of plotting - and config provided in the catalog. - - Args: - data: pandas dataframe to generate plotly Figure for - plotly_config: plotly configurations specified in the catalog to be used - - Returns: - A plotly graph_object figure representing the plotted data - """ - fig_params = plotly_config.get("fig") - plot = plotly_config.get("type") - theme = plotly_config.get("theme", "plotly") - layout_params = plotly_config.get("layout", {}) - fig = getattr(px, plot)(data, **fig_params) # type: ignore - fig.update_layout(template=theme) - fig.update_layout(layout_params) - return fig + fig = self._plot_dataframe(data) + super()._save(fig) + + def _plot_dataframe(self, data: pd.DataFrame) -> go.Figure: + plot_type = self._plotly_args.get("type") + fig_params = self._plotly_args.get("fig", {}) + fig = getattr(px, plot_type)(data, **fig_params) # type: ignore + fig.update_layout(template=self._plotly_args.get("theme", "plotly")) + fig.update_layout(self._plotly_args.get("layout", {})) + return fig diff --git a/kedro/extras/datasets/redis/__init__.py b/kedro/extras/datasets/redis/__init__.py new file mode 100644 index 0000000000..ba56e1fb85 --- /dev/null +++ b/kedro/extras/datasets/redis/__init__.py @@ -0,0 +1,8 @@ +"""``AbstractDataSet`` implementation to load/save data from/to a redis db.""" + +__all__ = ["PickleDataSet"] + +from contextlib import suppress + +with suppress(ImportError): + from .redis_dataset import PickleDataSet diff --git a/kedro/extras/datasets/redis/redis_dataset.py b/kedro/extras/datasets/redis/redis_dataset.py new file mode 100644 index 0000000000..c2bb2ca660 --- /dev/null +++ b/kedro/extras/datasets/redis/redis_dataset.py @@ -0,0 +1,190 @@ +"""``PickleDataSet`` loads/saves data from/to a Redis database. The underlying +functionality is supported by the redis library, so it supports all allowed +options for instantiating the redis app ``from_url`` and setting a value.""" + +import importlib +import os +from copy import deepcopy +from typing import Any, Dict + +import redis + +from kedro.io.core import AbstractDataSet, DatasetError + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + + +class PickleDataSet(AbstractDataSet[Any, Any]): + """``PickleDataSet`` loads/saves data from/to a Redis database. 
The + underlying functionality is supported by the redis library, so it supports + all allowed options for instantiating the redis app ``from_url`` and setting + a value. + + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + my_python_object: # simple example + type: redis.PickleDataSet + key: my_object + from_url_args: + url: redis://127.0.0.1:6379 + + final_python_object: # example with save args + type: redis.PickleDataSet + key: my_final_object + from_url_args: + url: redis://127.0.0.1:6379 + db: 1 + save_args: + ex: 10 + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.redis import PickleDataSet + >>> import pandas as pd + >>> + >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], + >>> 'col3': [5, 6]}) + >>> + >>> my_data = PickleDataSet(key="my_data") + >>> my_data.save(data) + >>> reloaded = my_data.load() + >>> assert data.equals(reloaded) + """ + + DEFAULT_REDIS_URL = os.getenv("REDIS_URL", "redis://127.0.0.1:6379") + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + def __init__( # noqa: too-many-arguments + self, + key: str, + backend: str = "pickle", + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + credentials: Dict[str, Any] = None, + redis_args: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``PickleDataSet``. This loads/saves data from/to + a Redis database while deserialising/serialising. Supports custom backends to + serialise/deserialise objects. + + Example backends that are compatible (non-exhaustive): + * `pickle` + * `dill` + * `compress_pickle` + + Example backends that are incompatible: + * `torch` + + Args: + key: The key to use for saving/loading object to Redis. + backend: Backend to use, must be an import path to a module which satisfies the + ``pickle`` interface. That is, contains a `loads` and `dumps` function. + Defaults to 'pickle'. + load_args: Pickle options for loading pickle files. + You can pass in arguments that the backend load function specified accepts, e.g: + pickle.loads: https://docs.python.org/3/library/pickle.html#pickle.loads + dill.loads: https://dill.readthedocs.io/en/latest/index.html#dill.loads + compress_pickle.loads: + https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.loads + All defaults are preserved. + save_args: Pickle options for saving pickle files. + You can pass in arguments that the backend dump function specified accepts, e.g: + pickle.dumps: https://docs.python.org/3/library/pickle.html#pickle.dump + dill.dumps: https://dill.readthedocs.io/en/latest/index.html#dill.dumps + compress_pickle.dumps: + https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dumps + All defaults are preserved. + credentials: Credentials required to get access to the redis server. + E.g. `{"password": None}`. + redis_args: Extra arguments to pass into the redis client constructor + ``redis.StrictRedis.from_url``. (e.g. `{"socket_timeout": 10}`), as well as to pass + to the ``redis.StrictRedis.set`` through nested keys `from_url_args` and `set_args`. + Here you can find all available arguments for `from_url`: + https://redis-py.readthedocs.io/en/stable/connections.html?highlight=from_url#redis.Redis.from_url + All defaults are preserved, except `url`, which is set to `redis://127.0.0.1:6379`. + You could also specify the url through the env variable ``REDIS_URL``. 
+ + Raises: + ValueError: If ``backend`` does not satisfy the `pickle` interface. + ImportError: If the ``backend`` module could not be imported. + """ + try: + imported_backend = importlib.import_module(backend) + except ImportError as exc: + raise ImportError( + f"Selected backend '{backend}' could not be imported. " + "Make sure it is installed and importable." + ) from exc + + if not ( + hasattr(imported_backend, "loads") and hasattr(imported_backend, "dumps") + ): + raise ValueError( + f"Selected backend '{backend}' should satisfy the pickle interface. " + "Missing one of 'loads' and 'dumps' on the backend." + ) + + self._backend = backend + + self._key = key + + _redis_args = deepcopy(redis_args) or {} + self._redis_from_url_args = _redis_args.pop("from_url_args", {}) + self._redis_from_url_args.setdefault("url", self.DEFAULT_REDIS_URL) + self._redis_set_args = _redis_args.pop("set_args", {}) + _credentials = deepcopy(credentials) or {} + + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + self._redis_db = redis.Redis.from_url( + **self._redis_from_url_args, **_credentials + ) + + def _describe(self) -> Dict[str, Any]: + return {"key": self._key, **self._redis_from_url_args} + + # `redis_db` mypy does not work since it is optional and optional is not + # accepted by pickle.loads. + def _load(self) -> Any: + if not self.exists(): + raise DatasetError(f"The provided key {self._key} does not exists.") + imported_backend = importlib.import_module(self._backend) + return imported_backend.loads( # type: ignore + self._redis_db.get(self._key), **self._load_args + ) # type: ignore + + def _save(self, data: Any) -> None: + try: + imported_backend = importlib.import_module(self._backend) + self._redis_db.set( + self._key, + imported_backend.dumps(data, **self._save_args), # type: ignore + **self._redis_set_args, + ) + except Exception as exc: + raise DatasetError( + f"{data.__class__} was not serialised due to: {exc}" + ) from exc + + def _exists(self) -> bool: + try: + return bool(self._redis_db.exists(self._key)) + except Exception as exc: + raise DatasetError( + f"The existence of key {self._key} could not be established due to: {exc}" + ) from exc diff --git a/kedro/extras/datasets/spark/__init__.py b/kedro/extras/datasets/spark/__init__.py index fc637663cd..3dede09aa8 100644 --- a/kedro/extras/datasets/spark/__init__.py +++ b/kedro/extras/datasets/spark/__init__.py @@ -1,40 +1,14 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
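# Sketch of the Redis round trip implemented by `redis.PickleDataSet` above: the object
# is serialised with the chosen backend's `dumps`/`loads` and stored under a single key.
# The URL and key are assumptions.
import pickle

import redis

client = redis.Redis.from_url("redis://127.0.0.1:6379")
client.set("my_object", pickle.dumps({"a": 1}))
restored = pickle.loads(client.get("my_object"))
assert restored == {"a": 1}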
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Provides I/O modules for Apache Spark.""" -__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet"] +__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet"] from contextlib import suppress with suppress(ImportError): - from .spark_dataset import SparkDataSet # NOQA + from .spark_dataset import SparkDataSet +with suppress(ImportError): + from .spark_hive_dataset import SparkHiveDataSet with suppress(ImportError): - from .spark_hive_dataset import SparkHiveDataSet # NOQA + from .spark_jdbc_dataset import SparkJDBCDataSet with suppress(ImportError): - from .spark_jdbc_dataset import SparkJDBCDataSet # NOQA + from .deltatable_dataset import DeltaTableDataSet diff --git a/kedro/extras/datasets/spark/deltatable_dataset.py b/kedro/extras/datasets/spark/deltatable_dataset.py new file mode 100644 index 0000000000..3d56f81048 --- /dev/null +++ b/kedro/extras/datasets/spark/deltatable_dataset.py @@ -0,0 +1,110 @@ +"""``AbstractDataSet`` implementation to access DeltaTables using +``delta-spark`` +""" +from pathlib import PurePosixPath +from typing import NoReturn + +from delta.tables import DeltaTable +from pyspark.sql import SparkSession +from pyspark.sql.utils import AnalysisException + +from kedro.extras.datasets.spark.spark_dataset import ( + _split_filepath, + _strip_dbfs_prefix, +) +from kedro.io.core import AbstractDataSet, DatasetError + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + + +class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): + """``DeltaTableDataSet`` loads data into DeltaTable objects. + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + weather@spark: + type: spark.SparkDataSet + filepath: data/02_intermediate/data.parquet + file_format: "delta" + + weather@delta: + type: spark.DeltaTableDataSet + filepath: data/02_intermediate/data.parquet + + Example usage for the + `Python API `_: + :: + + >>> from pyspark.sql import SparkSession + >>> from pyspark.sql.types import (StructField, StringType, + >>> IntegerType, StructType) + >>> + >>> from kedro.extras.datasets.spark import DeltaTableDataSet, SparkDataSet + >>> + >>> schema = StructType([StructField("name", StringType(), True), + >>> StructField("age", IntegerType(), True)]) + >>> + >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> + >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) + >>> + >>> data_set = SparkDataSet(filepath="test_data", file_format="delta") + >>> data_set.save(spark_df) + >>> deltatable_dataset = DeltaTableDataSet(filepath="test_data") + >>> delta_table = deltatable_dataset.load() + >>> + >>> delta_table.update() + """ + + # this dataset cannot be used with ``ParallelRunner``, + # therefore it has the attribute ``_SINGLE_PROCESS = True`` + # for parallelism within a Spark pipeline please consider + # using ``ThreadRunner`` instead + _SINGLE_PROCESS = True + + def __init__(self, filepath: str) -> None: + """Creates a new instance of ``DeltaTableDataSet``. + + Args: + filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks + and working with data written to mount path points, + specify ``filepath``s for (versioned) ``SparkDataSet``s + starting with ``/dbfs/mnt``. + """ + fs_prefix, filepath = _split_filepath(filepath) + + self._fs_prefix = fs_prefix + self._filepath = PurePosixPath(filepath) + + @staticmethod + def _get_spark(): + return SparkSession.builder.getOrCreate() + + def _load(self) -> DeltaTable: + load_path = self._fs_prefix + str(self._filepath) + return DeltaTable.forPath(self._get_spark(), load_path) + + def _save(self, data: None) -> NoReturn: + raise DatasetError(f"{self.__class__.__name__} is a read only dataset type") + + def _exists(self) -> bool: + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + + try: + self._get_spark().read.load(path=load_path, format="delta") + except AnalysisException as exception: + if "is not a Delta table" in exception.desc: + return False + raise + + return True + + def _describe(self): + return {"filepath": str(self._filepath), "fs_prefix": self._fs_prefix} diff --git a/kedro/extras/datasets/spark/spark_dataset.py b/kedro/extras/datasets/spark/spark_dataset.py index 7e4040fc15..b27147b7a6 100644 --- a/kedro/extras/datasets/spark/spark_dataset.py +++ b/kedro/extras/datasets/spark/spark_dataset.py @@ -1,34 +1,7 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
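# Sketch of what `DeltaTableDataSet.load()` above hands back: a `delta.tables.DeltaTable`
# that can be mutated in place (the dataset itself is read-only on the Kedro side).
# The path and update arguments are assumptions.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
delta_table = DeltaTable.forPath(spark, "data/02_intermediate/data.parquet")
delta_table.update(condition="age > 30", set={"age": "age + 1"})  # in-place Delta update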
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``AbstractDataSet`` implementation to access Spark dataframes using +"""``AbstractVersionedDataSet`` implementation to access Spark dataframes using ``pyspark`` """ +import json from copy import deepcopy from fnmatch import fnmatch from functools import partial @@ -36,12 +9,24 @@ from typing import Any, Dict, List, Optional, Tuple from warnings import warn +import fsspec from hdfs import HdfsError, InsecureClient from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException from s3fs import S3FileSystem -from kedro.io.core import AbstractVersionedDataSet, Version +from kedro.io.core import ( + AbstractVersionedDataSet, + DatasetError, + Version, + get_filepath_str, + get_protocol_and_path, +) + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) def _parse_glob_pattern(pattern: str) -> str: @@ -56,7 +41,7 @@ def _parse_glob_pattern(pattern: str) -> str: def _split_filepath(filepath: str) -> Tuple[str, str]: split_ = filepath.split("://", 1) - if len(split_) == 2: + if len(split_) == 2: # noqa: PLR2004 return split_[0] + "://", split_[1] return "", split_[0] @@ -164,11 +149,11 @@ def hdfs_glob(self, pattern: str) -> List[str]: for dpath, _, fnames in self.walk(prefix): if fnmatch(dpath, pattern): matched.add(dpath) - matched |= set( + matched |= { f"{dpath}/{fname}" for fname in fnames if fnmatch(f"{dpath}/{fname}", pattern) - ) + } except HdfsError: # pragma: no cover # HdfsError is raised by `self.walk()` if prefix does not exist in HDFS. # Ignore and return an empty list. @@ -176,9 +161,46 @@ def hdfs_glob(self, pattern: str) -> List[str]: return sorted(matched) -class SparkDataSet(AbstractVersionedDataSet): +class SparkDataSet(AbstractVersionedDataSet[DataFrame, DataFrame]): """``SparkDataSet`` loads and saves Spark dataframes. - Example: + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + weather: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/weather/* + file_format: csv + load_args: + header: True + inferSchema: True + save_args: + sep: '|' + header: True + + weather_with_schema: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/weather/* + file_format: csv + load_args: + header: True + schema: + filepath: path/to/schema.json + save_args: + sep: '|' + header: True + + weather_cleaned: + type: spark.SparkDataSet + filepath: data/02_intermediate/data.parquet + file_format: parquet + + Example usage for the + `Python API `_: :: >>> from pyspark.sql import SparkSession @@ -210,7 +232,7 @@ class SparkDataSet(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - def __init__( # pylint: disable=too-many-arguments + def __init__( # noqa: too-many-arguments self, filepath: str, file_format: str = "parquet", @@ -228,21 +250,21 @@ def __init__( # pylint: disable=too-many-arguments starting with ``/dbfs/mnt``. file_format: File format used during load and save operations. These are formats supported by the running - SparkContext include parquet, csv. For a list of supported + SparkContext include parquet, csv, delta. For a list of supported formats please refer to Apache Spark documentation at https://spark.apache.org/docs/latest/sql-programming-guide.html load_args: Load args passed to Spark DataFrameReader load method. It is dependent on the selected file format. You can find a list of read options for each supported format in Spark DataFrame read documentation: - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis + https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html save_args: Save args passed to Spark DataFrame write options. Similar to load_args this is dependent on the selected file format. You can pass ``mode`` and ``partitionBy`` to specify your overwrite mode and partitioning respectively. You can find a list of options for each format in Spark DataFrame write documentation: - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis + https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. 
If its ``save`` @@ -260,8 +282,8 @@ def __init__( # pylint: disable=too-many-arguments if fs_prefix in ("s3a://", "s3n://"): if fs_prefix == "s3n://": warn( - "`s3n` filesystem has now been deprecated by Spark, " - "please consider switching to `s3a`", + "'s3n' filesystem has now been deprecated by Spark, " + "please consider switching to 's3a'", DeprecationWarning, ) _s3 = S3FileSystem(**credentials) @@ -271,10 +293,9 @@ def __init__( # pylint: disable=too-many-arguments elif fs_prefix == "hdfs://" and version: warn( - "HDFS filesystem support for versioned {} is in beta and uses " - "`hdfs.client.InsecureClient`, please use with caution".format( - self.__class__.__name__ - ) + f"HDFS filesystem support for versioned {self.__class__.__name__} is " + f"in beta and uses 'hdfs.client.InsecureClient', please use with " + f"caution" ) # default namenode address @@ -310,17 +331,51 @@ def __init__( # pylint: disable=too-many-arguments if save_args is not None: self._save_args.update(save_args) + # Handle schema load argument + self._schema = self._load_args.pop("schema", None) + if self._schema is not None: + if isinstance(self._schema, dict): + self._schema = self._load_schema_from_file(self._schema) + self._file_format = file_format self._fs_prefix = fs_prefix + self._handle_delta_format() + + @staticmethod + def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: + + filepath = schema.get("filepath") + if not filepath: + raise DatasetError( + "Schema load argument does not specify a 'filepath' attribute. Please" + "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." + ) + + credentials = deepcopy(schema.get("credentials")) or {} + protocol, schema_path = get_protocol_and_path(filepath) + file_system = fsspec.filesystem(protocol, **credentials) + pure_posix_path = PurePosixPath(schema_path) + load_path = get_filepath_str(pure_posix_path, protocol) + + # Open schema file + with file_system.open(load_path) as fs_file: + + try: + return StructType.fromJson(json.loads(fs_file.read())) + except Exception as exc: + raise DatasetError( + f"Contents of 'schema.filepath' ({schema_path}) are invalid. Please" + f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." 
+ ) from exc def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._fs_prefix + str(self._filepath), - file_format=self._file_format, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._fs_prefix + str(self._filepath), + "file_format": self._file_format, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } @staticmethod def _get_spark(): @@ -328,10 +383,13 @@ def _get_spark(): def _load(self) -> DataFrame: load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._get_load_path())) + read_obj = self._get_spark().read - return self._get_spark().read.load( - load_path, self._file_format, **self._load_args - ) + # Pass schema if defined + if self._schema: + read_obj = read_obj.schema(self._schema) + + return read_obj.load(load_path, self._file_format, **self._load_args) def _save(self, data: DataFrame) -> None: save_path = _strip_dbfs_prefix(self._fs_prefix + str(self._get_save_path())) @@ -343,7 +401,24 @@ def _exists(self) -> bool: try: self._get_spark().read.load(load_path, self._file_format) except AnalysisException as exception: - if exception.desc.startswith("Path does not exist:"): + if ( + exception.desc.startswith("Path does not exist:") + or "is not a Delta table" in exception.desc + ): return False raise return True + + def _handle_delta_format(self) -> None: + supported_modes = {"append", "overwrite", "error", "errorifexists", "ignore"} + write_mode = self._save_args.get("mode") + if ( + write_mode + and self._file_format == "delta" + and write_mode not in supported_modes + ): + raise DatasetError( + f"It is not possible to perform 'save()' for file format 'delta' " + f"with mode '{write_mode}' on 'SparkDataSet'. " + f"Please use 'spark.DeltaTableDataSet' instead." + ) diff --git a/kedro/extras/datasets/spark/spark_hive_dataset.py b/kedro/extras/datasets/spark/spark_hive_dataset.py index 6d1b9bc6aa..81f09b9daa 100644 --- a/kedro/extras/datasets/spark/spark_hive_dataset.py +++ b/kedro/extras/datasets/spark/spark_hive_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
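[Editor's note] The new ``schema`` load argument above expects ``schema.filepath`` to point at a JSON-serialised ``pyspark.sql.types.StructType``, which ``_load_schema_from_file`` rebuilds with ``StructType.fromJson``. A minimal sketch of producing such a file; the output path is a placeholder matching the ``weather_with_schema`` YAML example earlier in this diff:

```python
# Sketch only: write a JSON-serialised StructType for the `schema` load argument.
# "path/to/schema.json" is the placeholder path used in the YAML example above.
import json

from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType(
    [
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
    ]
)

with open("path/to/schema.json", "w", encoding="utf-8") as file:
    json.dump(schema.jsonValue(), file)  # layout accepted by StructType.fromJson

# Round-trip check mirroring the load path in _load_schema_from_file:
with open("path/to/schema.json", encoding="utf-8") as file:
    assert StructType.fromJson(json.load(file)).jsonValue() == schema.jsonValue()
```

Separately, the new ``_handle_delta_format`` check means a ``delta``-format ``SparkDataSet`` only accepts ``append``, ``overwrite``, ``error``, ``errorifexists`` or ``ignore`` as the ``save_args`` ``mode``; any other mode raises ``DatasetError`` and points users to ``spark.DeltaTableDataSet``.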
-# -# See the License for the specific language governing permissions and -# limitations under the License. - """``AbstractDataSet`` implementation to access Spark dataframes using ``pyspark`` on Apache Hive. """ @@ -36,24 +8,44 @@ from pyspark.sql import DataFrame, SparkSession, Window from pyspark.sql.functions import col, lit, row_number -from kedro.io.core import AbstractDataSet, DataSetError +from kedro.io.core import AbstractDataSet, DatasetError + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) -# pylint:disable=too-many-instance-attributes -class SparkHiveDataSet(AbstractDataSet): +# noqa: too-many-instance-attributes +class SparkHiveDataSet(AbstractDataSet[DataFrame, DataFrame]): """``SparkHiveDataSet`` loads and saves Spark dataframes stored on Hive. This data set also handles some incompatible file types such as using partitioned parquet on hive which will not normally allow upserts to existing data without a complete replacement of the existing file/partition. This DataSet has some key assumptions: + - Schemas do not change during the pipeline run (defined PKs must be present for the - duration of the pipeline) + duration of the pipeline) - Tables are not being externally modified during upserts. The upsert method is NOT ATOMIC + to external changes to the target table while executing. Upsert methodology works by leveraging Spark DataFrame execution plan checkpointing. - Example: + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + hive_dataset: + type: spark.SparkHiveDataSet + database: hive_database + table: table_name + write_mode: overwrite + + Example usage for the + `Python API `_: :: >>> from pyspark.sql import SparkSession @@ -79,8 +71,7 @@ class SparkHiveDataSet(AbstractDataSet): DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint:disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, database: str, table: str, @@ -110,17 +101,17 @@ def __init__( or directly in the Spark conf folder. Raises: - DataSetError: Invalid configuration supplied + DatasetError: Invalid configuration supplied """ _write_modes = ["append", "error", "errorifexists", "upsert", "overwrite"] if write_mode not in _write_modes: valid_modes = ", ".join(_write_modes) - raise DataSetError( - f"Invalid `write_mode` provided: {write_mode}. " - f"`write_mode` must be one of: {valid_modes}" + raise DatasetError( + f"Invalid 'write_mode' provided: {write_mode}. 
" + f"'write_mode' must be one of: {valid_modes}" ) if write_mode == "upsert" and not table_pk: - raise DataSetError("`table_pk` must be set to utilise `upsert` read mode") + raise DatasetError("'table_pk' must be set to utilise 'upsert' read mode") self._write_mode = write_mode self._table_pk = table_pk or [] @@ -130,18 +121,18 @@ def __init__( self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) if save_args is not None: self._save_args.update(save_args) - self._format = self._save_args.get("format") or "hive" + self._format = self._save_args.pop("format", None) or "hive" self._eager_checkpoint = self._save_args.pop("eager_checkpoint", None) or True def _describe(self) -> Dict[str, Any]: - return dict( - database=self._database, - table=self._table, - write_mode=self._write_mode, - table_pk=self._table_pk, - partition_by=self._save_args.get("partitionBy"), - format=self._format, - ) + return { + "database": self._database, + "table": self._table, + "write_mode": self._write_mode, + "table_pk": self._table_pk, + "partition_by": self._save_args.get("partitionBy"), + "format": self._format, + } @staticmethod def _get_spark() -> SparkSession: @@ -174,7 +165,7 @@ def _save(self, data: DataFrame) -> None: if self._write_mode == "upsert": # check if _table_pk is a subset of df columns if not set(self._table_pk) <= set(self._load().columns): - raise DataSetError( + raise DatasetError( f"Columns {str(self._table_pk)} selected as primary key(s) not found in " f"table {self._full_table_address}" ) @@ -211,14 +202,14 @@ def _validate_save(self, data: DataFrame): if data_dtypes != hive_dtypes: new_cols = data_dtypes - hive_dtypes missing_cols = hive_dtypes - data_dtypes - raise DataSetError( + raise DatasetError( f"Dataset does not match hive table schema.\n" f"Present on insert only: {sorted(new_cols)}\n" f"Present on schema only: {sorted(missing_cols)}" ) def _exists(self) -> bool: - # noqa # pylint:disable=protected-access + # noqa # noqa: protected-access return ( self._get_spark() ._jsparkSession.catalog() diff --git a/kedro/extras/datasets/spark/spark_jdbc_dataset.py b/kedro/extras/datasets/spark/spark_jdbc_dataset.py index 78d956b19e..26a1ed2481 100644 --- a/kedro/extras/datasets/spark/spark_jdbc_dataset.py +++ b/kedro/extras/datasets/spark/spark_jdbc_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """SparkJDBCDataSet to load and save a PySpark DataFrame via JDBC.""" from copy import deepcopy @@ -33,20 +5,43 @@ from pyspark.sql import DataFrame, SparkSession -from kedro.io.core import AbstractDataSet, DataSetError +from kedro.io.core import AbstractDataSet, DatasetError __all__ = ["SparkJDBCDataSet"] +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class SparkJDBCDataSet(AbstractDataSet): +class SparkJDBCDataSet(AbstractDataSet[DataFrame, DataFrame]): """``SparkJDBCDataSet`` loads data from a database table accessible via JDBC URL url and connection properties and saves the content of a PySpark DataFrame to an external database table via JDBC. It uses ``pyspark.sql.DataFrameReader`` and ``pyspark.sql.DataFrameWriter`` internally, so it supports all allowed PySpark options on ``jdbc``. - - Example: + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + weather: + type: spark.SparkJDBCDataSet + table: weather_table + url: jdbc:postgresql://localhost/test + credentials: db_credentials + load_args: + properties: + driver: org.postgresql.Driver + save_args: + properties: + driver: org.postgresql.Driver + + Example usage for the + `Python API `_: :: >>> import pandas as pd @@ -76,8 +71,7 @@ class SparkJDBCDataSet(AbstractDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, url: str, table: str, @@ -98,27 +92,27 @@ def __init__( load_args: Provided to underlying PySpark ``jdbc`` function along with the JDBC URL and the name of the table. To find all supported arguments, see here: - https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html + https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.jdbc.html save_args: Provided to underlying PySpark ``jdbc`` function along with the JDBC URL and the name of the table. To find all supported arguments, see here: - https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html + https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.jdbc.html Raises: - DataSetError: When either ``url`` or ``table`` is empty or + DatasetError: When either ``url`` or ``table`` is empty or when a property is provided with a None value. """ if not url: - raise DataSetError( - "`url` argument cannot be empty. Please " + raise DatasetError( + "'url' argument cannot be empty. Please " "provide a JDBC URL of the form " - "``jdbc:subprotocol:subname``." + "'jdbc:subprotocol:subname'." ) if not table: - raise DataSetError( - "`table` argument cannot be empty. Please " + raise DatasetError( + "'table' argument cannot be empty. Please " "provide the name of the table to load or save " "data to." ) @@ -140,9 +134,9 @@ def __init__( # Check credentials for bad inputs. 
for cred_key, cred_value in credentials.items(): if cred_value is None: - raise DataSetError( - "Credential property `{}` cannot be None. " - "Please provide a value.".format(cred_key) + raise DatasetError( + f"Credential property '{cred_key}' cannot be None. " + f"Please provide a value." ) load_properties = self._load_args.get("properties", {}) @@ -166,9 +160,12 @@ def _describe(self) -> Dict[str, Any]: save_properties.pop("password", None) save_args = {**save_args, "properties": save_properties} - return dict( - url=self._url, table=self._table, load_args=load_args, save_args=save_args - ) + return { + "url": self._url, + "table": self._table, + "load_args": load_args, + "save_args": save_args, + } @staticmethod def _get_spark(): diff --git a/kedro/extras/datasets/svmlight/__init__.py b/kedro/extras/datasets/svmlight/__init__.py new file mode 100644 index 0000000000..4ea2429612 --- /dev/null +++ b/kedro/extras/datasets/svmlight/__init__.py @@ -0,0 +1,8 @@ +"""``AbstractDataSet`` implementation to load/save data from/to a svmlight/ +libsvm sparse data file.""" +__all__ = ["SVMLightDataSet"] + +from contextlib import suppress + +with suppress(ImportError): + from .svmlight_dataset import SVMLightDataSet diff --git a/kedro/extras/datasets/svmlight/svmlight_dataset.py b/kedro/extras/datasets/svmlight/svmlight_dataset.py new file mode 100644 index 0000000000..f8820b036f --- /dev/null +++ b/kedro/extras/datasets/svmlight/svmlight_dataset.py @@ -0,0 +1,168 @@ +"""``SVMLightDataSet`` loads/saves data from/to a svmlight/libsvm file using an +underlying filesystem (e.g.: local, S3, GCS). It uses sklearn functions +``dump_svmlight_file`` to save and ``load_svmlight_file`` to load a file. +""" +from copy import deepcopy +from pathlib import PurePosixPath +from typing import Any, Dict, Optional, Tuple, Union + +import fsspec +from numpy import ndarray +from scipy.sparse.csr import csr_matrix +from sklearn.datasets import dump_svmlight_file, load_svmlight_file + +from kedro.io.core import ( + AbstractVersionedDataSet, + DatasetError, + Version, + get_filepath_str, + get_protocol_and_path, +) + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + +# Type of data input +_DI = Tuple[Union[ndarray, csr_matrix], ndarray] +# Type of data output +_DO = Tuple[csr_matrix, ndarray] + + +class SVMLightDataSet(AbstractVersionedDataSet[_DI, _DO]): + """``SVMLightDataSet`` loads/saves data from/to a svmlight/libsvm file using an + underlying filesystem (e.g.: local, S3, GCS). It uses sklearn functions + ``dump_svmlight_file`` to save and ``load_svmlight_file`` to load a file. + + Data is loaded as a tuple of features and labels. Labels is NumPy array, + and features is Compressed Sparse Row matrix. + + This format is a text-based format, with one sample per line. It does + not store zero valued features hence it is suitable for sparse datasets. + + This format is used as the default format for both svmlight and the + libsvm command line programs. + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + svm_dataset: + type: svmlight.SVMLightDataSet + filepath: data/01_raw/location.svm + load_args: + zero_based: False + save_args: + zero_based: False + + cars: + type: svmlight.SVMLightDataSet + filepath: gcs://your_bucket/cars.svm + fs_args: + project: my-project + credentials: my_gcp_credentials + load_args: + zero_based: False + save_args: + zero_based: False + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.svmlight import SVMLightDataSet + >>> import numpy as np + >>> + >>> # Features and labels. + >>> data = (np.array([[0, 1], [2, 3.14159]]), np.array([7, 3])) + >>> + >>> data_set = SVMLightDataSet(filepath="test.svm") + >>> data_set.save(data) + >>> reloaded_features, reloaded_labels = data_set.load() + >>> assert (data[0] == reloaded_features).all() + >>> assert (data[1] == reloaded_labels).all() + + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + def __init__( # noqa: too-many-arguments + self, + filepath: str, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + version: Optional[Version] = None, + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ) -> None: + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath, version) + + self._protocol = protocol + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + _fs_open_args_load.setdefault("mode", "rb") + _fs_open_args_save.setdefault("mode", "wb") + self._fs_open_args_load = _fs_open_args_load + self._fs_open_args_save = _fs_open_args_save + + def _describe(self): + return { + "filepath": self._filepath, + "protocol": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _load(self) -> _DO: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: + return load_svmlight_file(fs_file, **self._load_args) + + def _save(self, data: _DI) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + dump_svmlight_file(data[0], data[1], fs_file, **self._save_args) + + self._invalidate_cache() + + def _exists(self) -> bool: + try: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + except DatasetError: + return False + + return self._fs.exists(load_path) + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + """Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) diff --git a/kedro/extras/datasets/tensorflow/README.md b/kedro/extras/datasets/tensorflow/README.md index f478db571c..704d164977 100644 --- a/kedro/extras/datasets/tensorflow/README.md +++ 
b/kedro/extras/datasets/tensorflow/README.md @@ -1,7 +1,7 @@ # TensorFlowModelDataset ``TensorflowModelDataset`` loads and saves TensorFlow models. -The underlying functionality is supported by, and passes input arguments to TensorFlow 2.X load_model and save_model methods. Only TF2 is currently supported for saving and loading, V1 requires HDF5 and serializes differently. +The underlying functionality is supported by, and passes input arguments to TensorFlow 2.X load_model and save_model methods. Only TF2 is currently supported for saving and loading, V1 requires HDF5 and serialises differently. #### Example use: ```python @@ -27,6 +27,8 @@ np.testing.assert_allclose(predictions, new_predictions, rtol=1e-6, atol=1e-6) example_tensorflow_data: type: tensorflow.TensorFlowModelDataset filepath: data/08_reporting/tf_model_dirname + load_args: + tf_device: "/CPU:0" # optional ``` Contributed by (Aleks Hughes)[https://github.com/w0rdsm1th]. diff --git a/kedro/extras/datasets/tensorflow/__init__.py b/kedro/extras/datasets/tensorflow/__init__.py index 8fac25a8bd..20e1311ded 100644 --- a/kedro/extras/datasets/tensorflow/__init__.py +++ b/kedro/extras/datasets/tensorflow/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Provides I/O for TensorFlow Models.""" __all__ = ["TensorFlowModelDataset"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .tensorflow_model_dataset import TensorFlowModelDataset # NOQA + from .tensorflow_model_dataset import TensorFlowModelDataset diff --git a/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py b/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py index 693586b219..e1b35e6620 100644 --- a/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py @@ -1,37 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``TensorflowModelDataset`` is a data set implementation which can save and load TensorFlow models. """ import copy import tempfile -from pathlib import Path, PurePath, PurePosixPath +from pathlib import PurePath, PurePosixPath from typing import Any, Dict import fsspec @@ -39,7 +11,7 @@ from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, @@ -47,20 +19,42 @@ TEMPORARY_H5_FILE = "tmp_tensorflow_model.h5" +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class TensorFlowModelDataset(AbstractVersionedDataSet): +class TensorFlowModelDataset(AbstractVersionedDataSet[tf.keras.Model, tf.keras.Model]): """``TensorflowModelDataset`` loads and saves TensorFlow models. The underlying functionality is supported by, and passes input arguments through to, TensorFlow 2.X load_model and save_model methods. - Example: + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + tensorflow_model: + type: tensorflow.TensorFlowModelDataset + filepath: data/06_models/tensorflow_model.h5 + load_args: + compile: False + save_args: + overwrite: True + include_optimizer: False + credentials: tf_creds + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.tensorflow import TensorFlowModelDataset >>> import tensorflow as tf >>> import numpy as np >>> - >>> data_set = TensorFlowModelDataset("saved_model_path") + >>> data_set = TensorFlowModelDataset("data/06_models/tensorflow_model.h5") >>> model = tf.keras.Model() >>> predictions = model.predict([...]) >>> @@ -74,8 +68,7 @@ class TensorFlowModelDataset(AbstractVersionedDataSet): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"save_format": "tf"} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -140,24 +133,30 @@ def _load(self) -> tf.keras.Model: with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path: if self._is_h5: - path = str(PurePath(path) / TEMPORARY_H5_FILE) + path = str( # noqa: PLW2901 + PurePath(path) / TEMPORARY_H5_FILE + ) # noqa: redefined-loop-name self._fs.copy(load_path, path) else: self._fs.get(load_path, path, recursive=True) # Pass the local temporary directory/file path to keras.load_model - return tf.keras.models.load_model(path, **self._load_args) + device_name = self._load_args.pop("tf_device", None) + if device_name: + with tf.device(device_name): + model = tf.keras.models.load_model(path, **self._load_args) + else: + model = tf.keras.models.load_model(path, **self._load_args) + return model def _save(self, data: tf.keras.Model) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - # Make sure all intermediate directories are created. - save_dir = Path(save_path).parent - save_dir.mkdir(parents=True, exist_ok=True) - with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path: if self._is_h5: - path = str(PurePath(path) / TEMPORARY_H5_FILE) + path = str( # noqa: PLW2901 + PurePath(path) / TEMPORARY_H5_FILE + ) # noqa: redefined-loop-name tf.keras.models.save_model(data, path, **self._save_args) @@ -166,23 +165,24 @@ def _save(self, data: tf.keras.Model) -> None: if self._is_h5: self._fs.copy(path, save_path) else: + if self._fs.exists(save_path): + self._fs.rm(save_path, recursive=True) self._fs.put(path, save_path, recursive=True) def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _release(self) -> None: super()._release() diff --git a/kedro/extras/datasets/text/__init__.py b/kedro/extras/datasets/text/__init__.py index 5c0618cbfc..fab08acea4 100644 --- a/kedro/extras/datasets/text/__init__.py +++ b/kedro/extras/datasets/text/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
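[Editor's note] The ``_load`` change above pops an optional ``tf_device`` entry from ``load_args`` and wraps ``tf.keras.models.load_model`` in a ``tf.device(...)`` scope, mirroring the ``tf_device: "/CPU:0"`` example added to the TensorFlow README earlier in this diff. A minimal sketch of the same option through the Python API; the filepath is a placeholder:

```python
# Sketch only: load a saved model onto the CPU via the new tf_device load argument.
# The filepath is a placeholder.
from kedro.extras.datasets.tensorflow import TensorFlowModelDataset

cpu_model_dataset = TensorFlowModelDataset(
    filepath="data/06_models/tensorflow_model",
    load_args={"tf_device": "/CPU:0"},  # popped in _load and passed to tf.device()
)
# model = cpu_model_dataset.load() would then restore the model's variables
# inside the "/CPU:0" device scope.
```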
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``AbstractDataSet`` implementation to load/save data from/to a text file.""" __all__ = ["TextDataSet"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .text_dataset import TextDataSet # NOQA + from .text_dataset import TextDataSet diff --git a/kedro/extras/datasets/text/text_dataset.py b/kedro/extras/datasets/text/text_dataset.py index 9cfe9ec7e4..2b02bfba3d 100644 --- a/kedro/extras/datasets/text/text_dataset.py +++ b/kedro/extras/datasets/text/text_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``TextDataSet`` loads/saves data from/to a text file using an underlying filesystem (e.g.: local, S3, GCS). 
""" @@ -37,25 +9,40 @@ from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, ) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class TextDataSet(AbstractVersionedDataSet): +class TextDataSet(AbstractVersionedDataSet[str, str]): """``TextDataSet`` loads/saves data from/to a text file using an underlying filesystem (e.g.: local, S3, GCS) - Example: + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + alice_book: + type: text.TextDataSet + filepath: data/01_raw/alice.txt + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.text import TextDataSet >>> >>> string_to_write = "This will go in a file." >>> - >>> # data_set = TextDataSet(filepath="gcs://bucket/test.md") >>> data_set = TextDataSet(filepath="test.md") >>> data_set.save(string_to_write) >>> reloaded = data_set.load() @@ -118,11 +105,11 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "version": self._version, + } def _load(self) -> str: load_path = get_filepath_str(self._get_load_path(), self._protocol) @@ -141,7 +128,7 @@ def _save(self, data: str) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/datasets/tracking/__init__.py b/kedro/extras/datasets/tracking/__init__.py new file mode 100644 index 0000000000..2b4d185ba8 --- /dev/null +++ b/kedro/extras/datasets/tracking/__init__.py @@ -0,0 +1,11 @@ +"""Dataset implementations to save data for Kedro Experiment Tracking""" + +__all__ = ["MetricsDataSet", "JSONDataSet"] + + +from contextlib import suppress + +with suppress(ImportError): + from kedro.extras.datasets.tracking.metrics_dataset import MetricsDataSet +with suppress(ImportError): + from kedro.extras.datasets.tracking.json_dataset import JSONDataSet diff --git a/kedro/extras/datasets/tracking/json_dataset.py b/kedro/extras/datasets/tracking/json_dataset.py new file mode 100644 index 0000000000..a77e162719 --- /dev/null +++ b/kedro/extras/datasets/tracking/json_dataset.py @@ -0,0 +1,48 @@ +"""``JSONDataSet`` saves data to a JSON file using an underlying +filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. +The ``JSONDataSet`` is part of Kedro Experiment Tracking. The dataset is versioned by default. +""" +from typing import NoReturn + +from kedro.extras.datasets.json import JSONDataSet as JDS +from kedro.io.core import DatasetError + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + + +class JSONDataSet(JDS): + """``JSONDataSet`` saves data to a JSON file using an underlying + filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. + The ``JSONDataSet`` is part of Kedro Experiment Tracking. + The dataset is write-only and it is versioned by default. + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + cars: + type: tracking.JSONDataSet + filepath: data/09_tracking/cars.json + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.tracking import JSONDataSet + >>> + >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} + >>> + >>> data_set = JSONDataSet(filepath="test.json") + >>> data_set.save(data) + + """ + + versioned = True + + def _load(self) -> NoReturn: + raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") diff --git a/kedro/extras/datasets/tracking/metrics_dataset.py b/kedro/extras/datasets/tracking/metrics_dataset.py new file mode 100644 index 0000000000..3b615b6d64 --- /dev/null +++ b/kedro/extras/datasets/tracking/metrics_dataset.py @@ -0,0 +1,69 @@ +"""``MetricsDataSet`` saves data to a JSON file using an underlying +filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. +The ``MetricsDataSet`` is part of Kedro Experiment Tracking. The dataset is versioned by default +and only takes metrics of numeric values. +""" +import json +from typing import Dict, NoReturn + +from kedro.extras.datasets.json import JSONDataSet +from kedro.io.core import DatasetError, get_filepath_str + +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + + +class MetricsDataSet(JSONDataSet): + """``MetricsDataSet`` saves data to a JSON file using an underlying + filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. The + ``MetricsDataSet`` is part of Kedro Experiment Tracking. The dataset is write-only, + it is versioned by default and only takes metrics of numeric values. + + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + cars: + type: metrics.MetricsDataSet + filepath: data/09_tracking/cars.json + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.tracking import MetricsDataSet + >>> + >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} + >>> + >>> data_set = MetricsDataSet(filepath="test.json") + >>> data_set.save(data) + + """ + + versioned = True + + def _load(self) -> NoReturn: + raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") + + def _save(self, data: Dict[str, float]) -> None: + """Converts all values in the data from a ``MetricsDataSet`` to float to make sure + they are numeric values which can be displayed in Kedro Viz and then saves the dataset. + """ + try: + for key, value in data.items(): + data[key] = float(value) + except ValueError as exc: + raise DatasetError( + f"The MetricsDataSet expects only numeric values. 
{exc}" + ) from exc + + save_path = get_filepath_str(self._get_save_path(), self._protocol) + + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + json.dump(data, fs_file, **self._save_args) + + self._invalidate_cache() diff --git a/kedro/extras/datasets/video/__init__.py b/kedro/extras/datasets/video/__init__.py new file mode 100644 index 0000000000..f5f7af9461 --- /dev/null +++ b/kedro/extras/datasets/video/__init__.py @@ -0,0 +1,5 @@ +"""Dataset implementation to load/save data from/to a video file.""" + +__all__ = ["VideoDataSet"] + +from kedro.extras.datasets.video.video_dataset import VideoDataSet diff --git a/kedro/extras/datasets/video/video_dataset.py b/kedro/extras/datasets/video/video_dataset.py new file mode 100644 index 0000000000..4aba723afa --- /dev/null +++ b/kedro/extras/datasets/video/video_dataset.py @@ -0,0 +1,356 @@ +"""``VideoDataSet`` loads/saves video data from an underlying +filesystem (e.g.: local, S3, GCS). It uses OpenCV VideoCapture to read +and decode videos and OpenCV VideoWriter to encode and write video. +""" +import itertools +import tempfile +from collections import abc +from copy import deepcopy +from pathlib import Path, PurePosixPath +from typing import Any, Dict, Generator, Optional, Sequence, Tuple, Union + +import cv2 +import fsspec +import numpy as np +import PIL.Image + +from kedro.io.core import AbstractDataSet, get_protocol_and_path + + +class SlicedVideo: + """A representation of slices of other video types""" + + def __init__(self, video, slice_indexes): + self.video = video + self.indexes = range(*slice_indexes.indices(len(video))) + + def __getitem__(self, index: Union[int, slice]) -> PIL.Image.Image: + if isinstance(index, slice): + return SlicedVideo(self, index) + return self.video[self.indexes[index]] + + def __len__(self) -> int: + return len(self.indexes) + + def __getattr__(self, item): + return getattr(self.video, item) + + +class AbstractVideo(abc.Sequence): + """Base class for the underlying video data""" + + _n_frames = 0 + _index = 0 # Next available frame + + @property + def fourcc(self) -> str: + """Get the codec fourcc specification""" + raise NotImplementedError() + + @property + def fps(self) -> float: + """Get the video frame rate""" + raise NotImplementedError() + + @property + def size(self) -> Tuple[int, int]: + """Get the resolution of the video""" + raise NotImplementedError() + + def __len__(self) -> int: + return self._n_frames + + def __getitem__(self, index: Union[int, slice]): + """Get a frame from the video""" + raise NotImplementedError() + + +class FileVideo(AbstractVideo): + """A video object read from a file""" + + def __init__(self, filepath: str) -> None: + self._filepath = filepath + self._cap = cv2.VideoCapture(filepath) + self._n_frames = self._get_length() + + @property + def fourcc(self) -> str: + fourcc = self._cap.get(cv2.CAP_PROP_FOURCC) + return int(fourcc).to_bytes(4, "little").decode("ascii") + + @property + def fps(self) -> float: + return self._cap.get(cv2.CAP_PROP_FPS) + + @property + def size(self) -> Tuple[int, int]: + width = int(self._cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(self._cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + return width, height + + def __getitem__(self, index: Union[int, slice]): + if isinstance(index, slice): + return SlicedVideo(self, index) + + if index < 0: + index += len(self) + if index >= len(self): + raise IndexError() + + if index != self._index: + self._cap.set(cv2.CAP_PROP_POS_FRAMES, index) + self._index = index + 1 # Next frame to decode 
after this + ret, frame_bgr = self._cap.read() + if not ret: + raise IndexError() + + height, width = frame_bgr.shape[:2] + return PIL.Image.frombuffer( # Convert to PIL image with RGB instead of BGR + "RGB", (width, height), frame_bgr, "raw", "BGR", 0, 0 + ) + + def _get_length(self) -> int: + # OpenCV's frame count might be an approximation depending on what + # headers are available in the video file + length = int(round(self._cap.get(cv2.CAP_PROP_FRAME_COUNT))) + if length >= 0: + return length + + # Getting the frame count with OpenCV can fail on some video files, + # counting the frames would be too slow so it is better to raise an exception. + raise ValueError( + "Failed to load video since number of frames can't be inferred" + ) + + +class SequenceVideo(AbstractVideo): + """A video object read from an indexable sequence of frames""" + + def __init__( + self, frames: Sequence[PIL.Image.Image], fps: float, fourcc: str = "mp4v" + ) -> None: + self._n_frames = len(frames) + self._frames = frames + self._fourcc = fourcc + self._size = frames[0].size + self._fps = fps + + @property + def fourcc(self) -> str: + return self._fourcc + + @property + def fps(self) -> float: + return self._fps + + @property + def size(self) -> Tuple[int, int]: + return self._size + + def __getitem__(self, index: Union[int, slice]): + if isinstance(index, slice): + return SlicedVideo(self, index) + return self._frames[index] + + +class GeneratorVideo(AbstractVideo): + """A video object with frames yielded by a generator""" + + def __init__( + self, + frames: Generator[PIL.Image.Image, None, None], + length, + fps: float, + fourcc: str = "mp4v", + ) -> None: + self._n_frames = length + first = next(frames) + self._gen = itertools.chain([first], frames) + self._fourcc = fourcc + self._size = first.size + self._fps = fps + + @property + def fourcc(self) -> str: + return self._fourcc + + @property + def fps(self) -> float: + return self._fps + + @property + def size(self) -> Tuple[int, int]: + return self._size + + def __getitem__(self, index: Union[int, slice]): + raise NotImplementedError("Underlying video is a generator") + + def __next__(self): + return next(self._gen) + + def __iter__(self): + return self + + +class VideoDataSet(AbstractDataSet[AbstractVideo, AbstractVideo]): + """``VideoDataSet`` loads / save video data from a given filepath as sequence + of PIL.Image.Image using OpenCV. + + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + cars: + type: video.VideoDataSet + filepath: data/01_raw/cars.mp4 + + motorbikes: + type: video.VideoDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.mp4 + credentials: dev_s3 + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.video import VideoDataSet + >>> import numpy as np + >>> + >>> video = VideoDataSet(filepath='/video/file/path.mp4').load() + >>> frame = video[0] + >>> np.sum(np.asarray(frame)) + + + Example creating a video from numpy frames using Python API: + :: + + >>> from kedro.extras.datasets.video.video_dataset import VideoDataSet, SequenceVideo + >>> import numpy as np + >>> from PIL import Image + >>> + >>> frame = np.ones((640,480,3), dtype=np.uint8) * 255 + >>> imgs = [] + >>> for i in range(255): + >>> imgs.append(Image.fromarray(frame)) + >>> frame -= 1 + >>> + >>> video = VideoDataSet("my_video.mp4") + >>> video.save(SequenceVideo(imgs, fps=25)) + + + Example creating a video from numpy frames using a generator and the Python API: + :: + + >>> from kedro.extras.datasets.video.video_dataset import VideoDataSet, GeneratorVideo + >>> import numpy as np + >>> from PIL import Image + >>> + >>> def gen(): + >>> frame = np.ones((640,480,3), dtype=np.uint8) * 255 + >>> for i in range(255): + >>> yield Image.fromarray(frame) + >>> frame -= 1 + >>> + >>> video = VideoDataSet("my_video.mp4") + >>> video.save(GeneratorVideo(gen(), fps=25, length=None)) + + """ + + def __init__( + self, + filepath: str, + fourcc: Optional[str] = "mp4v", + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of VideoDataSet to load / save video data for given filepath. + + Args: + filepath: The location of the video file to load / save data. + fourcc: The codec to use when writing video, note that depending on how opencv is + installed there might be more or less codecs avaiable. If set to None, the + fourcc from the video object will be used. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + """ + # parse the path and protocol (e.g. file, http, s3, etc.) + protocol, path = get_protocol_and_path(filepath) + self._protocol = protocol + self._filepath = PurePosixPath(path) + self._fourcc = fourcc + _fs_args = deepcopy(fs_args) or {} + _credentials = deepcopy(credentials) or {} + self._storage_options = {**_credentials, **_fs_args} + self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + + def _load(self) -> AbstractVideo: + """Loads data from the video file. + + Returns: + Data from the video file as a AbstractVideo object + """ + with fsspec.open( + f"filecache::{self._protocol}://{self._filepath}", + mode="rb", + **{self._protocol: self._storage_options}, + ) as fs_file: + return FileVideo(fs_file.name) + + def _save(self, data: AbstractVideo) -> None: + """Saves video data to the specified filepath.""" + if self._protocol == "file": + # Write directly to the local file destination + self._write_to_filepath(data, str(self._filepath)) + else: + # VideoWriter can't write to an open file object, instead write to a + # local tmpfile and then copy that to the destination with fsspec. 
+ # Note that the VideoWriter fails to write to the file on Windows if + # the file is already open, thus we can't use NamedTemporaryFile. + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_file = Path(tmp_dir) / self._filepath.name + self._write_to_filepath(data, str(tmp_file)) + with fsspec.open( + f"{self._protocol}://{self._filepath}", + "wb", + **self._storage_options, + ) as f_target: + with tmp_file.open("r+b") as f_tmp: + f_target.write(f_tmp.read()) + + def _write_to_filepath(self, video: AbstractVideo, filepath: str) -> None: + # TODO: This uses the codec specified in the VideoDataSet if it is not None, this is due + # to compatibility issues since e.g. h264 coded is licensed and is thus not included in + # opencv if installed from a binary distribution. Since a h264 video can be read, but not + # written, it would be error prone to use the videos fourcc code. Further, an issue is + # that the video object does not know what container format will be used since that is + # selected by the suffix in the file name of the VideoDataSet. Some combinations of codec + # and container format might not work or will have bad support. + fourcc = self._fourcc or video.fourcc + + writer = cv2.VideoWriter( + filepath, cv2.VideoWriter_fourcc(*fourcc), video.fps, video.size + ) + if not writer.isOpened(): + raise ValueError( + "Failed to open video writer with params: " + + f"fourcc={fourcc} fps={video.fps} size={video.size[0]}x{video.size[1]} " + + f"path={filepath}" + ) + try: + for frame in iter(video): + writer.write( # PIL images are RGB, opencv expects BGR + np.asarray(frame)[:, :, ::-1] + ) + finally: + writer.release() + + def _describe(self) -> Dict[str, Any]: + return {"filepath": self._filepath, "protocol": self._protocol} + + def _exists(self) -> bool: + return self._fs.exists(self._filepath) diff --git a/kedro/extras/datasets/yaml/__init__.py b/kedro/extras/datasets/yaml/__init__.py index b443fd271b..b3780de3a6 100644 --- a/kedro/extras/datasets/yaml/__init__.py +++ b/kedro/extras/datasets/yaml/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
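[Editor's note] Since ``FileVideo.__getitem__`` above returns a ``SlicedVideo`` for slice indices, a loaded video can be windowed lazily and written back out. A minimal sketch with placeholder file names:

```python
# Sketch only: slice a loaded video and re-encode the clip. File names are placeholders.
from kedro.extras.datasets.video import VideoDataSet

video = VideoDataSet(filepath="data/01_raw/recording.mp4").load()  # FileVideo
clip = video[25:100]        # SlicedVideo; frames are decoded only when accessed
print(len(clip), clip.fps)  # fps/size/fourcc are delegated to the underlying video

# SlicedVideo supports len() and integer indexing, so iter() in _save can walk it
# and the clip can be fed straight back into save().
VideoDataSet(filepath="data/02_intermediate/clip.mp4").save(clip)
```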
- """``AbstractDataSet`` implementation to load/save data from/to a YAML file.""" __all__ = ["YAMLDataSet"] @@ -33,4 +5,4 @@ from contextlib import suppress with suppress(ImportError): - from .yaml_dataset import YAMLDataSet # NOQA + from .yaml_dataset import YAMLDataSet diff --git a/kedro/extras/datasets/yaml/yaml_dataset.py b/kedro/extras/datasets/yaml/yaml_dataset.py index e7784a76f1..91c6b474cf 100644 --- a/kedro/extras/datasets/yaml/yaml_dataset.py +++ b/kedro/extras/datasets/yaml/yaml_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``YAMLDataSet`` loads/saves data from/to a YAML file using an underlying filesystem (e.g.: local, S3, GCS). It uses PyYAML to handle the YAML file. """ @@ -38,25 +10,40 @@ from kedro.io.core import ( AbstractVersionedDataSet, - DataSetError, + DatasetError, Version, get_filepath_str, get_protocol_and_path, ) +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + -class YAMLDataSet(AbstractVersionedDataSet): +class YAMLDataSet(AbstractVersionedDataSet[Dict, Dict]): """``YAMLDataSet`` loads/saves data from/to a YAML file using an underlying filesystem (e.g.: local, S3, GCS). It uses PyYAML to handle the YAML file. - Example: + Example usage for the + `YAML API `_: + + .. 
code-block:: yaml + + cars: + type: yaml.YAMLDataSet + filepath: cars.yaml + + Example usage for the + `Python API `_: :: >>> from kedro.extras.datasets.yaml import YAMLDataSet >>> >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} >>> - >>> # data_set = YAMLDataSet(filepath="gcs://bucket/test.yaml") >>> data_set = YAMLDataSet(filepath="test.yaml") >>> data_set.save(data) >>> reloaded = data_set.load() @@ -66,8 +53,7 @@ class YAMLDataSet(AbstractVersionedDataSet): DEFAULT_SAVE_ARGS = {"default_flow_style": False} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, save_args: Dict[str, Any] = None, @@ -131,12 +117,12 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> Dict: load_path = get_filepath_str(self._get_load_path(), self._protocol) @@ -154,7 +140,7 @@ def _save(self, data: Dict) -> None: def _exists(self) -> bool: try: load_path = get_filepath_str(self._get_load_path(), self._protocol) - except DataSetError: + except DatasetError: return False return self._fs.exists(load_path) diff --git a/kedro/extras/decorators/README.md b/kedro/extras/decorators/README.md deleted file mode 100644 index 143bee23bd..0000000000 --- a/kedro/extras/decorators/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Decorators - -Welcome to `kedro.extras.decorators`, the home of Kedro's node and pipeline decorators, which enable additional functionality by wrapping your functions, for example: - - Retry nodes that have failed to run - - Profile how much memory is being consumed by a node - -Further information on [node and pipeline decorators](https://kedro.readthedocs.io/en/stable/03_tutorial/04_create_pipelines.html#using-decorators-for-nodes-and-pipelines) has been added to the documentation. Before writing a decorator to implement a certain functionality that interacts a pipeline or node lifecycle event, you may want to consider using [Hooks](https://kedro.readthedocs.io/en/latest/04_user_guide/15_hooks.html) instead. - -## What decorators are currently supported? -View a full list of supported decorators [**here**](https://kedro.readthedocs.io/en/stable/kedro.extras.decorators.html). - -Examples of decorators supported include: - - **A retry decorator**: A function decorator which catches exceptions from the wrapped function at most `n_times`, after which it bundles and propagates them. By default, all exceptions are caught, but you can narrow your scope using the `exceptions` argument. You can also specify the time delay (in seconds) between a failure and the next retry, using the `delay_sec` parameter. - - **A node and pipeline memory profiler**: A function decorator which profiles the memory used when executing the function. The logged memory is collected by taking memory snapshots every 100ms, and includes memory used by children processes. The implementation uses the `memory_profiler` Python package under the hood. - -> _Note_: The node and pipeline memory profiler will only work on functions that take longer than 0.5s to execute, see [class documentation](memory_profiler.py) for more details. - -### What pre-requisites are required for the node and pipeline memory profiler? 
- -On Unix-like operating systems, you will need to install a C-compiler and related build tools for your platform. - - #### macOS - To install Command Line Tools for Xcode, run the following from the terminal: - - ```bash - xcode-select --install - ``` - - #### GNU / Linux - - ##### Debian/Ubuntu - - The following command (run with root permissions) will install the `build-essential` metapackage for Debian-based distributions: - - ```bash - apt-get update && apt-get install build-essential - ``` - - ##### Red Hat Enterprise Linux / Centos - The following command (run with root permissions) will install the "Develop Tools" group of packages on RHEL / Centos: - - ```bash - yum groupinstall 'Development Tools' - ``` diff --git a/kedro/extras/decorators/__init__.py b/kedro/extras/decorators/__init__.py deleted file mode 100644 index e980b95426..0000000000 --- a/kedro/extras/decorators/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``kedro.extras.decorators`` provides Node/Pipeline Decorators.""" diff --git a/kedro/extras/decorators/memory_profiler.py b/kedro/extras/decorators/memory_profiler.py deleted file mode 100644 index f5d6e12aef..0000000000 --- a/kedro/extras/decorators/memory_profiler.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This module contains function decorators for memory-profiler, which can -be used as ``Node`` decorators. See ``kedro.pipeline.node.decorate`` -""" -import logging -from functools import wraps -from typing import Callable - -from kedro.pipeline.decorators import _func_full_name - -try: - from memory_profiler import memory_usage -except ImportError as exc: - raise ImportError( - f"{exc}: `pip install kedro[profilers]` to get the required " - "memory profiler dependencies." - ) from exc - - -def mem_profile(func: Callable) -> Callable: - """A function decorator which profiles the memory used when executing the - function. The logged memory is collected by using the memory_profiler - python module and includes memory used by children processes. The usage - is collected by taking memory snapshots every 100ms. This decorator will - only work with functions taking at least 0.5s to execute due to a bug in - the memory_profiler python module. For more information about the bug, - please see https://github.com/pythonprofilers/memory_profiler/issues/216 - - Args: - func: The function to be profiled. - - Returns: - A wrapped function, which will execute the provided function and log - its max memory usage upon completion. - - """ - - @wraps(func) - def with_memory(*args, **kwargs): - log = logging.getLogger(__name__) - mem_usage, result = memory_usage( - (func, args, kwargs), - interval=0.1, - timeout=1, - max_usage=True, - retval=True, - include_children=True, - ) - # memory_profiler < 0.56.0 returns list instead of float - mem_usage = mem_usage[0] if isinstance(mem_usage, (list, tuple)) else mem_usage - log.info( - "Running %r consumed %2.2fMiB memory at peak time", - _func_full_name(func), - mem_usage, - ) - return result - - return with_memory diff --git a/kedro/extras/decorators/retry_node.py b/kedro/extras/decorators/retry_node.py deleted file mode 100644 index 7109bbb970..0000000000 --- a/kedro/extras/decorators/retry_node.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This module contains the retry decorator, which can be used as -``Node`` decorators to retry nodes. See ``kedro.pipeline.node.decorate`` -""" - -import logging -from functools import wraps -from time import sleep -from typing import Callable, Type - - -def retry( - exceptions: Type[Exception] = Exception, n_times: int = 1, delay_sec: float = 0 -) -> Callable: - """ - Catches exceptions from the wrapped function at most n_times and then - bundles and propagates them. - - **Make sure your function does not mutate the arguments** - - Args: - exceptions: The superclass of exceptions to catch. - By default catch all exceptions. - n_times: At most let the function fail n_times. The bundle the - errors and propagate them. By default retry only once. - delay_sec: Delay between failure and next retry in seconds - - Returns: - The original function with retry functionality. - - """ - - def _retry(func: Callable): - @wraps(func) - def _wrapper(*args, **kwargs): - counter = n_times - errors = [] - while counter >= 0: - try: - return func(*args, **kwargs) - # pylint: disable=broad-except - except exceptions as exc: - errors.append(exc) - if counter != 0: - sleep(delay_sec) - counter -= 1 - - if errors: - log = logging.getLogger(__name__) - log.error( - "Function `%s` failed %i times. Errors:\n", func.__name__, n_times - ) - log.error("\n".join(str(err) for err in errors)) - log.error("Raising last exception") - raise errors[-1] - - return _wrapper - - return _retry diff --git a/kedro/extras/extensions/__init__.py b/kedro/extras/extensions/__init__.py deleted file mode 100644 index bb6fa0c5e6..0000000000 --- a/kedro/extras/extensions/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This module contains an IPython extension. -""" diff --git a/kedro/extras/extensions/ipython.py b/kedro/extras/extensions/ipython.py deleted file mode 100644 index 1b3bcb6a15..0000000000 --- a/kedro/extras/extensions/ipython.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=import-outside-toplevel,global-statement,invalid-name -""" -This script creates an IPython extension to load Kedro-related variables in -local scope. -""" -import logging.config -import sys -from pathlib import Path -from typing import Any, Dict - -from IPython import get_ipython -from IPython.core.magic import needs_local_scope, register_line_magic - -startup_path = Path.cwd() -project_path = startup_path - - -def _remove_cached_modules(package_name): - to_remove = [mod for mod in sys.modules if mod.startswith(package_name)] - # `del` is used instead of `reload()` because: If the new version of a module does not - # define a name that was defined by the old version, the old definition remains. 
- for module in to_remove: - del sys.modules[module] # pragma: no cover - - -def _clear_hook_manager(): - from kedro.framework.hooks import get_hook_manager - - hook_manager = get_hook_manager() - name_plugin_pairs = hook_manager.list_name_plugin() - for name, plugin in name_plugin_pairs: - hook_manager.unregister(name=name, plugin=plugin) # pragma: no cover - - -def _find_kedro_project(current_dir): # pragma: no cover - from kedro.framework.startup import _is_project - - while current_dir != current_dir.parent: - if _is_project(current_dir): - return current_dir - current_dir = current_dir.parent - - return None - - -def reload_kedro(path, env: str = None, extra_params: Dict[str, Any] = None): - """Line magic which reloads all Kedro default variables.""" - - import kedro.config.default_logger # noqa: F401 # pylint: disable=unused-import - from kedro.framework.cli import load_entry_points - from kedro.framework.session import KedroSession - from kedro.framework.session.session import _activate_session - from kedro.framework.startup import bootstrap_project - - _clear_hook_manager() - - path = path or project_path - metadata = bootstrap_project(path) - - _remove_cached_modules(metadata.package_name) - - session = KedroSession.create( - metadata.package_name, path, env=env, extra_params=extra_params - ) - _activate_session(session, force=True) - logging.debug("Loading the context from %s", str(path)) - context = session.load_context() - catalog = context.catalog - - get_ipython().push( - variables={"context": context, "catalog": catalog, "session": session} - ) - - logging.info("** Kedro project %s", str(metadata.project_name)) - logging.info("Defined global variable `context`, `session` and `catalog`") - - for line_magic in load_entry_points("line_magic"): - register_line_magic(needs_local_scope(line_magic)) - logging.info("Registered line magic `%s`", line_magic.__name__) # type: ignore - - -def init_kedro(path=""): - """Line magic to set path to Kedro project. - `%reload_kedro` will default to this location. - """ - global project_path - if path: - project_path = Path(path).expanduser().resolve() - logging.info("Updated path to Kedro project: %s", str(project_path)) - else: - logging.info("No path argument was provided. Using: %s", str(project_path)) - - -def load_ipython_extension(ipython): - """Main entry point when %load_ext is executed""" - - global project_path - global startup_path - - ipython.register_magic_function(init_kedro, "line") - ipython.register_magic_function(reload_kedro, "line", "reload_kedro") - - project_path = _find_kedro_project(startup_path) - - try: - reload_kedro(project_path) - except (ImportError, ModuleNotFoundError): - logging.error("Kedro appears not to be installed in your current environment.") - except Exception: # pylint: disable=broad-except - logging.warning( - "Kedro extension was registered. Make sure you pass the project path to " - "`%reload_kedro` or set it using `%init_kedro`." - ) diff --git a/kedro/extras/logging/__init__.py b/kedro/extras/logging/__init__.py deleted file mode 100644 index 337060fb10..0000000000 --- a/kedro/extras/logging/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This module contains a logging handler class which produces coloured logs. -""" - -from .color_logger import ColorHandler # NOQA diff --git a/kedro/extras/logging/color_logger.py b/kedro/extras/logging/color_logger.py deleted file mode 100644 index f208fd56bb..0000000000 --- a/kedro/extras/logging/color_logger.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -"""A logging handler class which produces coloured logs.""" - - -import logging - -import click - - -class ColorHandler(logging.StreamHandler): - """A color log handler. 
- - You can use this handler by incorporating the example below into your - logging configuration: - - ``conf/project/logging.yml``: - :: - - formatters: - simple: - format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - - handlers: - console: - class: kedro.extras.logging.ColorHandler - level: INFO - formatter: simple - stream: ext://sys.stdout - # defining colors is optional - colors: - debug: white - info: magenta - warning: yellow - - root: - level: INFO - handlers: [console] - - The ``colors`` parameter is optional, and you can use any ANSI color. - - * Black - * Red - * Green - * Yellow - * Blue - * Magenta - * Cyan - * White - - The default colors are: - - * debug: magenta - * info: cyan - * warning: yellow - * error: red - * critical: red - """ - - def __init__(self, stream=None, colors=None): - logging.StreamHandler.__init__(self, stream) - colors = colors or {} - self.colors = { - "critical": colors.get("critical", "red"), - "error": colors.get("error", "red"), - "warning": colors.get("warning", "yellow"), - "info": colors.get("info", "cyan"), - "debug": colors.get("debug", "magenta"), - } - - def _get_color(self, level): - if level >= logging.CRITICAL: - return self.colors["critical"] # pragma: no cover - if level >= logging.ERROR: - return self.colors["error"] # pragma: no cover - if level >= logging.WARNING: - return self.colors["warning"] # pragma: no cover - if level >= logging.INFO: - return self.colors["info"] - if level >= logging.DEBUG: # pragma: no cover - return self.colors["debug"] # pragma: no cover - - return None # pragma: no cover - - def format(self, record: logging.LogRecord) -> str: - """The handler formatter. - - Args: - record: The record to format. - - Returns: - The record formatted as a string. - - """ - text = logging.StreamHandler.format(self, record) - color = self._get_color(record.levelno) - return click.style(text, color) diff --git a/kedro/extras/transformers/README.md b/kedro/extras/transformers/README.md deleted file mode 100644 index 9cbe3fb6fb..0000000000 --- a/kedro/extras/transformers/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# Transformers - -Welcome to `kedro.extras.transformers`, the home of Kedro's dataset transformers. Transformers intercept the load and save operations on Kedro datasets. Use cases that transformers enable include: - - Performing data validation, - - Tracking operation performance, - - And, converting data between formats (although we would recommend [transcoding](https://kedro.readthedocs.io/en/stable/04_user_guide/04_data_catalog.html#transcoding-datasets) for this). - -Further information on [transformers](https://kedro.readthedocs.io/en/stable/04_user_guide/04_data_catalog.html#transforming-datasets) has been added to the documentation. - -## What transformers are currently supported? -View a full list of supported transformers [**here**](https://kedro.readthedocs.io/en/stable/kedro.extras.transformers.html). - -Examples of transformers supported include: - - **A dataset time profiler**: A transformer that logs the runtime of data set load and save calls - - **A dataset memory profiler**: A transformer that logs the maximum memory consumption during load and save calls - -### What pre-requisites are required for the dataset memory profiler? - -On Unix-like operating systems, you will need to install a C-compiler and related build tools for your platform. 
- - #### macOS - To install `Command Line Tools for Xcode`, run the following from the terminal: - - ```bash - xcode-select --install - ``` - - #### GNU / Linux - - ##### Debian / Ubuntu - - The following command (run with root permissions) will install the `build-essential` metapackage for Debian-based distributions: - - ```bash - apt-get update && apt-get install build-essential - ``` - - ##### Red Hat Enterprise Linux / Centos - The following command (run with root permissions) will install the "Develop Tools" group of packages on RHEL / Centos: - - ```bash - yum groupinstall 'Development Tools' - ``` diff --git a/kedro/extras/transformers/__init__.py b/kedro/extras/transformers/__init__.py deleted file mode 100644 index 302136e01f..0000000000 --- a/kedro/extras/transformers/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``kedro.extras.transformers`` is the home of Kedro's dataset transformers.""" - -from .memory_profiler import ProfileMemoryTransformer # NOQA -from .time_profiler import ProfileTimeTransformer # NOQA diff --git a/kedro/extras/transformers/memory_profiler.py b/kedro/extras/transformers/memory_profiler.py deleted file mode 100644 index a4bb240540..0000000000 --- a/kedro/extras/transformers/memory_profiler.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``Transformers`` modify the loading and saving of ``DataSets`` in a -``DataCatalog``. -""" - -import logging -from typing import Any, Callable - -from kedro.io import AbstractTransformer - -try: - from memory_profiler import memory_usage -except ImportError as exc: - raise ImportError( - f"{exc}: `pip install kedro[profilers]` to get the required " - "memory profiler dependencies." - ) from exc - - -def _normalise_mem_usage(mem_usage): - # memory_profiler < 0.56.0 returns list instead of float - return mem_usage[0] if isinstance(mem_usage, (list, tuple)) else mem_usage - - -class ProfileMemoryTransformer(AbstractTransformer): - """A transformer that logs the maximum memory consumption during load and save calls.""" - - @property - def _logger(self): - return logging.getLogger(self.__class__.__name__) - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - mem_usage, data = memory_usage( - (load, [], {}), - interval=0.1, - max_usage=True, - retval=True, - include_children=True, - ) - mem_usage = _normalise_mem_usage(mem_usage) - - self._logger.info( - "Loading %s consumed %2.2fMiB memory at peak time", data_set_name, mem_usage - ) - return data - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - mem_usage = memory_usage( - (save, [data], {}), - interval=0.1, - max_usage=True, - retval=False, - include_children=True, - ) - mem_usage = _normalise_mem_usage(mem_usage) - - self._logger.info( - "Saving %s consumed %2.2fMiB memory at peak time", data_set_name, mem_usage - ) diff --git a/kedro/extras/transformers/time_profiler.py b/kedro/extras/transformers/time_profiler.py deleted file mode 100644 index 3683a4f826..0000000000 --- a/kedro/extras/transformers/time_profiler.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``Transformers`` modify the loading and saving of ``DataSets`` in a -``DataCatalog``. -""" - -import logging -import time -from typing import Any, Callable - -from kedro.io import AbstractTransformer - - -class ProfileTimeTransformer(AbstractTransformer): - """A transformer that logs the runtime of data set load and save calls.""" - - @property - def _logger(self): - return logging.getLogger("ProfileTimeTransformer") - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - start = time.time() - data = load() - self._logger.info( - "Loading %s took %0.3f seconds", data_set_name, time.time() - start - ) - return data - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - start = time.time() - save(data) - self._logger.info( - "Saving %s took %0.3f seconds", data_set_name, time.time() - start - ) diff --git a/kedro/framework/__init__.py b/kedro/framework/__init__.py index 80667105ac..0de0f8cbf9 100644 --- a/kedro/framework/__init__.py +++ b/kedro/framework/__init__.py @@ -1,28 +1 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """``kedro.framework`` provides Kedro's framework components """ diff --git a/kedro/framework/cli/__init__.py b/kedro/framework/cli/__init__.py index ef812fc25b..fcbb427ef7 100644 --- a/kedro/framework/cli/__init__.py +++ b/kedro/framework/cli/__init__.py @@ -1,33 +1,7 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.framework.cli`` implements commands available from Kedro's CLI. """ -from .cli import main # NOQA -from .utils import command_with_verbosity, load_entry_points # NOQA +from .cli import main +from .utils import command_with_verbosity, load_entry_points + +__all__ = ["main", "command_with_verbosity", "load_entry_points"] diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index ed19afc0ff..6129732042 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- """A collection of CLI commands for working with Kedro catalog.""" from collections import defaultdict @@ -41,15 +13,10 @@ def _create_session(package_name: str, **kwargs): kwargs.setdefault("save_on_close", False) - try: - return KedroSession.create(package_name, **kwargs) - except Exception as exc: - raise KedroCliError( - f"Unable to instantiate Kedro session.\nError: {exc}" - ) from exc + return KedroSession.create(package_name, **kwargs) -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def catalog_cli(): # pragma: no cover pass @@ -60,11 +27,12 @@ def catalog(): """Commands for working with catalog.""" -# pylint: disable=too-many-locals +# noqa: too-many-locals @catalog.command("list") @env_option @click.option( "--pipeline", + "-p", type=str, default="", help="Name of the modular pipeline to run. If not set, " @@ -74,14 +42,19 @@ def catalog(): @click.pass_obj def list_datasets(metadata: ProjectMetadata, pipeline, env): """Show datasets per type.""" - title = "DataSets in '{}' pipeline" + title = "Datasets in '{}' pipeline" not_mentioned = "Datasets not mentioned in pipeline" mentioned = "Datasets mentioned in pipeline" session = _create_session(metadata.package_name, env=env) context = session.load_context() - datasets_meta = context.catalog._data_sets # pylint: disable=protected-access - catalog_ds = set(context.catalog.list()) + try: + datasets_meta = context.catalog._data_sets # noqa: protected-access + catalog_ds = set(context.catalog.list()) + except Exception as exc: + raise KedroCliError( + f"Unable to instantiate Kedro Catalog.\nError: {exc}" + ) from exc target_pipelines = pipeline or pipelines.keys() @@ -93,7 +66,7 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env): else: existing_pls = ", ".join(sorted(pipelines.keys())) raise KedroCliError( - f"`{pipe}` pipeline not found! Existing pipelines: {existing_pls}" + f"'{pipe}' pipeline not found! Existing pipelines: {existing_pls}" ) unused_ds = catalog_ds - pipeline_ds @@ -104,7 +77,7 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env): used_by_type = _map_type_to_datasets(used_ds, datasets_meta) if default_ds: - used_by_type["DefaultDataSet"].extend(default_ds) + used_by_type["DefaultDataset"].extend(default_ds) data = ((not_mentioned, dict(unused_by_type)), (mentioned, dict(used_by_type))) result[title.format(pipe)] = {key: value for key, value in data if value} @@ -130,6 +103,7 @@ def _map_type_to_datasets(datasets, datasets_meta): @env_option(help="Environment to create Data Catalog YAML file in. Defaults to `base`.") @click.option( "--pipeline", + "-p", "pipeline_name", type=str, required=True, @@ -139,9 +113,9 @@ def _map_type_to_datasets(datasets, datasets_meta): def create_catalog(metadata: ProjectMetadata, pipeline_name, env): """Create Data Catalog YAML configuration with missing datasets. - Add `MemoryDataSet` datasets to Data Catalog YAML configuration file - for each dataset in a registered pipeline if it is missing from - the `DataCatalog`. + Add ``MemoryDataset`` datasets to Data Catalog YAML configuration + file for each dataset in a registered pipeline if it is missing from + the ``DataCatalog``. The catalog configuration will be saved to `//catalog/.yml` file. @@ -155,7 +129,7 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name, env): if not pipeline: existing_pipelines = ", ".join(sorted(pipelines.keys())) raise KedroCliError( - f"`{pipeline_name}` pipeline not found! 
Existing pipelines: {existing_pipelines}" + f"'{pipeline_name}' pipeline not found! Existing pipelines: {existing_pipelines}" ) pipe_datasets = { @@ -166,7 +140,7 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name, env): catalog_datasets = { ds_name - for ds_name in context.catalog._data_sets.keys() # pylint: disable=protected-access + for ds_name in context.catalog._data_sets.keys() # noqa: protected-access if not ds_name.startswith("params:") and ds_name != "parameters" } @@ -193,7 +167,7 @@ def _add_missing_datasets_to_catalog(missing_ds, catalog_path): catalog_config = {} for ds_name in missing_ds: - catalog_config[ds_name] = {"type": "MemoryDataSet"} + catalog_config[ds_name] = {"type": "MemoryDataset"} # Create only `catalog` folder under existing environment # (all parent folders must exist). diff --git a/kedro/framework/cli/cli.py b/kedro/framework/cli/cli.py index c95dbae568..304fb6b4bc 100644 --- a/kedro/framework/cli/cli.py +++ b/kedro/framework/cli/cli.py @@ -1,51 +1,20 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """kedro is a CLI for managing Kedro projects. This module implements commands available from the kedro CLI. 
""" import importlib -import webbrowser +import sys from collections import defaultdict from pathlib import Path from typing import Sequence import click -import pkg_resources -from click.utils import get_os_args -# pylint: disable=unused-import -import kedro.config.default_logger # noqa from kedro import __version__ as version from kedro.framework.cli.catalog import catalog_cli -from kedro.framework.cli.hooks import CLIHooksManager +from kedro.framework.cli.hooks import get_cli_hook_manager from kedro.framework.cli.jupyter import jupyter_cli +from kedro.framework.cli.micropkg import micropkg_cli from kedro.framework.cli.pipeline import pipeline_cli from kedro.framework.cli.project import project_group from kedro.framework.cli.registry import registry_cli @@ -55,8 +24,10 @@ ENTRY_POINT_GROUPS, CommandCollection, KedroCliError, + _get_entry_points, load_entry_points, ) +from kedro.framework.project import LOGGING # noqa # noqa: unused-import from kedro.framework.startup import _is_project, bootstrap_project LOGO = rf""" @@ -84,18 +55,17 @@ def info(): """Get more information about kedro.""" click.secho(LOGO, fg="green") click.echo( - "kedro allows teams to create analytics\n" - "projects. It is developed as part of\n" - "the Kedro initiative at QuantumBlack." + "Kedro is a Python framework for\n" + "creating reproducible, maintainable\n" + "and modular data science code." ) plugin_versions = {} plugin_entry_points = defaultdict(set) - for plugin_entry_point, group in ENTRY_POINT_GROUPS.items(): - for entry_point in pkg_resources.iter_entry_points(group=group): - module_name = entry_point.module_name.split(".")[0] - plugin_version = pkg_resources.get_distribution(module_name).version - plugin_versions[module_name] = plugin_version + for plugin_entry_point in ENTRY_POINT_GROUPS: + for entry_point in _get_entry_points(plugin_entry_point): + module_name = entry_point.module.split(".")[0] + plugin_versions[module_name] = entry_point.dist.version plugin_entry_points[module_name].add(plugin_entry_point) click.echo() @@ -110,24 +80,10 @@ def info(): click.echo("No plugins installed") -@cli.command(short_help="See the kedro API docs and introductory tutorial.") -def docs(): - """Display the API docs and introductory tutorial in the browser, - using the packaged HTML doc files.""" - html_path = str((Path(__file__).parent.parent / "html" / "index.html").resolve()) - index_path = f"file://{html_path}" - click.echo(f"Opening {index_path}") - webbrowser.open(index_path) - - -def _init_plugins(): - group = ENTRY_POINT_GROUPS["init"] - for entry_point in pkg_resources.iter_entry_points(group=group): - try: - init_hook = entry_point.load() - init_hook() - except Exception as exc: - raise KedroCliError(f"Initializing {entry_point}") from exc +def _init_plugins() -> None: + init_hooks = load_entry_points("init") + for init_hook in init_hooks: + init_hook() class KedroCLI(CommandCollection): @@ -139,7 +95,7 @@ def __init__(self, project_path: Path): self._metadata = None # running in package mode if _is_project(project_path): self._metadata = bootstrap_project(project_path) - self._cli_hook_manager = CLIHooksManager() + self._cli_hook_manager = get_cli_hook_manager() super().__init__( ("Global commands", self.global_groups), @@ -160,19 +116,26 @@ def main( # This is how click's internals parse sys.argv, which include the command, # subcommand, arguments and options. click doesn't store this information anywhere # so we have to re-do it. 
- # https://github.com/pallets/click/blob/master/src/click/core.py#L942-L945 - args = get_os_args() if args is None else list(args) - self._cli_hook_manager.hook.before_command_run( # pylint: disable=no-member + args = sys.argv[1:] if args is None else list(args) + self._cli_hook_manager.hook.before_command_run( project_metadata=self._metadata, command_args=args ) - super().main( - args=args, - prog_name=prog_name, - complete_var=complete_var, - standalone_mode=standalone_mode, - **extra, - ) + try: + super().main( + args=args, + prog_name=prog_name, + complete_var=complete_var, + standalone_mode=standalone_mode, + **extra, + ) + # click.core.main() method exits by default, we capture this and then + # exit as originally intended + except SystemExit as exc: + self._cli_hook_manager.hook.after_command_run( + project_metadata=self._metadata, command_args=args, exit_code=exc.code + ) + sys.exit(exc.code) @property def global_groups(self) -> Sequence[click.MultiCommand]: @@ -184,18 +147,25 @@ def global_groups(self) -> Sequence[click.MultiCommand]: @property def project_groups(self) -> Sequence[click.MultiCommand]: - # pylint: disable=line-too-long + # noqa: line-too-long """Property which loads all project command groups from the project and the plugins, then combines them with the built-in ones. Built-in commands can be overridden by plugins, which can be overridden by a custom project cli.py. - See https://kedro.readthedocs.io/en/stable/07_extend_kedro/01_common_use_cases.html#use-case-3-how-to-add-or-modify-cli-commands + See https://kedro.readthedocs.io/en/stable/extend_kedro/common_use_cases.html#use-case-3-how-to-add-or-modify-cli-commands on how to add this. """ if not self._metadata: return [] - built_in = [catalog_cli, jupyter_cli, pipeline_cli, project_group, registry_cli] + built_in = [ + catalog_cli, + jupyter_cli, + pipeline_cli, + micropkg_cli, + project_group, + registry_cli, + ] plugins = load_entry_points("project") diff --git a/kedro/framework/cli/hooks/__init__.py b/kedro/framework/cli/hooks/__init__.py index bb6a1a2a98..7a6ed8a52d 100644 --- a/kedro/framework/cli/hooks/__init__.py +++ b/kedro/framework/cli/hooks/__init__.py @@ -1,30 +1,5 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
-# -# See the License for the specific language governing permissions and -# limitations under the License. """``kedro.framework.cli.hooks`` provides primitives to use hooks to extend KedroCLI's behaviour""" -from .manager import CLIHooksManager # NOQA -from .markers import cli_hook_impl # NOQA +from .manager import CLIHooksManager, get_cli_hook_manager +from .markers import cli_hook_impl + +__all__ = ["CLIHooksManager", "cli_hook_impl", "get_cli_hook_manager"] diff --git a/kedro/framework/cli/hooks/manager.py b/kedro/framework/cli/hooks/manager.py index e9020a580f..a1be3e5784 100644 --- a/kedro/framework/cli/hooks/manager.py +++ b/kedro/framework/cli/hooks/manager.py @@ -1,31 +1,5 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """This module defines a dedicated hook manager for hooks that extends Kedro CLI behaviour.""" +# noqa: global-statement,invalid-name import logging from pluggy import PluginManager @@ -33,9 +7,23 @@ from .markers import CLI_HOOK_NAMESPACE from .specs import CLICommandSpecs +logger = logging.getLogger(__name__) + +_cli_hook_manager = None + _CLI_PLUGIN_HOOKS = "kedro.cli_hooks" +def get_cli_hook_manager(): + """Create or return the global _hook_manager singleton instance.""" + global _cli_hook_manager # noqa: PLW0603 + if _cli_hook_manager is None: + _cli_hook_manager = CLIHooksManager() + _cli_hook_manager.trace.root.setwriter(logger.debug) + _cli_hook_manager.enable_tracing() + return _cli_hook_manager + + class CLIHooksManager(PluginManager): """Hooks manager to manage CLI hooks""" @@ -47,7 +35,7 @@ def __init__(self) -> None: def _register_cli_hooks_setuptools(self) -> None: """Register CLI hook implementations from setuptools entrypoints""" already_registered = self.get_plugins() - num_cli_hooks_found = self.load_setuptools_entrypoints(_CLI_PLUGIN_HOOKS) + self.load_setuptools_entrypoints(_CLI_PLUGIN_HOOKS) # Get list of plugin/distinfo tuples for all setuptools registered plugins. 
plugininfo = self.list_plugin_distinfo() @@ -58,8 +46,8 @@ def _register_cli_hooks_setuptools(self) -> None: } if plugin_names: - logging.info( + logger.debug( "Registered CLI hooks from %d installed plugin(s): %s", - num_cli_hooks_found, + len(plugin_names), ", ".join(sorted(plugin_names)), ) diff --git a/kedro/framework/cli/hooks/markers.py b/kedro/framework/cli/hooks/markers.py index ea3cccc420..dca1769e5e 100644 --- a/kedro/framework/cli/hooks/markers.py +++ b/kedro/framework/cli/hooks/markers.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """This module provides markers to declare Kedro CLI's hook specs and implementations. For more information, please see [Pluggy's documentation](https://pluggy.readthedocs.io/en/stable/#marking-hooks). diff --git a/kedro/framework/cli/hooks/specs.py b/kedro/framework/cli/hooks/specs.py index 55fad40739..cc8c23a9f2 100644 --- a/kedro/framework/cli/hooks/specs.py +++ b/kedro/framework/cli/hooks/specs.py @@ -1,36 +1,8 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """A module containing specifications for all callable hooks in the Kedro CLI's execution timeline. For more information about these specifications, please visit [Pluggy's documentation](https://pluggy.readthedocs.io/en/stable/#specs) """ -from typing import List +from __future__ import annotations from kedro.framework.startup import ProjectMetadata @@ -44,7 +16,7 @@ class CLICommandSpecs: def before_command_run( self, project_metadata: ProjectMetadata, - command_args: List[str], + command_args: list[str], ): """Hooks to be invoked before a CLI command runs. It receives the ``project_metadata`` as well as @@ -56,3 +28,19 @@ def before_command_run( command_args: The command line arguments that were used. """ pass + + @cli_hook_spec + def after_command_run( + self, project_metadata: ProjectMetadata, command_args: list[str], exit_code: int + ): + """Hooks to be invoked after a CLI command runs. + It receives the ``project_metadata`` as well as + all command line arguments that were used, including the command + and subcommand themselves and if the operation was successful or not. + + Args: + project_metadata: The Kedro project's metadata. + command_args: The command line arguments that were used. + exit_code: Exit code raised by Click application after completion + """ + pass diff --git a/kedro/framework/cli/jupyter.py b/kedro/framework/cli/jupyter.py index 367b1fbe07..e7cfbc166e 100644 --- a/kedro/framework/cli/jupyter.py +++ b/kedro/framework/cli/jupyter.py @@ -1,48 +1,20 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """A collection of helper functions to integrate with Jupyter/IPython and CLI commands for working with Kedro catalog. 
""" +from __future__ import annotations + import json import os -import re +import shutil import sys from collections import Counter from glob import iglob from pathlib import Path -from typing import Any, Dict, Iterable, List +from typing import Any from warnings import warn import click from click import secho -from jupyter_client.kernelspec import NATIVE_KERNEL_NAME, KernelSpecManager -from traitlets import Unicode from kedro.framework.cli.utils import ( KedroCliError, @@ -50,18 +22,11 @@ command_with_verbosity, env_option, forward_command, - ipython_message, - load_entry_points, python_call, ) +from kedro.framework.project import validate_settings from kedro.framework.startup import ProjectMetadata -JUPYTER_IP_HELP = "IP address of the Jupyter server." -JUPYTER_ALL_KERNELS_HELP = "Display all available Python kernels." -JUPYTER_IDLE_TIMEOUT_HELP = """When a notebook is closed, Jupyter server will -terminate its kernel after so many seconds of inactivity. This does not affect -any open notebooks.""" - CONVERT_ALL_HELP = """Extract the nodes from all notebooks in the Kedro project directory, including sub-folders.""" @@ -69,131 +34,162 @@ overwrite its contents.""" -def collect_line_magic(): - """Interface function for collecting line magic functions from plugin entry points.""" - return load_entry_points("line_magic") - +class JupyterCommandGroup(click.Group): + """A custom class for ordering the `kedro jupyter` command groups""" -class SingleKernelSpecManager(KernelSpecManager): - """A custom KernelSpec manager to be used by Kedro projects. - It limits the kernels to the default one only, - to make it less confusing for users, and gives it a sensible name. - """ - - default_kernel_name = Unicode( - "Kedro", config=True, help="Alternative name for the default kernel" - ) - whitelist = [NATIVE_KERNEL_NAME] + def list_commands(self, ctx): + """List commands according to a custom order""" + return ["setup", "notebook", "lab", "convert"] - def get_kernel_spec(self, kernel_name): - """ - This function will only be called by Jupyter to get a KernelSpec - for the default kernel. - We replace the name by something sensible here. - """ - kernelspec = super().get_kernel_spec(kernel_name) - if kernel_name == NATIVE_KERNEL_NAME: - kernelspec.display_name = self.default_kernel_name - - return kernelspec - - -def _update_ipython_dir(project_path: Path) -> None: - os.environ["IPYTHONDIR"] = str(project_path / ".ipython") - - -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def jupyter_cli(): # pragma: no cover pass -@jupyter_cli.group() +@jupyter_cli.group(cls=JupyterCommandGroup) def jupyter(): """Open Jupyter Notebook / Lab with project specific variables loaded, or convert notebooks into Kedro code. 
""" +@forward_command(jupyter, "setup", forward_help=True) +@click.pass_obj # this will pass the metadata as first argument +def setup(metadata: ProjectMetadata, args, **kwargs): # noqa: unused-argument + """Initialise the Jupyter Kernel for a kedro project.""" + _check_module_importable("ipykernel") + validate_settings() + + kernel_name = f"kedro_{metadata.package_name}" + kernel_path = _create_kernel(kernel_name, f"Kedro ({metadata.package_name})") + click.secho(f"\nThe kernel has been created successfully at {kernel_path}") + + @forward_command(jupyter, "notebook", forward_help=True) -@click.option( - "--ip", - "ip_address", - type=str, - default="127.0.0.1", - help="IP address of the Jupyter server.", -) -@click.option( - "--all-kernels", is_flag=True, default=False, help=JUPYTER_ALL_KERNELS_HELP -) -@click.option("--idle-timeout", type=int, default=30, help=JUPYTER_IDLE_TIMEOUT_HELP) @env_option @click.pass_obj # this will pass the metadata as first argument def jupyter_notebook( metadata: ProjectMetadata, - ip_address, - all_kernels, env, - idle_timeout, args, **kwargs, -): # pylint: disable=unused-argument,too-many-arguments +): # noqa: unused-argument """Open Jupyter Notebook with project specific variables loaded.""" - _check_module_importable("jupyter_core") - - if "-h" not in args and "--help" not in args: - ipython_message(all_kernels) - - _update_ipython_dir(metadata.project_path) - arguments = _build_jupyter_command( - "notebook", - ip_address=ip_address, - all_kernels=all_kernels, - args=args, - idle_timeout=idle_timeout, - project_name=metadata.project_name, - ) + _check_module_importable("notebook") + validate_settings() - python_call_kwargs = _build_jupyter_env(env) - python_call("jupyter", arguments, **python_call_kwargs) + kernel_name = f"kedro_{metadata.package_name}" + _create_kernel(kernel_name, f"Kedro ({metadata.package_name})") + + if env: + os.environ["KEDRO_ENV"] = env + + python_call( + "jupyter", + ["notebook", f"--MultiKernelManager.default_kernel_name={kernel_name}"] + + list(args), + ) @forward_command(jupyter, "lab", forward_help=True) -@click.option("--ip", "ip_address", type=str, default="127.0.0.1", help=JUPYTER_IP_HELP) -@click.option( - "--all-kernels", is_flag=True, default=False, help=JUPYTER_ALL_KERNELS_HELP -) -@click.option("--idle-timeout", type=int, default=30, help=JUPYTER_IDLE_TIMEOUT_HELP) @env_option @click.pass_obj # this will pass the metadata as first argument def jupyter_lab( metadata: ProjectMetadata, - ip_address, - all_kernels, env, - idle_timeout, args, **kwargs, -): # pylint: disable=unused-argument,too-many-arguments +): # noqa: unused-argument """Open Jupyter Lab with project specific variables loaded.""" - _check_module_importable("jupyter_core") - - if "-h" not in args and "--help" not in args: - ipython_message(all_kernels) - - _update_ipython_dir(metadata.project_path) - arguments = _build_jupyter_command( - "lab", - ip_address=ip_address, - all_kernels=all_kernels, - args=args, - idle_timeout=idle_timeout, - project_name=metadata.project_name, + _check_module_importable("jupyterlab") + validate_settings() + + kernel_name = f"kedro_{metadata.package_name}" + _create_kernel(kernel_name, f"Kedro ({metadata.package_name})") + + if env: + os.environ["KEDRO_ENV"] = env + + python_call( + "jupyter", + ["lab", f"--MultiKernelManager.default_kernel_name={kernel_name}"] + list(args), ) - python_call_kwargs = _build_jupyter_env(env) - python_call("jupyter", arguments, **python_call_kwargs) + +def _create_kernel(kernel_name: str, 
display_name: str) -> str: + """Creates an IPython kernel for the kedro project. If one with the same kernel_name + exists already it will be replaced. + + Installs the default IPython kernel (which points towards `sys.executable`) + and customises it to make the launch command load the kedro extension. + This is equivalent to the method recommended for creating a custom IPython kernel + on the CLI: https://ipython.readthedocs.io/en/stable/install/kernel_install.html. + + On linux this creates a directory ~/.local/share/jupyter/kernels/{kernel_name} + containing kernel.json, logo-32x32.png, logo-64x64.png and logo-svg.svg. An example kernel.json + looks as follows: + + { + "argv": [ + "/Users/antony_milne/miniconda3/envs/spaceflights/bin/python", + "-m", + "ipykernel_launcher", + "-f", + "{connection_file}", + "--ext", + "kedro.ipython" + ], + "display_name": "Kedro (spaceflights)", + "language": "python", + "metadata": { + "debugger": false + } + } + + Args: + kernel_name: Name of the kernel to create. + display_name: Kernel name as it is displayed in the UI. + + Returns: + String of the path of the created kernel. + + Raises: + KedroCliError: When kernel cannot be setup. + """ + # These packages are required by jupyter lab and notebook, which we have already + # checked are importable, so we don't run _check_module_importable on them. + # noqa: import-outside-toplevel + from ipykernel.kernelspec import install + + try: + # Install with user=True rather than system-wide to minimise footprint and + # ensure that we have permissions to write there. Under the hood this calls + # jupyter_client.KernelSpecManager.install_kernel_spec, which automatically + # removes an old kernel spec if it already exists. + kernel_path = install( + user=True, + kernel_name=kernel_name, + display_name=display_name, + ) + + kernel_json = Path(kernel_path) / "kernel.json" + kernel_spec = json.loads(kernel_json.read_text(encoding="utf-8")) + kernel_spec["argv"].extend(["--ext", "kedro.ipython"]) + # indent=1 is to match the default ipykernel style (see + # ipykernel.write_kernel_spec). + kernel_json.write_text(json.dumps(kernel_spec, indent=1), encoding="utf-8") + + kedro_ipython_dir = Path(__file__).parents[2] / "ipython" + shutil.copy(kedro_ipython_dir / "logo-32x32.png", kernel_path) + shutil.copy(kedro_ipython_dir / "logo-64x64.png", kernel_path) + shutil.copy(kedro_ipython_dir / "logo-svg.svg", kernel_path) + except Exception as exc: + raise KedroCliError( + f"Cannot setup kedro kernel for Jupyter.\nError: {exc}" + ) from exc + return kernel_path @command_with_verbosity(jupyter, "convert") @@ -209,7 +205,7 @@ def jupyter_lab( @click.pass_obj # this will pass the metadata as first argument def convert_notebook( metadata: ProjectMetadata, all_flag, overwrite_flag, filepath, env, **kwargs -): # pylint: disable=unused-argument, too-many-locals +): # noqa: unused-argument, too-many-locals """Convert selected or all notebooks found in a Kedro project to Kedro code, by exporting code from the appropriately-tagged cells: Cells tagged as `node` will be copied over to a Python file matching @@ -217,14 +213,19 @@ def convert_notebook( *Note*: Make sure your notebooks have unique names! FILEPATH: Path(s) to exact notebook file(s) to be converted. Both relative and absolute paths are accepted. - Should not be provided if --all flag is already present. + Should not be provided if --all flag is already present. 
(DEPRECATED) """ + + deprecation_message = ( + "DeprecationWarning: Command 'kedro jupyter convert' is deprecated and " + "will not be available from Kedro 0.19.0." + ) + click.secho(deprecation_message, fg="red") + project_path = metadata.project_path source_path = metadata.source_dir package_name = metadata.package_name - _update_ipython_dir(project_path) - if not filepath and not all_flag: secho( "Please specify a notebook filepath " @@ -270,46 +271,6 @@ def convert_notebook( secho("Done!", color="green") # type: ignore -def _build_jupyter_command( # pylint: disable=too-many-arguments - base: str, - ip_address: str, - all_kernels: bool, - args: Iterable[str], - idle_timeout: int, - project_name: str = "Kedro", -) -> List[str]: - cmd = [ - base, - "--ip", - ip_address, - f"--MappingKernelManager.cull_idle_timeout={idle_timeout}", - f"--MappingKernelManager.cull_interval={idle_timeout}", - ] - - if not all_kernels: - kernel_name = re.sub(r"[^\w]+", "", project_name).strip() or "Kedro" - - cmd += [ - "--NotebookApp.kernel_spec_manager_class=" - "kedro.framework.cli.jupyter.SingleKernelSpecManager", - f"--KernelSpecManager.default_kernel_name='{kernel_name}'", - ] - - return cmd + list(args) - - -def _build_jupyter_env(kedro_env: str) -> Dict[str, Any]: - """Build the environment dictionary that gets injected into the subprocess running - Jupyter. Since the subprocess has access only to the environment variables passed - in, we need to copy the current environment and add ``KEDRO_ENV``. - """ - if not kedro_env: - return {} - jupyter_env = os.environ.copy() - jupyter_env["KEDRO_ENV"] = kedro_env - return {"env": jupyter_env} - - def _export_nodes(filepath: Path, output_path: Path) -> None: """Copy code from Jupyter cells into nodes in src//nodes/, under filename with same name as notebook. 
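# Illustrative sketch (not part of the patch): how the kernel registered by the new
# `kedro jupyter setup` / `_create_kernel` flow can be inspected. The project package
# name `spaceflights` below is a hypothetical example; per the patch, the kernel is
# named `kedro_<package_name>` and its argv is extended with `--ext kedro.ipython`,
# which replaces the removed SingleKernelSpecManager/_build_jupyter_command logic.
from jupyter_client.kernelspec import KernelSpecManager

spec = KernelSpecManager().get_kernel_spec("kedro_spaceflights")  # hypothetical kernel name
print(spec.display_name)  # e.g. "Kedro (spaceflights)"
print(spec.argv)          # [..., "-m", "ipykernel_launcher", ..., "--ext", "kedro.ipython"]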
@@ -341,7 +302,7 @@ def _export_nodes(filepath: Path, output_path: Path) -> None: warn(f"Skipping notebook '{filepath}' - no nodes to export.") -def _append_source_code(cell: Dict[str, Any], path: Path) -> None: +def _append_source_code(cell: dict[str, Any], path: Path) -> None: source_code = "".join(cell["source"]).strip() + "\n" with path.open(mode="a") as file_: file_.write(source_code) diff --git a/kedro/framework/cli/micropkg.py b/kedro/framework/cli/micropkg.py new file mode 100644 index 0000000000..3e0a559cf1 --- /dev/null +++ b/kedro/framework/cli/micropkg.py @@ -0,0 +1,989 @@ +"""A collection of CLI commands for working with Kedro micro-packages.""" +# ruff: noqa: I001 # https://github.com/kedro-org/kedro/pull/2634 +from __future__ import annotations + +import re +import shutil +import sys +import tarfile +import tempfile +from importlib import import_module +from pathlib import Path +from typing import Any, Iterable, Iterator, List, Tuple, Union + +import click +from build.util import project_wheel_metadata +from packaging.requirements import InvalidRequirement, Requirement +from packaging.utils import canonicalize_name +from rope.base.project import Project +from rope.contrib import generate +from rope.refactor.move import MoveModule +from rope.refactor.rename import Rename +from setuptools.discovery import FlatLayoutPackageFinder + +from kedro.framework.cli.pipeline import ( + _assert_pkg_name_ok, + _check_pipeline_name, + _get_artifacts_to_package, + _sync_dirs, +) +from kedro.framework.cli.utils import ( + KedroCliError, + _clean_pycache, + call, + command_with_verbosity, + env_option, + python_call, +) +from kedro.framework.startup import ProjectMetadata + +_PYPROJECT_TOML_TEMPLATE = """ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "{name}" +version = "{version}" +description = "Micro-package `{name}`" +dependencies = {install_requires} + +[tool.setuptools.packages] +find = {{}} +""" + + +class _EquivalentRequirement(Requirement): + """Parse a requirement according to PEP 508. 
+ + This class overrides __eq__ to be backwards compatible with pkg_resources.Requirement + while making __str__ and __hash__ use the non-canonicalized name + as agreed in https://github.com/pypa/packaging/issues/644, + + Implementation taken from https://github.com/pypa/packaging/pull/696/ + """ + + def _iter_parts(self, name: str) -> Iterator[str]: + yield name + + if self.extras: + formatted_extras = ",".join(sorted(self.extras)) + yield f"[{formatted_extras}]" + + if self.specifier: + yield str(self.specifier) + + if self.url: + yield f"@ {self.url}" + if self.marker: + yield " " + + if self.marker: + yield f"; {self.marker}" + + def __str__(self) -> str: + return "".join(self._iter_parts(self.name)) + + def __hash__(self) -> int: + return hash( + ( + self.__class__.__name__, + *self._iter_parts(canonicalize_name(self.name)), + ) + ) + + def __eq__(self, other: Any) -> bool: + return ( + canonicalize_name(self.name) == canonicalize_name(other.name) + and self.extras == other.extras + and self.specifier == other.specifier + and self.url == other.url + and self.marker == other.marker + ) + + +def _check_module_path(ctx, param, value): # noqa: unused-argument + if value and not re.match(r"^[\w.]+$", value): + message = ( + "The micro-package location you provided is not a valid Python module path" + ) + raise KedroCliError(message) + return value + + +# noqa: missing-function-docstring +@click.group(name="Kedro") +def micropkg_cli(): # pragma: no cover + pass + + +@micropkg_cli.group() +def micropkg(): + """Commands for working with micro-packages.""" + + +@command_with_verbosity(micropkg, "pull") +@click.argument("package_path", nargs=1, required=False) +@click.option( + "--all", + "-a", + "all_flag", + is_flag=True, + help="Pull and unpack all micro-packages in the `pyproject.toml` package manifest section.", +) +@env_option( + help="Environment to install the micro-package configuration to. Defaults to `base`." +) +@click.option("--alias", type=str, default="", help="Rename the package.") +@click.option( + "-d", + "--destination", + type=click.Path(file_okay=False, dir_okay=False), + default=None, + help="Module location where to unpack under.", +) +@click.option( + "--fs-args", + type=click.Path( + exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True + ), + default=None, + help="Location of a configuration file for the fsspec filesystem used to pull the package.", +) +@click.pass_obj # this will pass the metadata as first argument +def pull_package( # noqa: unused-argument, too-many-arguments + metadata: ProjectMetadata, + package_path, + env, + alias, + destination, + fs_args, + all_flag, + **kwargs, +) -> None: + """Pull and unpack a modular pipeline and other micro-packages in your project.""" + if not package_path and not all_flag: + click.secho( + "Please specify a package path or add '--all' to pull all micro-packages in the " + "'pyproject.toml' package manifest section." + ) + sys.exit(1) + + if all_flag: + _pull_packages_from_manifest(metadata) + return + + _pull_package( + package_path, + metadata, + env=env, + alias=alias, + destination=destination, + fs_args=fs_args, + ) + as_alias = f" as '{alias}'" if alias else "" + message = f"Micro-package {package_path} pulled and unpacked{as_alias}!" 
+ click.secho(message, fg="green") + + +def _pull_package( # noqa: too-many-arguments + package_path: str, + metadata: ProjectMetadata, + env: str = None, + alias: str = None, + destination: str = None, + fs_args: str = None, +): + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir).resolve() + _unpack_sdist(package_path, temp_dir_path, fs_args) + + # temp_dir_path is the parent directory of the project root dir + contents = [member for member in temp_dir_path.iterdir() if member.is_dir()] + if len(contents) != 1: + raise KedroCliError( + "Invalid sdist was extracted: exactly one directory was expected, " + f"got {contents}" + ) + project_root_dir = contents[0] + + # This is much slower than parsing the requirements + # directly from the metadata files + # because it installs the package in an isolated environment, + # but it's the only reliable way of doing it + # without making assumptions on the project metadata. + library_meta = project_wheel_metadata(project_root_dir) + + # Project name will be `my-pipeline` even if `pyproject.toml` says `my_pipeline` + # because standards mandate normalization of names for comparison, + # see https://packaging.python.org/en/latest/specifications/core-metadata/#name + # The proper way to get it would be + # project_name = library_meta.get("Name") + # However, the rest of the code expects the non-normalized package name, + # so we have to find it. + packages = [ + package + for package in FlatLayoutPackageFinder().find(project_root_dir) + if "." not in package + ] + if len(packages) != 1: + # Should not happen if user is calling `micropkg pull` + # with the result of a `micropkg package`, + # and in general if the distribution only contains one package (most likely), + # but helps give a sensible error message otherwise + raise KedroCliError( + "Invalid package contents: exactly one package was expected, " + f"got {packages}" + ) + package_name = packages[0] + + package_reqs = _get_all_library_reqs(library_meta) + + if package_reqs: + requirements_txt = metadata.source_dir / "requirements.txt" + _append_package_reqs(requirements_txt, package_reqs, package_name) + + _clean_pycache(temp_dir_path) + _install_files( + metadata, + package_name, + project_root_dir, + env, + alias, + destination, + ) + + +def _pull_packages_from_manifest(metadata: ProjectMetadata) -> None: + # noqa: import-outside-toplevel + import anyconfig # for performance reasons + + config_dict = anyconfig.load(metadata.config_file) + config_dict = config_dict["tool"]["kedro"] + build_specs = config_dict.get("micropkg", {}).get("pull") + + if not build_specs: + click.secho( + "Nothing to pull. Please update the 'pyproject.toml' package manifest section.", + fg="yellow", + ) + return + + for package_path, specs in build_specs.items(): + if "alias" in specs: + _assert_pkg_name_ok(specs["alias"].split(".")[-1]) + _pull_package(package_path, metadata, **specs) + click.secho(f"Pulled and unpacked '{package_path}'!") + + click.secho("Micro-packages pulled and unpacked!", fg="green") + + +def _package_micropkgs_from_manifest(metadata: ProjectMetadata) -> None: + # noqa: import-outside-toplevel + import anyconfig # for performance reasons + + config_dict = anyconfig.load(metadata.config_file) + config_dict = config_dict["tool"]["kedro"] + build_specs = config_dict.get("micropkg", {}).get("package") + + if not build_specs: + click.secho( + "Nothing to package. 
Please update the 'pyproject.toml' package manifest section.", + fg="yellow", + ) + return + + for package_name, specs in build_specs.items(): + if "alias" in specs: + _assert_pkg_name_ok(specs["alias"]) + _package_micropkg(package_name, metadata, **specs) + click.secho(f"Packaged '{package_name}' micro-package!") + + click.secho("Micro-packages packaged!", fg="green") + + +@command_with_verbosity(micropkg, "package") +@env_option( + help="Environment where the micro-package configuration lives. Defaults to `base`." +) +@click.option( + "--alias", + type=str, + default="", + callback=_check_pipeline_name, + help="Alternative name to package under.", +) +@click.option( + "-d", + "--destination", + type=click.Path(resolve_path=True, file_okay=False), + help="Location where to create the source distribution file. Defaults to `dist/`.", +) +@click.option( + "--all", + "-a", + "all_flag", + is_flag=True, + help="Package all micro-packages in the `pyproject.toml` package manifest section.", +) +@click.argument("module_path", nargs=1, required=False, callback=_check_module_path) +@click.pass_obj # this will pass the metadata as first argument +def package_micropkg( # noqa: too-many-arguments + metadata: ProjectMetadata, + module_path, + env, + alias, + destination, + all_flag, + **kwargs, +): + """Package up a modular pipeline or micro-package as a Python source distribution.""" + if not module_path and not all_flag: + click.secho( + "Please specify a micro-package name or add '--all' to package all micro-packages in " + "the 'pyproject.toml' package manifest section." + ) + sys.exit(1) + + if all_flag: + _package_micropkgs_from_manifest(metadata) + return + + result_path = _package_micropkg( + module_path, metadata, alias=alias, destination=destination, env=env + ) + + as_alias = f" as '{alias}'" if alias else "" + message = ( + f"'{metadata.package_name}.{module_path}' packaged{as_alias}! 
" + f"Location: {result_path}" + ) + click.secho(message, fg="green") + + +def _get_fsspec_filesystem(location: str, fs_args: str | None): + # noqa: import-outside-toplevel + import anyconfig + import fsspec + + from kedro.io.core import get_protocol_and_path + + protocol, _ = get_protocol_and_path(location) + fs_args_config = anyconfig.load(fs_args) if fs_args else {} + + try: + return fsspec.filesystem(protocol, **fs_args_config) + except Exception as exc: # noqa: broad-except + # Specified protocol is not supported by `fsspec` + # or requires extra dependencies + click.secho(str(exc), fg="red") + click.secho("Trying to use 'pip download'...", fg="red") + return None + + +def _is_within_directory(directory, target): + abs_directory = directory.resolve() + abs_target = target.resolve() + return abs_directory in abs_target.parents + + +def safe_extract(tar, path): + for member in tar.getmembers(): + member_path = path / member.name + if not _is_within_directory(path, member_path): + # noqa: broad-exception-raised + raise Exception("Failed to safely extract tar file.") + tar.extractall(path) # nosec B202 + + +def _unpack_sdist(location: str, destination: Path, fs_args: str | None) -> None: + filesystem = _get_fsspec_filesystem(location, fs_args) + + if location.endswith(".tar.gz") and filesystem and filesystem.exists(location): + with filesystem.open(location) as fs_file: + with tarfile.open(fileobj=fs_file, mode="r:gz") as tar_file: + safe_extract(tar_file, destination) + else: + python_call( + "pip", + [ + "download", + "--no-deps", + "--no-binary", # the micropackaging expects an sdist, + ":all:", # wheels are not supported + "--dest", + str(destination), + location, + ], + ) + sdist_file = list(destination.glob("*.tar.gz")) + # `--no-deps --no-binary :all:` should fetch only one source distribution file, + # and CLI should fail if that's not the case. + if len(sdist_file) != 1: + file_names = [sf.name for sf in sdist_file] + raise KedroCliError( + f"More than 1 or no sdist files found: {file_names}. " + f"There has to be exactly one source distribution file." 
+            )
+        with tarfile.open(sdist_file[0], "r:gz") as fs_file:
+            safe_extract(fs_file, destination)
+
+
+def _rename_files(conf_source: Path, old_name: str, new_name: str):
+    config_files_to_rename = (
+        each
+        for each in conf_source.rglob("*")
+        if each.is_file() and old_name in each.name
+    )
+    for config_file in config_files_to_rename:
+        new_config_name = config_file.name.replace(old_name, new_name)
+        config_file.rename(config_file.parent / new_config_name)
+
+
+def _refactor_code_for_unpacking(  # noqa: too-many-arguments
+    project: Project,
+    package_path: Path,
+    tests_path: Path,
+    alias: str | None,
+    destination: str | None,
+    project_metadata: ProjectMetadata,
+) -> tuple[Path, Path]:
+    """This is the reverse operation of `_refactor_code_for_package`, i.e
+    we go from:
+    <temp_dir>  # also the root of the Rope project
+    |__ <micro_package>  # or <alias>
+        |__ __init__.py
+    |__ tests  # only tests for <micro_package>
+        |__ __init__.py
+        |__ tests.py
+
+    to:
+    <temp_dir>  # also the root of the Rope project
+    |__ <project_package>
+        |__ __init__.py
+        |__ <path_to_micro_package>
+            |__ __init__.py
+            |__ <micro_package>
+                |__ __init__.py
+    |__ tests
+        |__ __init__.py
+        |__ <path_to_micro_package>
+            |__ __init__.py
+            |__ <micro_package>
+                |__ __init__.py
+    """
+
+    def _move_package_with_conflicting_name(
+        target: Path, original_name: str, desired_name: str = None
+    ) -> Path:
+        _rename_package(project, original_name, "tmp_name")
+        full_path = _create_nested_package(project, target)
+        _move_package(project, "tmp_name", target.as_posix())
+        desired_name = desired_name or original_name
+        _rename_package(project, (target / "tmp_name").as_posix(), desired_name)
+        return full_path
+
+    package_name = package_path.stem
+    package_target = Path(project_metadata.package_name)
+    tests_target = Path("tests")
+
+    if destination:
+        destination_path = Path(destination)
+        package_target = package_target / destination_path
+        tests_target = tests_target / destination_path
+
+    if alias and alias != package_name:
+        _rename_package(project, package_name, alias)
+        package_name = alias
+
+    if package_name == project_metadata.package_name:
+        full_path = _move_package_with_conflicting_name(package_target, package_name)
+    else:
+        full_path = _create_nested_package(project, package_target)
+        _move_package(project, package_name, package_target.as_posix())
+
+    refactored_package_path = full_path / package_name
+
+    if not tests_path.exists():
+        return refactored_package_path, tests_path
+
+    # we can't rename the tests package to <package_name>
+    # because it will conflict with existing top-level package;
+    # hence we give it a temp name, create the expected
+    # nested folder structure, move the contents there,
+    # then rename the temp name to <package_name>.
+ full_path = _move_package_with_conflicting_name( + tests_target, original_name="tests", desired_name=package_name + ) + + refactored_tests_path = full_path / package_name + + return refactored_package_path, refactored_tests_path + + +def _install_files( # noqa: too-many-arguments, too-many-locals + project_metadata: ProjectMetadata, + package_name: str, + source_path: Path, + env: str = None, + alias: str = None, + destination: str = None, +): + env = env or "base" + + package_source, test_source, conf_source = _get_package_artifacts( + source_path, package_name + ) + + if conf_source.is_dir() and alias: + _rename_files(conf_source, package_name, alias) + + module_path = alias or package_name + if destination: + module_path = f"{destination}.{module_path}" + + package_dest, test_dest, conf_dest = _get_artifacts_to_package( + project_metadata, module_path=module_path, env=env + ) + + if conf_source.is_dir(): + _sync_dirs(conf_source, conf_dest) + # `config` dir was packaged under `package_name` directory with + # `kedro micropkg package`. Since `config` was already synced, + # we don't want to copy it again when syncing the package, so we remove it. + shutil.rmtree(str(conf_source)) + + project = Project(source_path) + refactored_package_source, refactored_test_source = _refactor_code_for_unpacking( + project, package_source, test_source, alias, destination, project_metadata + ) + project.close() + + if refactored_test_source.is_dir(): + _sync_dirs(refactored_test_source, test_dest) + + # Sync everything under package directory, except `config` + # since it has already been copied. + if refactored_package_source.is_dir(): + _sync_dirs(refactored_package_source, package_dest) + + +def _find_config_files( + source_config_dir: Path, glob_patterns: list[str] +) -> list[tuple[Path, str]]: + config_files: list[tuple[Path, str]] = [] + + if source_config_dir.is_dir(): + config_files = [ + (path, path.parent.relative_to(source_config_dir).as_posix()) + for glob_pattern in glob_patterns + for path in source_config_dir.glob(glob_pattern) + if path.is_file() + ] + + return config_files + + +def _get_default_version(metadata: ProjectMetadata, micropkg_module_path: str) -> str: + # default to micropkg package version + try: + micropkg_module = import_module( + f"{metadata.package_name}.{micropkg_module_path}" + ) + return micropkg_module.__version__ # type: ignore + except (AttributeError, ModuleNotFoundError): + # if micropkg version doesn't exist, take the project one + project_module = import_module(f"{metadata.package_name}") + return project_module.__version__ # type: ignore + + +def _package_micropkg( + micropkg_module_path: str, + metadata: ProjectMetadata, + alias: str = None, + destination: str = None, + env: str = None, +) -> Path: + micropkg_name = micropkg_module_path.split(".")[-1] + package_dir = metadata.source_dir / metadata.package_name + env = env or "base" + + package_source, package_tests, package_conf = _get_artifacts_to_package( + metadata, module_path=micropkg_module_path, env=env + ) + # as the source distribution will only contain parameters, we aren't listing other + # config files not to confuse users and avoid useless file copies + configs_to_package = _find_config_files( + package_conf, + [f"parameters*/**/{micropkg_name}.yml", f"parameters*/**/{micropkg_name}/**/*"], + ) + + source_paths = (package_source, package_tests, configs_to_package) + + # Check that micropkg directory exists and not empty + _validate_dir(package_source) + + destination = Path(destination) if 
destination else metadata.project_path / "dist" + version = _get_default_version(metadata, micropkg_module_path) + + _generate_sdist_file( + micropkg_name=micropkg_name, + destination=destination.resolve(), + source_paths=source_paths, + version=version, + metadata=metadata, + alias=alias, + ) + + _clean_pycache(package_dir) + _clean_pycache(metadata.project_path) + + return destination + + +def _validate_dir(path: Path) -> None: + if not path.is_dir(): + raise KedroCliError(f"Directory '{path}' doesn't exist.") + if not list(path.iterdir()): + raise KedroCliError(f"'{path}' is an empty directory.") + + +def _get_sdist_name(name, version): + return f"{name}-{version}.tar.gz" + + +def _sync_path_list(source: list[tuple[Path, str]], target: Path) -> None: + for source_path, suffix in source: + target_with_suffix = (target / suffix).resolve() + _sync_dirs(source_path, target_with_suffix) + + +def _drop_comment(line): + # https://github.com/pypa/setuptools/blob/b545fc7/\ + # pkg_resources/_vendor/jaraco/text/__init__.py#L554-L566 + return line.partition(" #")[0] + + +def _make_install_requires(requirements_txt: Path) -> list[str]: + """Parses each line of requirements.txt into a version specifier valid to put in + install_requires. + Matches pkg_resources.parse_requirements""" + if not requirements_txt.exists(): + return [] + return [ + str(_EquivalentRequirement(_drop_comment(requirement_line))) + for requirement_line in requirements_txt.read_text().splitlines() + if requirement_line and not requirement_line.startswith("#") + ] + + +def _create_nested_package(project: Project, package_path: Path) -> Path: + # fails if parts of the path exists already + packages = package_path.parts + parent = generate.create_package(project, packages[0]) + nested_path = Path(project.address) / packages[0] + for package in packages[1:]: + parent = generate.create_package(project, package, sourcefolder=parent) + nested_path = nested_path / package + return nested_path + + +def _move_package(project: Project, source: str, target: str) -> None: + """ + Move a Python package, refactoring relevant imports along the way. + A target of empty string means moving to the root of the `project`. + + Args: + project: rope.base.Project holding the scope of the refactoring. + source: Name of the Python package to be moved. Can be a fully + qualified module path relative to the `project` root, e.g. + "package.pipelines.pipeline" or "package/pipelines/pipeline". + target: Destination of the Python package to be moved. Can be a fully + qualified module path relative to the `project` root, e.g. + "package.pipelines.pipeline" or "package/pipelines/pipeline". + """ + src_folder = project.get_module(source).get_resource() + target_folder = project.get_module(target).get_resource() + change = MoveModule(project, src_folder).get_changes(dest=target_folder) + project.do(change) + + +def _rename_package(project: Project, old_name: str, new_name: str) -> None: + """ + Rename a Python package, refactoring relevant imports along the way, + as well as references in comments. + + Args: + project: rope.base.Project holding the scope of the refactoring. + old_name: Old module name. Can be a fully qualified module path, + e.g. "package.pipelines.pipeline" or "package/pipelines/pipeline", + relative to the `project` root. + new_name: New module name. Can't be a fully qualified module path. 
+ """ + folder = project.get_folder(old_name) + change = Rename(project, folder).get_changes(new_name, docs=True) + project.do(change) + + +def _refactor_code_for_package( + project: Project, + package_path: Path, + tests_path: Path, + alias: str | None, + project_metadata: ProjectMetadata, +) -> None: + """In order to refactor the imports properly, we need to recreate + the same nested structure as in the project. Therefore, we create: + # also the root of the Rope project + |__ + |__ __init__.py + |__ + |__ __init__.py + |__ + |__ __init__.py + |__ tests + |__ __init__.py + |__ path_to_micro_package + |__ __init__.py + |__ + |__ __init__.py + We then move outside of package src to top level ("") + in temp_dir, and rename folder & imports if alias provided. + + For tests, we need to extract all the contents of + at into top-level `tests` folder. This is not possible in one go with + the Rope API, so we have to do it in a bit of a hacky way. + We rename to a `tmp_name` and move it at top-level ("") + in temp_dir. We remove the old `tests` folder and rename `tmp_name` to `tests`. + + The final structure should be: + # also the root of the Rope project + |__ # or + |__ __init__.py + |__ tests # only tests for + |__ __init__.py + |__ test.py + """ + + def _move_package_with_conflicting_name(target: Path, conflicting_name: str): + tmp_name = "tmp_name" + tmp_module = target.parent / tmp_name + _rename_package(project, target.as_posix(), tmp_name) + _move_package(project, tmp_module.as_posix(), "") + shutil.rmtree(Path(project.address) / conflicting_name) + _rename_package(project, tmp_name, conflicting_name) + + # Copy source in appropriate folder structure + package_target = package_path.relative_to(project_metadata.source_dir) + full_path = _create_nested_package(project, package_target) + # overwrite=True to update the __init__.py files generated by create_package + _sync_dirs(package_path, full_path, overwrite=True) + + # Copy tests in appropriate folder structure + if tests_path.exists(): + tests_target = tests_path.relative_to(project_metadata.source_dir) + full_path = _create_nested_package(project, tests_target) + # overwrite=True to update the __init__.py files generated by create_package + _sync_dirs(tests_path, full_path, overwrite=True) + + # Refactor imports in src/package_name/.../micro_package + # and imports of `micro_package` in tests. + micro_package_name = package_target.stem + if micro_package_name == project_metadata.package_name: + _move_package_with_conflicting_name(package_target, micro_package_name) + else: + _move_package(project, package_target.as_posix(), "") + shutil.rmtree(Path(project.address) / project_metadata.package_name) + + if alias: + _rename_package(project, micro_package_name, alias) + + if tests_path.exists(): + # we can't move the relevant tests folder as is because + # it will conflict with the top-level package ; + # we can't rename it "tests" and move it, because it will conflict + # with the existing "tests" folder at top level; + # hence we give it a temp name, move it, delete tests/ and + # rename the temp name to tests. 
+ _move_package_with_conflicting_name(tests_target, "tests") + + +_SourcePathType = Union[Path, List[Tuple[Path, str]]] + + +def _generate_sdist_file( # noqa: too-many-arguments,too-many-locals + micropkg_name: str, + destination: Path, + source_paths: tuple[_SourcePathType, ...], + version: str, + metadata: ProjectMetadata, + alias: str = None, +) -> None: + package_name = alias or micropkg_name + package_source, tests_source, conf_source = source_paths + + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir).resolve() + + project = Project(temp_dir_path) # project where to do refactoring + _refactor_code_for_package( + project, package_source, tests_source, alias, metadata # type: ignore + ) + project.close() + + # Copy & "refactor" config + _, _, conf_target = _get_package_artifacts(temp_dir_path, package_name) + _sync_path_list(conf_source, conf_target) # type: ignore + if conf_target.is_dir() and alias: + _rename_files(conf_target, micropkg_name, alias) + + # Build a pyproject.toml on the fly + try: + install_requires = _make_install_requires( + package_source / "requirements.txt" # type: ignore + ) + except Exception as exc: + click.secho("FAILED", fg="red") + cls = exc.__class__ + raise KedroCliError(f"{cls.__module__}.{cls.__qualname__}: {exc}") from exc + + _generate_manifest_file(temp_dir_path) + _generate_pyproject_file(package_name, version, install_requires, temp_dir_path) + + package_file = destination / _get_sdist_name(name=package_name, version=version) + + if package_file.is_file(): + click.secho( + f"Package file {package_file} will be overwritten!", fg="yellow" + ) + + # python -m build --outdir + call( + [ + sys.executable, + "-m", + "build", + "--sdist", + "--outdir", + str(destination), + ], + cwd=temp_dir, + ) + + +def _generate_manifest_file(output_dir: Path): + manifest_file = output_dir / "MANIFEST.in" + manifest_file.write_text( + """ + global-include README.md + global-include config/parameters* + global-include config/**/parameters* + global-include config/parameters*/** + global-include config/parameters*/**/* + """ + ) + + +def _generate_pyproject_file( + package_name: str, version: str, install_requires: list[str], output_dir: Path +) -> Path: + pyproject_file = output_dir / "pyproject.toml" + + pyproject_file_context = { + "name": package_name, + "version": version, + "install_requires": install_requires, + } + + pyproject_file.write_text(_PYPROJECT_TOML_TEMPLATE.format(**pyproject_file_context)) + return pyproject_file + + +def _get_package_artifacts( + source_path: Path, package_name: str +) -> tuple[Path, Path, Path]: + """From existing package, returns in order: + source_path, tests_path, config_path + """ + artifacts = ( + source_path / package_name, + source_path / "tests", + # package_data (non-python files) needs to live inside one of the packages + source_path / package_name / "config", + ) + return artifacts + + +def _append_package_reqs( + requirements_txt: Path, package_reqs: list[str], package_name: str +) -> None: + """Appends micro-package requirements to project level requirements.txt""" + incoming_reqs = _safe_parse_requirements(package_reqs) + if requirements_txt.is_file(): + existing_reqs = _safe_parse_requirements(requirements_txt.read_text()) + reqs_to_add = set(incoming_reqs) - set(existing_reqs) + if not reqs_to_add: + return + + sorted_reqs = sorted(str(req) for req in reqs_to_add) + sep = "\n" + with open(requirements_txt, "a", encoding="utf-8") as file: + file.write( + f"\n\n# Additional requirements 
from micro-package `{package_name}`:\n" + ) + file.write(sep.join(sorted_reqs)) + click.secho( + f"Added the following requirements from micro-package '{package_name}' to " + f"requirements.txt:\n{sep.join(sorted_reqs)}" + ) + else: + click.secho( + "No project requirements.txt found. Copying contents from project requirements.txt..." + ) + sorted_reqs = sorted(str(req) for req in incoming_reqs) + sep = "\n" + with open(requirements_txt, "a", encoding="utf-8") as file: + file.write(sep.join(sorted_reqs)) + + click.secho( + "Use 'kedro build-reqs' to compile and 'pip install -r src/requirements.lock' to install " + "the updated list of requirements." + ) + + +def _get_all_library_reqs(metadata): + """Get all library requirements from metadata, leaving markers intact.""" + # See https://discuss.python.org/t/\ + # programmatically-getting-non-optional-requirements-of-current-directory/26963/2 + return [ + str(_EquivalentRequirement(dep_str)) + for dep_str in metadata.get_all("Requires-Dist", []) + ] + + +def _safe_parse_requirements( + requirements: str | Iterable[str], +) -> set[_EquivalentRequirement]: + """Safely parse a requirement or set of requirements. This avoids blowing up when it + encounters a requirement it cannot parse (e.g. `-r requirements.txt`). This way + we can still extract all the parseable requirements out of a set containing some + unparseable requirements. + """ + parseable_requirements = set() + if isinstance(requirements, str): + requirements = requirements.splitlines() + # TODO: Properly handle continuation lines, + # see https://github.com/pypa/setuptools/blob/v67.8.0/setuptools/_reqs.py + for requirement_line in requirements: + if ( + requirement_line + and not requirement_line.startswith("#") + and not requirement_line.startswith("-e") + ): + try: + parseable_requirements.add( + _EquivalentRequirement(_drop_comment(requirement_line)) + ) + except InvalidRequirement: + continue + return parseable_requirements diff --git a/kedro/framework/cli/pipeline.py b/kedro/framework/cli/pipeline.py index b865b64c43..09e3c6986c 100644 --- a/kedro/framework/cli/pipeline.py +++ b/kedro/framework/cli/pipeline.py @@ -1,57 +1,21 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
-# -# See the License for the specific language governing permissions and -# limitations under the License. - """A collection of CLI commands for working with Kedro pipelines.""" -import json +from __future__ import annotations + import re import shutil -import sys -import tempfile -from importlib import import_module from pathlib import Path from textwrap import indent -from typing import Any, List, NamedTuple, Optional, Tuple, Union -from zipfile import ZipFile +from typing import NamedTuple import click -import pkg_resources -from setuptools.dist import Distribution import kedro from kedro.framework.cli.utils import ( KedroCliError, _clean_pycache, _filter_deprecation_warnings, - _get_requirements_in, - call, command_with_verbosity, env_option, - python_call, ) from kedro.framework.project import settings from kedro.framework.startup import ProjectMetadata @@ -65,15 +29,17 @@ description="Modular pipeline `{name}`", packages=find_packages(), include_package_data=True, - package_data={package_data}, install_requires={install_requires}, ) """ -PipelineArtifacts = NamedTuple( - "PipelineArtifacts", - [("pipeline_dir", Path), ("pipeline_tests", Path), ("pipeline_conf", Path)], -) + +class PipelineArtifacts(NamedTuple): + """An ordered collection of source_path, tests_path, config_paths""" + + pipeline_dir: Path + pipeline_tests: Path + pipeline_conf: Path def _assert_pkg_name_ok(pkg_name: str): @@ -86,11 +52,11 @@ def _assert_pkg_name_ok(pkg_name: str): KedroCliError: If package name violates the requirements. """ - base_message = f"`{pkg_name}` is not a valid Python package name." + base_message = f"'{pkg_name}' is not a valid Python package name." if not re.match(r"^[a-zA-Z_]", pkg_name): message = base_message + " It must start with a letter or underscore." raise KedroCliError(message) - if len(pkg_name) < 2: + if len(pkg_name) < 2: # noqa: PLR2004 message = base_message + " It must be at least 2 characters long." raise KedroCliError(message) if not re.match(r"^\w+$", pkg_name[1:]): @@ -100,13 +66,13 @@ def _assert_pkg_name_ok(pkg_name: str): raise KedroCliError(message) -def _check_pipeline_name(ctx, param, value): # pylint: disable=unused-argument +def _check_pipeline_name(ctx, param, value): # noqa: unused-argument if value: _assert_pkg_name_ok(value) return value -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def pipeline_cli(): # pragma: no cover pass @@ -128,7 +94,7 @@ def pipeline(): @click.pass_obj # this will pass the metadata as first argument def create_pipeline( metadata: ProjectMetadata, name, skip_config, env, **kwargs -): # pylint: disable=unused-argument +): # noqa: unused-argument """Create a new modular pipeline by providing a name.""" package_dir = metadata.source_dir / metadata.package_name conf_source = settings.CONF_SOURCE @@ -137,26 +103,20 @@ def create_pipeline( env = env or "base" if not skip_config and not (project_conf_path / env).exists(): raise KedroCliError( - f"Unable to locate environment `{env}`. " + f"Unable to locate environment '{env}'. " f"Make sure it exists in the project configuration." 
) result_path = _create_pipeline(name, package_dir / "pipelines") _copy_pipeline_tests(name, result_path, package_dir) _copy_pipeline_configs(result_path, project_conf_path, skip_config, env=env) - click.secho(f"\nPipeline `{name}` was successfully created.\n", fg="green") - - click.secho( - f"To be able to run the pipeline `{name}`, you will need to add it " - f"to `register_pipelines()` in `{package_dir / 'pipeline_registry.py'}`.", - fg="yellow", - ) + click.secho(f"\nPipeline '{name}' was successfully created.\n", fg="green") @command_with_verbosity(pipeline, "delete") @click.argument("name", nargs=1, callback=_check_pipeline_name) @env_option( - help="Environment to delete pipeline configuration from. Defaults to `base`." + help="Environment to delete pipeline configuration from. Defaults to 'base'." ) @click.option( "-y", "--yes", is_flag=True, help="Confirm deletion of pipeline non-interactively." @@ -164,7 +124,7 @@ def create_pipeline( @click.pass_obj # this will pass the metadata as first argument def delete_pipeline( metadata: ProjectMetadata, name, env, yes, **kwargs -): # pylint: disable=unused-argument +): # noqa: unused-argument """Delete a modular pipeline by providing a name.""" package_dir = metadata.source_dir / metadata.package_name conf_source = settings.CONF_SOURCE @@ -173,7 +133,7 @@ def delete_pipeline( env = env or "base" if not (project_conf_path / env).exists(): raise KedroCliError( - f"Unable to locate environment `{env}`. " + f"Unable to locate environment '{env}'. " f"Make sure it exists in the project configuration." ) @@ -191,7 +151,7 @@ def delete_pipeline( ] if not files_to_delete and not dirs_to_delete: - raise KedroCliError(f"Pipeline `{name}` not found.") + raise KedroCliError(f"Pipeline '{name}' not found.") if not yes: _echo_deletion_warning( @@ -200,141 +160,22 @@ def delete_pipeline( files=files_to_delete, ) click.echo() - yes = click.confirm(f"Are you sure you want to delete pipeline `{name}`?") + yes = click.confirm(f"Are you sure you want to delete pipeline '{name}'?") click.echo() if not yes: raise KedroCliError("Deletion aborted!") _delete_artifacts(*files_to_delete, *dirs_to_delete) - click.secho(f"\nPipeline `{name}` was successfully deleted.", fg="green") + click.secho(f"\nPipeline '{name}' was successfully deleted.", fg="green") click.secho( - f"\nIf you added the pipeline `{name}` to `register_pipelines()` in " - f"`{package_dir / 'pipeline_registry.py'}`, you will need to remove it.", + f"\nIf you added the pipeline '{name}' to 'register_pipelines()' in" + f""" '{package_dir / "pipeline_registry.py"}', you will need to remove it.""", fg="yellow", ) -@command_with_verbosity(pipeline, "pull") -@click.argument("package_path", nargs=1) -@env_option( - help="Environment to install the pipeline configuration to. Defaults to `base`." 
-) -@click.option( - "--alias", - type=str, - default="", - callback=_check_pipeline_name, - help="Alternative name to unpackage under.", -) -@click.option( - "--fs-args", - type=click.Path( - exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True - ), - default=None, - help="Location of a configuration file for the fsspec filesystem used to pull the package.", -) -@click.pass_obj # this will pass the metadata as first argument -def pull_package( - metadata: ProjectMetadata, package_path, env, alias, fs_args, **kwargs -): # pylint:disable=unused-argument - """Pull and unpack a modular pipeline in your project.""" - - with tempfile.TemporaryDirectory() as temp_dir: - temp_dir_path = Path(temp_dir).resolve() - - _unpack_wheel(package_path, temp_dir_path, fs_args) - - dist_info_file = list(temp_dir_path.glob("*.dist-info")) - if len(dist_info_file) != 1: - raise KedroCliError( - f"More than 1 or no dist-info files found from {package_path}. " - f"There has to be exactly one dist-info directory." - ) - # Extract package name, based on the naming convention for wheel files - # https://www.python.org/dev/peps/pep-0427/#file-name-convention - package_name = dist_info_file[0].stem.split("-")[0] - package_metadata = dist_info_file[0] / "METADATA" - - _clean_pycache(temp_dir_path) - _install_files(metadata, package_name, temp_dir_path, env, alias) - - req_pattern = r"Requires-Dist: (.*?)\n" - package_reqs = re.findall(req_pattern, package_metadata.read_text()) - if package_reqs: - requirements_in = _get_requirements_in( - metadata.source_dir, create_empty=True - ) - _append_package_reqs(requirements_in, package_reqs, package_name) - - -def _package_pipelines_from_manifest(metadata: ProjectMetadata) -> None: - # pylint: disable=import-outside-toplevel - import anyconfig # for performance reasons - - config_dict = anyconfig.load(metadata.config_file) - config_dict = config_dict["tool"]["kedro"] - build_specs = config_dict.get("pipeline", {}).get("package") - - if not build_specs: - click.secho( - "Nothing to package. Please update your `pyproject.toml`.", fg="yellow" - ) - return - - for pipeline_name, specs in build_specs.items(): - _package_pipeline(pipeline_name, metadata, **specs) - click.secho(f"Packaged `{pipeline_name}` pipeline!") - - click.secho("Pipelines packaged!", fg="green") - - -@pipeline.command("package") -@env_option( - help="Environment where the pipeline configuration lives. Defaults to `base`." -) -@click.option( - "--alias", - type=str, - default="", - callback=_check_pipeline_name, - help="Alternative name to package under.", -) -@click.option( - "-d", - "--destination", - type=click.Path(resolve_path=True, file_okay=False), - help="Location where to create the wheel file. Defaults to `dist/`.", -) -@click.option("--all", "-a", "all_flag", is_flag=True) -@click.argument("name", nargs=1, required=False) -@click.pass_obj # this will pass the metadata as first argument -def package_pipeline( - metadata: ProjectMetadata, name, env, alias, destination, all_flag -): # pylint: disable=too-many-arguments - """Package up a modular pipeline as a Python .whl.""" - if not name and not all_flag: - click.secho( - "Please specify a pipeline name or add " - "'--all' to package all pipelines in `pyproject.toml`." 
- ) - sys.exit(1) - - if all_flag: - _package_pipelines_from_manifest(metadata) - return - - result_path = _package_pipeline( - name, metadata, alias=alias, destination=destination, env=env - ) - - as_alias = f" as `{alias}`" if alias else "" - message = f"Pipeline `{name}` packaged{as_alias}! Location: {result_path}" - click.secho(message, fg="green") - - -def _echo_deletion_warning(message: str, **paths: List[Path]): +def _echo_deletion_warning(message: str, **paths: list[Path]): paths = {key: values for key, values in paths.items() if values} if paths: @@ -346,290 +187,15 @@ def _echo_deletion_warning(message: str, **paths: List[Path]): click.echo(indent(paths_str, " " * 2)) -def _get_fsspec_filesystem(location: str, fs_args: Optional[str]): - # pylint: disable=import-outside-toplevel - import anyconfig - import fsspec - - from kedro.io.core import get_protocol_and_path - - protocol, _ = get_protocol_and_path(location) - fs_args_config = anyconfig.load(fs_args) if fs_args else {} - - try: - return fsspec.filesystem(protocol, **fs_args_config) - except Exception as exc: # pylint: disable=broad-except - # Specified protocol is not supported by `fsspec` - # or requires extra dependencies - click.secho(str(exc), fg="red") - click.secho("Trying to use 'pip download'...", fg="red") - return None - - -def _unpack_wheel(location: str, destination: Path, fs_args: Optional[str]) -> None: - filesystem = _get_fsspec_filesystem(location, fs_args) - - if location.endswith(".whl") and filesystem and filesystem.exists(location): - with filesystem.open(location) as fs_file: - # pylint: disable=consider-using-with - ZipFile(fs_file).extractall(destination) - else: - python_call( - "pip", ["download", "--no-deps", "--dest", str(destination), location] - ) - wheel_file = list(destination.glob("*.whl")) - # `--no-deps` should fetch only one wheel file, and CLI should fail if that's - # not the case. - if len(wheel_file) != 1: - file_names = [wf.name for wf in wheel_file] - raise KedroCliError( - f"More than 1 or no wheel files found: {file_names}. " - f"There has to be exactly one distribution file." - ) - # pylint: disable=consider-using-with - ZipFile(wheel_file[0]).extractall(destination) - - -def _rename_files(conf_source: Path, old_name: str, new_name: str): - config_files_to_rename = ( - each - for each in conf_source.rglob("*") - if each.is_file() and old_name in each.name - ) - for config_file in config_files_to_rename: - new_config_name = config_file.name.replace(old_name, new_name) - config_file.rename(config_file.parent / new_config_name) - - -def _install_files( - project_metadata: ProjectMetadata, - package_name: str, - source_path: Path, - env: str = None, - alias: str = None, -): - env = env or "base" - - package_source, test_source, conf_source = _get_package_artifacts( - source_path, package_name - ) - - if conf_source.is_dir() and alias: - _rename_files(conf_source, package_name, alias) - - pipeline_name = alias or package_name - package_dest, test_dest, conf_dest = _get_pipeline_artifacts( - project_metadata, pipeline_name=pipeline_name, env=env - ) - - if conf_source.is_dir(): - _sync_dirs(conf_source, conf_dest) - # `config` dir was packaged under `package_name` directory with - # `kedro pipeline package`. Since `config` was already synced, - # we don't want to copy it again when syncing the package, so we remove it. 
- shutil.rmtree(str(conf_source)) - - if test_source.is_dir(): - _sync_dirs(test_source, test_dest) - - # Sync everything under package directory, except `config` - # since it has already been copied. - if package_source.is_dir(): - _sync_dirs(package_source, package_dest) - - -def _find_config_files( - source_config_dir: Path, glob_patterns: List[str] -) -> List[Tuple[Path, str]]: - config_files = [] # type: List[Tuple[Path, str]] - - if source_config_dir.is_dir(): - config_files = [ - (path, path.parent.relative_to(source_config_dir).as_posix()) - for glob_pattern in glob_patterns - for path in source_config_dir.glob(glob_pattern) - if path.is_file() - ] - - return config_files - - -def _package_pipeline( - pipeline_name: str, - metadata: ProjectMetadata, - alias: str = None, - destination: str = None, - env: str = None, -) -> Path: - package_dir = metadata.source_dir / metadata.package_name - env = env or "base" - - artifacts_to_package = _get_pipeline_artifacts( - metadata, pipeline_name=pipeline_name, env=env - ) - # as the wheel file will only contain parameters, we aren't listing other - # config files not to confuse users and avoid useless file copies - configs_to_package = _find_config_files( - artifacts_to_package.pipeline_conf, - [f"parameters*/**/{pipeline_name}.yml", f"parameters*/**/{pipeline_name}/*"], - ) - - source_paths = ( - artifacts_to_package.pipeline_dir, - artifacts_to_package.pipeline_tests, - configs_to_package, - ) - - # Check that pipeline directory exists and not empty - _validate_dir(artifacts_to_package.pipeline_dir) - destination = Path(destination) if destination else metadata.project_path / "dist" - - # default to pipeline package version - try: - pipeline_module = import_module( - f"{metadata.package_name}.pipelines.{pipeline_name}" - ) - version = pipeline_module.__version__ # type: ignore - except (AttributeError, ModuleNotFoundError): - # if pipeline version doesn't exist, take the project one - project_module = import_module(f"{metadata.package_name}") - version = project_module.__version__ # type: ignore - - _generate_wheel_file( - pipeline_name, destination, source_paths, version, alias=alias # type: ignore - ) - - _clean_pycache(package_dir) - _clean_pycache(metadata.project_path) - - return destination - - -def _validate_dir(path: Path) -> None: - if not path.is_dir(): - raise KedroCliError(f"Directory '{path}' doesn't exist.") - if not list(path.iterdir()): - raise KedroCliError(f"'{path}' is an empty directory.") - - -def _get_wheel_name(**kwargs: Any) -> str: - # https://stackoverflow.com/q/51939257/3364156 - dist = Distribution(attrs=kwargs) - bdist_wheel_cmd = dist.get_command_obj("bdist_wheel") - bdist_wheel_cmd.ensure_finalized() - - distname = bdist_wheel_cmd.wheel_dist_name - tag = "-".join(bdist_wheel_cmd.get_tag()) - return f"{distname}-{tag}.whl" - - -def _sync_path_list(source: List[Tuple[Path, str]], target: Path) -> None: - for source_path, suffix in source: - target_with_suffix = (target / suffix).resolve() - _sync_dirs(source_path, target_with_suffix) - - -def _make_install_requires(requirements_txt: Path) -> List[str]: - """Parses each line of requirements.txt into a version specifier valid to put in - install_requires.""" - if not requirements_txt.exists(): - return [] - requirements = pkg_resources.parse_requirements(requirements_txt.read_text()) - return [str(requirement) for requirement in requirements] - - -_SourcePathType = Union[Path, List[Tuple[Path, str]]] - - -# pylint: disable=too-many-locals -def _generate_wheel_file( 
- pipeline_name: str, - destination: Path, - source_paths: Tuple[_SourcePathType, ...], - version: str, - alias: str = None, -) -> None: - package_name = alias or pipeline_name - - with tempfile.TemporaryDirectory() as temp_dir: - temp_dir_path = Path(temp_dir).resolve() - - # Copy source folders - target_paths = _get_package_artifacts(temp_dir_path, package_name) - source_target, _, conf_target = target_paths - for source, target in zip(source_paths, target_paths): - sync_func = _sync_path_list if isinstance(source, list) else _sync_dirs - sync_func(source, target) # type: ignore - - if conf_target.is_dir() and alias: - _rename_files(conf_target, pipeline_name, alias) - - # Build a setup.py on the fly - try: - install_requires = _make_install_requires( - source_target / "requirements.txt" - ) - except Exception as exc: - click.secho("FAILED", fg="red") - cls = exc.__class__ - raise KedroCliError(f"{cls.__module__}.{cls.__qualname__}: {exc}") from exc - - setup_file = _generate_setup_file( - package_name, version, install_requires, temp_dir_path - ) - - package_file = destination / _get_wheel_name(name=package_name, version=version) - if package_file.is_file(): - click.secho( - f"Package file {package_file} will be overwritten!", fg="yellow" - ) - - # python setup.py bdist_wheel --dist-dir - call( - [ - sys.executable, - str(setup_file.resolve()), - "bdist_wheel", - "--dist-dir", - str(destination), - ], - cwd=temp_dir, - ) - - -def _generate_setup_file( - package_name: str, version: str, install_requires: List[str], output_dir: Path -) -> Path: - setup_file = output_dir / "setup.py" - package_data = { - package_name: [ - "README.md", - "config/parameters*", - "config/**/parameters*", - "config/parameters*/**", - "config/parameters*/**/*", - ] - } - setup_file_context = dict( - name=package_name, - version=version, - package_data=json.dumps(package_data), - install_requires=install_requires, - ) - - setup_file.write_text(_SETUP_PY_TEMPLATE.format(**setup_file_context)) - return setup_file - - def _create_pipeline(name: str, output_dir: Path) -> Path: with _filter_deprecation_warnings(): - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.main import cookiecutter template_path = Path(kedro.__file__).parent / "templates" / "pipeline" cookie_context = {"pipeline_name": name, "kedro_version": kedro.__version__} - click.echo(f"Creating the pipeline `{name}`: ", nl=False) + click.echo(f"Creating the pipeline '{name}': ", nl=False) try: result_path = cookiecutter( @@ -645,7 +211,7 @@ def _create_pipeline(name: str, output_dir: Path) -> Path: click.secho("OK", fg="green") result_path = Path(result_path) - message = indent(f"Location: `{result_path.resolve()}`", " " * 2) + message = indent(f"Location: '{result_path.resolve()}'", " " * 2) click.secho(message, bold=True) _clean_pycache(result_path) @@ -653,12 +219,12 @@ def _create_pipeline(name: str, output_dir: Path) -> Path: return result_path -# pylint: disable=missing-raises-doc -def _sync_dirs(source: Path, target: Path, prefix: str = ""): +def _sync_dirs(source: Path, target: Path, prefix: str = "", overwrite: bool = False): """Recursively copies `source` directory (or file) into `target` directory without overwriting any existing files/directories in the target using the following rules: - 1) Skip any files/directories which names match with files in target. + 1) Skip any files/directories which names match with files in target, + unless overwrite=True. 2) Copy all files from source to target. 
3) Recursively copy all directories from source to target. @@ -677,15 +243,17 @@ def _sync_dirs(source: Path, target: Path, prefix: str = ""): elif source.is_file(): content = [source] else: - content = [] # nothing to copy + # nothing to copy + content = [] # pragma: no cover for source_path in content: source_name = source_path.name target_path = target / source_name - click.echo(indent(f"Creating `{target_path}`: ", prefix), nl=False) + click.echo(indent(f"Creating '{target_path}': ", prefix), nl=False) if ( # rule #1 - source_name in existing_files + not overwrite + and source_name in existing_files or source_path.is_file() and source_name in existing_folders ): @@ -707,29 +275,22 @@ def _sync_dirs(source: Path, target: Path, prefix: str = ""): def _get_pipeline_artifacts( project_metadata: ProjectMetadata, pipeline_name: str, env: str ) -> PipelineArtifacts: - """From existing project, returns in order: source_path, tests_path, config_paths""" - package_dir = project_metadata.source_dir / project_metadata.package_name - conf_source = settings.CONF_SOURCE - project_conf_path = project_metadata.project_path / conf_source - artifacts = PipelineArtifacts( - package_dir / "pipelines" / pipeline_name, - package_dir.parent / "tests" / "pipelines" / pipeline_name, - project_conf_path / env, + artifacts = _get_artifacts_to_package( + project_metadata, f"pipelines.{pipeline_name}", env ) - return artifacts + return PipelineArtifacts(*artifacts) -def _get_package_artifacts( - source_path: Path, package_name: str -) -> Tuple[Path, Path, Path]: - """From existing unpacked wheel, returns in order: - source_path, tests_path, config_path - """ +def _get_artifacts_to_package( + project_metadata: ProjectMetadata, module_path: str, env: str +) -> tuple[Path, Path, Path]: + """From existing project, returns in order: source_path, tests_path, config_paths""" + package_dir = project_metadata.source_dir / project_metadata.package_name + project_conf_path = project_metadata.project_path / settings.CONF_SOURCE artifacts = ( - source_path / package_name, - source_path / "tests", - # package_data (non-python files) needs to live inside one of the packages - source_path / package_name / "config", + Path(package_dir, *module_path.split(".")), + Path(package_dir.parent, "tests", *module_path.split(".")), + project_conf_path / env, ) return artifacts @@ -757,7 +318,7 @@ def _copy_pipeline_configs( def _delete_artifacts(*artifacts: Path): for artifact in artifacts: - click.echo(f"Deleting `{artifact}`: ", nl=False) + click.echo(f"Deleting '{artifact}': ", nl=False) try: if artifact.is_dir(): shutil.rmtree(artifact) @@ -767,31 +328,4 @@ def _delete_artifacts(*artifacts: Path): click.secho("FAILED", fg="red") cls = exc.__class__ raise KedroCliError(f"{cls.__module__}.{cls.__qualname__}: {exc}") from exc - else: - click.secho("OK", fg="green") - - -def _append_package_reqs( - requirements_in: Path, package_reqs: List[str], pipeline_name: str -) -> None: - """Appends modular pipeline requirements to project level requirements.in""" - existing_reqs = pkg_resources.parse_requirements(requirements_in.read_text()) - new_reqs = pkg_resources.parse_requirements(package_reqs) - reqs_to_add = set(new_reqs) - set(existing_reqs) - if not reqs_to_add: - return - - sorted_reqs = sorted(str(req) for req in reqs_to_add) - with open(requirements_in, "a") as file: - file.write( - f"\n\n# Additional requirements from modular pipeline `{pipeline_name}`:\n" - ) - file.write("\n".join(sorted_reqs)) - click.secho( - "Added the following 
requirements from modular pipeline `{}` to " - "requirements.in:\n{}".format(pipeline_name, "\n".join(sorted_reqs)) - ) - click.secho( - "Use `kedro install --build-reqs` to compile and install the updated list of " - "requirements." - ) + click.secho("OK", fg="green") diff --git a/kedro/framework/cli/project.py b/kedro/framework/cli/project.py index 8d57ee5d86..f3cf141dfa 100644 --- a/kedro/framework/cli/project.py +++ b/kedro/framework/cli/project.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """A collection of CLI commands for working with Kedro project.""" import os @@ -34,33 +6,33 @@ import sys import webbrowser from pathlib import Path -from typing import Sequence import click -from click import secho from kedro.framework.cli.utils import ( KedroCliError, _check_module_importable, _config_file_callback, - _get_requirements_in, + _deprecate_options, _get_values_as_tuple, _reformat_load_versions, + _split_load_versions, _split_params, call, command_with_verbosity, env_option, forward_command, - ipython_message, python_call, + split_node_names, split_string, ) +from kedro.framework.project import settings from kedro.framework.session import KedroSession from kedro.framework.startup import ProjectMetadata from kedro.utils import load_obj NO_DEPENDENCY_MESSAGE = """{module} is not installed. Please make sure {module} is in -{src}/requirements.txt and run `kedro install`.""" +{src}/requirements.txt and run 'pip install -r src/requirements.txt'.""" LINT_CHECK_ONLY_HELP = """Check the files for style guide violations, unsorted / unformatted imports, and unblackened Python code without modifying the files.""" OPEN_ARG_HELP = """Open the documentation in your default browser after building.""" @@ -72,11 +44,7 @@ TO_NODES_HELP = """A list of node names which should be used as an end point.""" NODE_ARG_HELP = """Run only nodes with specified names.""" RUNNER_ARG_HELP = """Specify a runner that you want to run the pipeline with. -Available runners: `SequentialRunner`, `ParallelRunner` and `ThreadRunner`. 
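# Illustrative sketch (not part of the Kedro source in this diff): the RUNNER_ARG_HELP
# text being updated here lists Kedro's three built-in runners, and `kedro run`
# (further down in this diff) resolves the chosen name with `load_obj(name, "kedro.runner")`.
# A minimal equivalent, assuming Kedro is installed; "ThreadRunner" is just a sample value.
from importlib import import_module


def resolve_runner(name: str = "SequentialRunner"):
    """Return the runner class exported by kedro.runner under `name`."""
    return getattr(import_module("kedro.runner"), name)


if __name__ == "__main__":
    runner_cls = resolve_runner("ThreadRunner")
    print(runner_cls.__name__)  # ThreadRunner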
-This option cannot be used together with --parallel.""" -PARALLEL_ARG_HELP = """Run the pipeline using the `ParallelRunner`. -If not specified, use the `SequentialRunner`. This flag cannot be used together -with --runner.""" +Available runners: 'SequentialRunner', 'ParallelRunner' and 'ThreadRunner'.""" ASYNC_ARG_HELP = """Load and save node inputs and outputs asynchronously with threads. If not specified, load and save datasets synchronously.""" TAG_ARG_HELP = """Construct the pipeline using only nodes which have this tag @@ -86,27 +54,21 @@ CONFIG_FILE_HELP = """Specify a YAML configuration file to load the run command arguments from. If command line arguments are provided, they will override the loaded ones.""" -PIPELINE_ARG_HELP = """Name of the modular pipeline to run. -If not set, the project pipeline is run by default.""" +PIPELINE_ARG_HELP = """Name of the registered pipeline to run. +If not set, the '__default__' pipeline is run.""" +NAMESPACE_ARG_HELP = """Name of the node namespace to run.""" PARAMS_ARG_HELP = """Specify extra parameters that you want to pass -to the context initializer. Items must be separated by comma, keys - by colon, -example: param1:value1,param2:value2. Each parameter is split by the first comma, -so parameter values are allowed to contain colons, parameter keys are not.""" - - -def _build_reqs(source_path: Path, args: Sequence[str] = ()): - """Run `pip-compile requirements.in` command. - - Args: - source_path: Path to the project `src` folder. - args: Optional arguments for `pip-compile` call, e.g. `--generate-hashes`. - - """ - requirements_in = _get_requirements_in(source_path) - python_call("piptools", ["compile", "-q", *args, str(requirements_in)]) +to the context initialiser. Items must be separated by comma, keys - by colon or equals sign, +example: param1=value1,param2=value2. Each parameter is split by the first comma, +so parameter values are allowed to contain colons, parameter keys are not. +To pass a nested dictionary as parameter, separate keys by '.', example: +param_group.param1:value1.""" +INPUT_FILE_HELP = """Name of the requirements file to compile.""" +OUTPUT_FILE_HELP = """Name of the file where compiled requirements should be stored.""" +CONF_SOURCE_HELP = """Path of a directory where project configuration is stored.""" -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def project_group(): # pragma: no cover pass @@ -114,8 +76,15 @@ def project_group(): # pragma: no cover @forward_command(project_group, forward_help=True) @click.pass_obj # this will pass the metadata as first argument -def test(metadata: ProjectMetadata, args, **kwargs): # pylint: disable=unused-argument - """Run the test suite.""" +def test(metadata: ProjectMetadata, args, **kwargs): # noqa: unused-argument + """Run the test suite. (DEPRECATED)""" + deprecation_message = ( + "DeprecationWarning: Command 'kedro test' is deprecated and " + "will not be available from Kedro 0.19.0. " + "Use the command 'pytest' instead. 
" + ) + click.secho(deprecation_message, fg="red") + try: _check_module_importable("pytest") except KedroCliError as exc: @@ -123,8 +92,7 @@ def test(metadata: ProjectMetadata, args, **kwargs): # pylint: disable=unused-a raise KedroCliError( NO_DEPENDENCY_MESSAGE.format(module="pytest", src=str(source_path)) ) from exc - else: - python_call("pytest", args) + python_call("pytest", args) @command_with_verbosity(project_group) @@ -133,8 +101,14 @@ def test(metadata: ProjectMetadata, args, **kwargs): # pylint: disable=unused-a @click.pass_obj # this will pass the metadata as first argument def lint( metadata: ProjectMetadata, files, check_only, **kwargs -): # pylint: disable=unused-argument - """Run flake8, isort and black.""" +): # noqa: unused-argument + """Run flake8, isort and black. (DEPRECATED)""" + deprecation_message = ( + "DeprecationWarning: Command 'kedro lint' is deprecated and " + "will not be available from Kedro 0.19.0." + ) + click.secho(deprecation_message, fg="red") + source_path = metadata.source_dir package_name = metadata.package_name files = files or (str(source_path / "tests"), str(source_path / package_name)) @@ -154,106 +128,52 @@ def lint( python_call("black", ("--check",) + files if check_only else files) python_call("flake8", files) - - check_flag = ("-c",) if check_only else () - python_call("isort", (*check_flag, "-rc") + files) # type: ignore - - -@project_group.command() -@click.option( - "--build-reqs/--no-build-reqs", - "compile_flag", - default=None, - help="Run `pip-compile` on project requirements before install. " - "By default runs only if `src/requirements.in` file doesn't exist.", -) -@click.pass_obj # this will pass the metadata as first argument -def install(metadata: ProjectMetadata, compile_flag): - """Install project dependencies from both requirements.txt - and environment.yml (optional).""" - # we cannot use `context.project_path` as in other commands since - # context instantiation might break due to missing dependencies - # we attempt to install here - source_path = metadata.source_dir - environment_yml = source_path / "environment.yml" - requirements_in = source_path / "requirements.in" - requirements_txt = source_path / "requirements.txt" - - if environment_yml.is_file(): - call(["conda", "env", "update", "--file", str(environment_yml), "--prune"]) - - default_compile = bool(compile_flag is None and not requirements_in.is_file()) - do_compile = compile_flag or default_compile - if do_compile: - _build_reqs(source_path) - - pip_command = ["install", "-U", "-r", str(requirements_txt)] - - if os.name == "posix": - python_call("pip", pip_command) - else: - command = [sys.executable, "-m", "pip"] + pip_command - # To comply with mypy, `shell=True` should be passed instead of - # `creationflags=subprocess.CREATE_NEW_CONSOLE`. However, bandit finds security - # issues for subprocess calls with `shell=True`, so we ignore type instead. 
See: - # https://bandit.readthedocs.io/en/latest/plugins/b602_subprocess_popen_with_shell_equals_true.html - proc = subprocess.Popen( # pylint: disable=consider-using-with - command, - creationflags=subprocess.CREATE_NEW_CONSOLE, # type: ignore - stderr=subprocess.PIPE, - ) - _, errs = proc.communicate() - if errs: - secho(errs.decode(), fg="red") - raise click.exceptions.Exit(code=1) - secho("Requirements installed!", fg="green") + python_call("isort", ("--check",) + files if check_only else files) @forward_command(project_group, forward_help=True) @env_option @click.pass_obj # this will pass the metadata as first argument -def ipython( - metadata: ProjectMetadata, env, args, **kwargs -): # pylint: disable=unused-argument +def ipython(metadata: ProjectMetadata, env, args, **kwargs): # noqa: unused-argument """Open IPython with project specific variables loaded.""" _check_module_importable("IPython") - os.environ["IPYTHONDIR"] = str(metadata.project_path / ".ipython") if env: os.environ["KEDRO_ENV"] = env - if "-h" not in args and "--help" not in args: - ipython_message() - call(["ipython"] + list(args)) + call(["ipython", "--ext", "kedro.ipython"] + list(args)) @project_group.command() @click.pass_obj # this will pass the metadata as first argument def package(metadata: ProjectMetadata): - """Package the project as a Python egg and wheel.""" + """Package the project as a Python wheel.""" source_path = metadata.source_dir call( [ sys.executable, - "setup.py", - "clean", - "--all", - "bdist_egg", - "--dist-dir", + "-m", + "build", + "--wheel", + "--outdir", "../dist", ], cwd=str(source_path), ) + + directory = ( + str(Path(settings.CONF_SOURCE).parent) + if settings.CONF_SOURCE != "conf" + else metadata.project_path + ) call( [ - sys.executable, - "setup.py", - "clean", - "--all", - "bdist_wheel", - "--dist-dir", - "../dist", - ], - cwd=str(source_path), + "tar", + "--exclude=local/*.yml", + "-czf", + f"dist/conf-{metadata.package_name}.tar.gz", + f"--directory={directory}", + str(Path(settings.CONF_SOURCE).stem), + ] ) @@ -269,7 +189,13 @@ def package(metadata: ProjectMetadata): ) @click.pass_obj # this will pass the metadata as first argument def build_docs(metadata: ProjectMetadata, open_docs): - """Build the project documentation.""" + """Build the project documentation. (DEPRECATED)""" + deprecation_message = ( + "DeprecationWarning: Command 'kedro build-docs' is deprecated and " + "will not be available from Kedro 0.19.0." 
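# Illustrative sketch (not part of the Kedro source in this diff): the reworked
# `kedro package` shown above now builds a wheel with `python -m build` from src/ and
# archives the conf/ directory with tar, excluding local credential files. A rough
# stand-alone equivalent, assuming the `build` package is installed, a default project
# layout, and a placeholder package name "my_project":
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "build", "--wheel", "--outdir", "../dist"],
    cwd="src",
    check=True,
)
subprocess.run(
    [
        "tar",
        "--exclude=local/*.yml",
        "-czf",
        "dist/conf-my_project.tar.gz",
        "--directory=.",
        "conf",
    ],
    check=True,
)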
+ ) + click.secho(deprecation_message, fg="red") + source_path = metadata.source_dir package_name = metadata.package_name @@ -289,34 +215,80 @@ def build_docs(metadata: ProjectMetadata, open_docs): call(["sphinx-build", "-M", "html", "docs/source", "docs/build", "-a"]) if open_docs: docs_page = (Path.cwd() / "docs" / "build" / "html" / "index.html").as_uri() - secho(f"Opening {docs_page}") + click.secho(f"Opening {docs_page}") webbrowser.open(docs_page) @forward_command(project_group, name="build-reqs") +@click.option( + "--input-file", + "input_file", + type=click.Path(exists=True, dir_okay=False, resolve_path=True), + multiple=False, + help=INPUT_FILE_HELP, +) +@click.option( + "--output-file", + "output_file", + multiple=False, + help=OUTPUT_FILE_HELP, +) @click.pass_obj # this will pass the metadata as first argument def build_reqs( - metadata: ProjectMetadata, args, **kwargs -): # pylint: disable=unused-argument - """Build the project dependency requirements.""" + metadata: ProjectMetadata, input_file, output_file, args, **kwargs +): # noqa: unused-argument + """Run `pip-compile` on src/requirements.txt or the user defined input file and save + the compiled requirements to src/requirements.lock or the user defined output file. + (DEPRECATED) + """ + deprecation_message = ( + "DeprecationWarning: Command 'kedro build-reqs' is deprecated and " + "will not be available from Kedro 0.19.0." + ) + click.secho(deprecation_message, fg="red") + source_path = metadata.source_dir - _build_reqs(source_path, args) - secho( - "Requirements built! Please update requirements.in " + input_file = Path(input_file or source_path / "requirements.txt") + output_file = Path(output_file or source_path / "requirements.lock") + + if input_file.is_file(): + python_call( + "piptools", + [ + "compile", + *args, + str(input_file), + "--output-file", + str(output_file), + ], + ) + + else: + raise FileNotFoundError( + f"File '{input_file}' not found in the project. " + "Please specify another input or create the file and try again." + ) + + click.secho( + f"Requirements built! Please update {input_file.name} " "if you'd like to make a change in your project's dependencies, " - "and re-run build-reqs to generate the new requirements.txt.", + f"and re-run build-reqs to generate the new {output_file.name}.", fg="green", ) @command_with_verbosity(project_group, "activate-nbstripout") @click.pass_obj # this will pass the metadata as first argument -def activate_nbstripout( - metadata: ProjectMetadata, **kwargs -): # pylint: disable=unused-argument - """Install the nbstripout git hook to automatically clean notebooks.""" +def activate_nbstripout(metadata: ProjectMetadata, **kwargs): # noqa: unused-argument + """Install the nbstripout git hook to automatically clean notebooks. (DEPRECATED)""" + deprecation_message = ( + "DeprecationWarning: Command 'kedro activate-nbstripout' is deprecated and " + "will not be available from Kedro 0.19.0." + ) + click.secho(deprecation_message, fg="red") + source_path = metadata.source_dir - secho( + click.secho( ( "Notebook output cells will be automatically cleared before committing" " to git." @@ -332,13 +304,12 @@ def activate_nbstripout( ) from exc try: - res = subprocess.run( # pylint: disable=subprocess-run-check + res = subprocess.run( # noqa: subprocess-run-check ["git", "rev-parse", "--git-dir"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, ) if res.returncode: - raise KedroCliError("Not a git repository. 
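# Illustrative sketch (not part of the Kedro source in this diff): the reworked
# `kedro build-reqs` above compiles src/requirements.txt into src/requirements.lock
# with pip-tools (overridable via --input-file/--output-file). An equivalent direct
# call, assuming pip-tools is installed in the active environment:
import subprocess
import sys

subprocess.run(
    [
        sys.executable,
        "-m",
        "piptools",
        "compile",
        "src/requirements.txt",
        "--output-file",
        "src/requirements.lock",
    ],
    check=True,
)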
Run `git init` first.") + raise KedroCliError("Not a git repository. Run 'git init' first.") except FileNotFoundError as exc: raise KedroCliError("Git executable not found. Install Git first.") from exc @@ -347,25 +318,64 @@ def activate_nbstripout( @project_group.command() @click.option( - "--from-inputs", type=str, default="", help=FROM_INPUTS_HELP, callback=split_string + "--from-inputs", + type=str, + default="", + help=FROM_INPUTS_HELP, + callback=split_string, ) @click.option( - "--to-outputs", type=str, default="", help=TO_OUTPUTS_HELP, callback=split_string + "--to-outputs", + type=str, + default="", + help=TO_OUTPUTS_HELP, + callback=split_string, ) @click.option( - "--from-nodes", type=str, default="", help=FROM_NODES_HELP, callback=split_string + "--from-nodes", + type=str, + default="", + help=FROM_NODES_HELP, + callback=split_node_names, ) @click.option( - "--to-nodes", type=str, default="", help=TO_NODES_HELP, callback=split_string + "--to-nodes", type=str, default="", help=TO_NODES_HELP, callback=split_node_names +) +@click.option( + "--node", + "-n", + "node_names", + type=str, + multiple=True, + help=NODE_ARG_HELP, + callback=_deprecate_options, ) -@click.option("--node", "-n", "node_names", type=str, multiple=True, help=NODE_ARG_HELP) @click.option( - "--runner", "-r", type=str, default=None, multiple=False, help=RUNNER_ARG_HELP + "--nodes", + "nodes_names", + type=str, + default="", + help=NODE_ARG_HELP, + callback=split_node_names, ) -@click.option("--parallel", "-p", is_flag=True, multiple=False, help=PARALLEL_ARG_HELP) -@click.option("--async", "is_async", is_flag=True, multiple=False, help=ASYNC_ARG_HELP) +@click.option("--runner", "-r", type=str, default=None, help=RUNNER_ARG_HELP) +@click.option("--async", "is_async", is_flag=True, help=ASYNC_ARG_HELP) @env_option -@click.option("--tag", "-t", type=str, multiple=True, help=TAG_ARG_HELP) +@click.option( + "--tag", + "-t", + type=str, + multiple=True, + help=TAG_ARG_HELP, + callback=_deprecate_options, +) +@click.option( + "--tags", + type=str, + default="", + help=TAG_ARG_HELP, + callback=split_string, +) @click.option( "--load-version", "-lv", @@ -374,7 +384,15 @@ def activate_nbstripout( help=LOAD_VERSION_HELP, callback=_reformat_load_versions, ) -@click.option("--pipeline", type=str, default=None, help=PIPELINE_ARG_HELP) +@click.option( + "--load-versions", + type=str, + default="", + help=LOAD_VERSION_HELP, + callback=_split_load_versions, +) +@click.option("--pipeline", "-p", type=str, default=None, help=PIPELINE_ARG_HELP) +@click.option("--namespace", "-ns", type=str, default=None, help=NAMESPACE_ARG_HELP) @click.option( "--config", "-c", @@ -383,43 +401,58 @@ def activate_nbstripout( callback=_config_file_callback, ) @click.option( - "--params", type=str, default="", help=PARAMS_ARG_HELP, callback=_split_params + "--conf-source", + type=click.Path(exists=True, file_okay=True, resolve_path=True), + help=CONF_SOURCE_HELP, ) -# pylint: disable=too-many-arguments,unused-argument,too-many-locals -def run( +@click.option( + "--params", + type=click.UNPROCESSED, + default="", + help=PARAMS_ARG_HELP, + callback=_split_params, +) +def run( # noqa: too-many-arguments,unused-argument,too-many-locals tag, + tags, env, - parallel, runner, is_async, node_names, + nodes_names, to_nodes, from_nodes, from_inputs, to_outputs, load_version, + load_versions, pipeline, config, + conf_source, params, + namespace, ): """Run the pipeline.""" - if parallel and runner: - raise KedroCliError( - "Both --parallel and --runner options 
cannot be used together. " - "Please use either --parallel or --runner." - ) - runner = runner or "SequentialRunner" - if parallel: - runner = "ParallelRunner" - runner_class = load_obj(runner, "kedro.runner") - tag = _get_values_as_tuple(tag) if tag else tag - node_names = _get_values_as_tuple(node_names) if node_names else node_names + runner = load_obj(runner or "SequentialRunner", "kedro.runner") + + tag = _get_values_as_tuple(tag) + node_names = _get_values_as_tuple(node_names) + + # temporary duplicates for the plural flags + tags = _get_values_as_tuple(tags) + nodes_names = _get_values_as_tuple(nodes_names) + + tag = tag + tags + node_names = node_names + nodes_names + load_version = {**load_version, **load_versions} - with KedroSession.create(env=env, extra_params=params) as session: + with KedroSession.create( + env=env, conf_source=conf_source, extra_params=params + ) as session: session.run( tags=tag, - runner=runner_class(is_async=is_async), + runner=runner(is_async=is_async), node_names=node_names, from_nodes=from_nodes, to_nodes=to_nodes, @@ -427,4 +460,5 @@ def run( to_outputs=to_outputs, load_versions=load_version, pipeline_name=pipeline, + namespace=namespace, ) diff --git a/kedro/framework/cli/registry.py b/kedro/framework/cli/registry.py index 820819b564..05b05f9afd 100644 --- a/kedro/framework/cli/registry.py +++ b/kedro/framework/cli/registry.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """A collection of CLI commands for working with registered Kedro pipelines.""" import click import yaml @@ -34,7 +7,7 @@ from kedro.framework.startup import ProjectMetadata -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def registry_cli(): # pragma: no cover pass @@ -56,7 +29,7 @@ def list_registered_pipelines(): @click.pass_obj def describe_registered_pipeline( metadata: ProjectMetadata, name, **kwargs -): # pylint: disable=unused-argument, protected-access +): # noqa: unused-argument, protected-access """Describe a registered pipeline by providing a pipeline name. Defaults to the `__default__` pipeline. 
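# Illustrative sketch (not part of the Kedro source in this diff): the refactored
# `run()` above accepts both the deprecated singular flags (--node/--tag/--load-version)
# and their new plural counterparts (--nodes/--tags/--load-versions), then concatenates
# or merges the two before calling session.run(). A self-contained mirror of that merge;
# the sample flag values below are made up.
from itertools import chain


def values_as_tuple(values):
    """Split each comma-separated value and flatten, as _get_values_as_tuple does."""
    return tuple(chain.from_iterable(value.split(",") for value in values))


node_names = values_as_tuple(["node_a", "node_b"])  # deprecated: -n node_a -n node_b
nodes_names = values_as_tuple(["node_c,node_d"])  # new: --nodes "node_c,node_d"
print(node_names + nodes_names)  # ('node_a', 'node_b', 'node_c', 'node_d')

load_version = {"cars": "2023-01-01T00.00.00.000Z"}  # deprecated --load-version
load_versions = {"boats": "2023-02-02T00.00.00.000Z"}  # new --load-versions
print({**load_version, **load_versions})  # later keys win on collision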
""" @@ -65,7 +38,7 @@ def describe_registered_pipeline( all_pipeline_names = pipelines.keys() existing_pipelines = ", ".join(sorted(all_pipeline_names)) raise KedroCliError( - f"`{name}` pipeline not found. Existing pipelines: [{existing_pipelines}]" + f"'{name}' pipeline not found. Existing pipelines: [{existing_pipelines}]" ) nodes = [] diff --git a/kedro/framework/cli/starters.py b/kedro/framework/cli/starters.py index c4b3e704d5..25e68f3699 100644 --- a/kedro/framework/cli/starters.py +++ b/kedro/framework/cli/starters.py @@ -1,49 +1,23 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - - """kedro is a CLI for managing Kedro projects. This module implements commands available from the kedro CLI for creating projects. """ +from __future__ import annotations + import os import re import shutil import stat import tempfile from collections import OrderedDict +from itertools import groupby from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable import click -import git import yaml +from attrs import define, field import kedro from kedro import __version__ as version @@ -52,21 +26,56 @@ KedroCliError, _clean_pycache, _filter_deprecation_warnings, + _get_entry_points, + _safe_load_entry_point, command_with_verbosity, ) KEDRO_PATH = Path(kedro.__file__).parent TEMPLATE_PATH = KEDRO_PATH / "templates" / "project" +_STARTERS_REPO = "git+https://github.com/kedro-org/kedro-starters.git" + + +@define(order=True) +class KedroStarterSpec: # noqa: too-few-public-methods + """Specification of custom kedro starter template + Args: + alias: alias of the starter which shows up on `kedro starter list` and is used + by the starter argument of `kedro new` + template_path: path to a directory or a URL to a remote VCS repository supported + by `cookiecutter` + directory: optional directory inside the repository where the starter resides. + origin: reserved field used by kedro internally to determine where the starter + comes from, users do not need to provide this field. 
+ """ + + alias: str + template_path: str + directory: str | None = None + origin: str | None = field(init=False) + + +_OFFICIAL_STARTER_SPECS = [ + KedroStarterSpec("astro-airflow-iris", _STARTERS_REPO, "astro-airflow-iris"), + # The `astro-iris` was renamed to `astro-airflow-iris`, but old (external) + # documentation and tutorials still refer to `astro-iris`. We create an alias to + # check if a user has entered old `astro-iris` as the starter name and changes it + # to `astro-airflow-iris`. + KedroStarterSpec("astro-iris", _STARTERS_REPO, "astro-airflow-iris"), + KedroStarterSpec( + "standalone-datacatalog", _STARTERS_REPO, "standalone-datacatalog" + ), + KedroStarterSpec("pandas-iris", _STARTERS_REPO, "pandas-iris"), + KedroStarterSpec("pyspark", _STARTERS_REPO, "pyspark"), + KedroStarterSpec("pyspark-iris", _STARTERS_REPO, "pyspark-iris"), + KedroStarterSpec("spaceflights", _STARTERS_REPO, "spaceflights"), + KedroStarterSpec("databricks-iris", _STARTERS_REPO, "databricks-iris"), +] +# Set the origin for official starters +for starter_spec in _OFFICIAL_STARTER_SPECS: + starter_spec.origin = "kedro" +_OFFICIAL_STARTER_SPECS = {spec.alias: spec for spec in _OFFICIAL_STARTER_SPECS} -_STARTER_ALIASES = { - "astro-iris", - "mini-kedro", - "pandas-iris", - "pyspark", - "pyspark-iris", - "spaceflights", -} -_STARTERS_REPO = "git+https://github.com/quantumblacklabs/kedro-starters.git" CONFIG_ARG_HELP = """Non-interactive mode, using a configuration yaml file. This file must supply the keys required by the template's prompts.yml. When not using a starter, @@ -83,8 +92,8 @@ ) -# pylint: disable=unused-argument -def _remove_readonly(func: Callable, path: Path, excinfo: Tuple): # pragma: no cover +# noqa: unused-argument +def _remove_readonly(func: Callable, path: Path, excinfo: tuple): # pragma: no cover """Remove readonly files on Windows See: https://docs.python.org/3/library/shutil.html?highlight=shutil#rmtree-example """ @@ -92,7 +101,66 @@ def _remove_readonly(func: Callable, path: Path, excinfo: Tuple): # pragma: no func(path) -# pylint: disable=missing-function-docstring +def _get_starters_dict() -> dict[str, KedroStarterSpec]: + """This function lists all the starter aliases declared in + the core repo and in plugins entry points. 
+ + For example, the output for official kedro starters looks like: + {"astro-airflow-iris": + KedroStarterSpec( + name="astro-airflow-iris", + template_path="git+https://github.com/kedro-org/kedro-starters.git", + directory="astro-airflow-iris", + origin="kedro" + ), + "astro-iris": + KedroStarterSpec( + name="astro-iris", + template_path="git+https://github.com/kedro-org/kedro-starters.git", + directory="astro-airflow-iris", + origin="kedro" + ), + } + """ + starter_specs = _OFFICIAL_STARTER_SPECS + + for starter_entry_point in _get_entry_points(name="starters"): + origin = starter_entry_point.module.split(".")[0] + specs = _safe_load_entry_point(starter_entry_point) or [] + for spec in specs: + if not isinstance(spec, KedroStarterSpec): + click.secho( + f"The starter configuration loaded from module {origin}" + f"should be a 'KedroStarterSpec', got '{type(spec)}' instead", + fg="red", + ) + elif spec.alias in starter_specs: + click.secho( + f"Starter alias `{spec.alias}` from `{origin}` " + f"has been ignored as it is already defined by" + f"`{starter_specs[spec.alias].origin}`", + fg="red", + ) + else: + spec.origin = origin + starter_specs[spec.alias] = spec + return starter_specs + + +def _starter_spec_to_dict( + starter_specs: dict[str, KedroStarterSpec] +) -> dict[str, dict[str, str]]: + """Convert a dictionary of starters spec to a nicely formatted dictionary""" + format_dict: dict[str, dict[str, str]] = {} + for alias, spec in starter_specs.items(): + format_dict[alias] = {} # Each dictionary represent 1 starter + format_dict[alias]["template_path"] = spec.template_path + if spec.directory: + format_dict[alias]["directory"] = spec.directory + return format_dict + + +# noqa: missing-function-docstring @click.group(context_settings=CONTEXT_SETTINGS, name="Kedro") def create_cli(): # pragma: no cover pass @@ -106,31 +174,34 @@ def create_cli(): # pragma: no cover type=click.Path(exists=True), help=CONFIG_ARG_HELP, ) -@click.option("--starter", "-s", "starter_name", help=STARTER_ARG_HELP) +@click.option("--starter", "-s", "starter_alias", help=STARTER_ARG_HELP) @click.option("--checkout", help=CHECKOUT_ARG_HELP) @click.option("--directory", help=DIRECTORY_ARG_HELP) -def new( - config_path, starter_name, checkout, directory, **kwargs -): # pylint: disable=unused-argument +def new(config_path, starter_alias, checkout, directory, **kwargs): """Create a new kedro project.""" - if checkout and not starter_name: + if checkout and not starter_alias: raise KedroCliError("Cannot use the --checkout flag without a --starter value.") - if directory and not starter_name: + if directory and not starter_alias: raise KedroCliError( "Cannot use the --directory flag without a --starter value." ) - if starter_name in _STARTER_ALIASES: + starters_dict = _get_starters_dict() + + if starter_alias in starters_dict: if directory: raise KedroCliError( "Cannot use the --directory flag with a --starter alias." ) - template_path = _STARTERS_REPO - directory = starter_name + spec = starters_dict[starter_alias] + template_path = spec.template_path + # "directory" is an optional key for starters from plugins, so if the key is + # not present we will use "None". 
+ directory = spec.directory checkout = checkout or version - elif starter_name is not None: - template_path = starter_name + elif starter_alias is not None: + template_path = starter_alias checkout = checkout or version else: template_path = str(TEMPLATE_PATH) @@ -147,12 +218,14 @@ def new( # Ideally we would want to be able to use tempfile.TemporaryDirectory() context manager # but it causes an issue with readonly files on windows # see: https://bugs.python.org/issue26660. - # So onerror, we will attempt to clear the readonly bits and re-attempt the cleanup + # So on error, we will attempt to clear the readonly bits and re-attempt the cleanup shutil.rmtree(tmpdir, onerror=_remove_readonly) # Obtain config, either from a file or from interactive user prompts. if not prompts_required: - config = dict() + config = {} + if config_path: + config = _fetch_config_from_file(config_path) elif config_path: config = _fetch_config_from_file(config_path) _validate_config_file(config, prompts_required) @@ -171,16 +244,29 @@ def starter(): @starter.command("list") def list_starters(): """List all official project starters available.""" - repo_url = _STARTERS_REPO.replace("git+", "").replace( - ".git", "/tree/master/{alias}" + starters_dict = _get_starters_dict() + + # Group all specs by origin as nested dict and sort it. + sorted_starters_dict: dict[str, dict[str, KedroStarterSpec]] = { + origin: dict(sorted(starters_dict_by_origin)) + for origin, starters_dict_by_origin in groupby( + starters_dict.items(), lambda item: item[1].origin + ) + } + + # ensure kedro starters are listed first + sorted_starters_dict = dict( + sorted(sorted_starters_dict.items(), key=lambda x: x == "kedro") ) - output = [ - {alias: repo_url.format(alias=alias)} for alias in sorted(_STARTER_ALIASES) - ] - click.echo(yaml.safe_dump(output)) + + for origin, starters_spec in sorted_starters_dict.items(): + click.secho(f"\nStarters from {origin}\n", fg="yellow") + click.echo( + yaml.safe_dump(_starter_spec_to_dict(starters_spec), sort_keys=False) + ) -def _fetch_config_from_file(config_path: str) -> Dict[str, str]: +def _fetch_config_from_file(config_path: str) -> dict[str, str]: """Obtains configuration for a new kedro project non-interactively from a file. Args: @@ -196,7 +282,7 @@ def _fetch_config_from_file(config_path: str) -> Dict[str, str]: """ try: - with open(config_path, "r") as config_file: + with open(config_path, encoding="utf-8") as config_file: config = yaml.safe_load(config_file) if KedroCliError.VERBOSE_ERROR: @@ -211,10 +297,10 @@ def _fetch_config_from_file(config_path: str) -> Dict[str, str]: def _make_cookiecutter_args( - config: Dict[str, str], + config: dict[str, str], checkout: str, directory: str, -) -> Dict[str, Any]: +) -> dict[str, Any]: """Creates a dictionary of arguments to pass to cookiecutter. Args: @@ -247,7 +333,7 @@ def _make_cookiecutter_args( return cookiecutter_args -def _create_project(template_path: str, cookiecutter_args: Dict[str, str]): +def _create_project(template_path: str, cookiecutter_args: dict[str, Any]): """Creates a new kedro project using cookiecutter. Args: @@ -261,7 +347,7 @@ def _create_project(template_path: str, cookiecutter_args: Dict[str, str]): KedroCliError: If it fails to generate a project. 
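# Illustrative sketch (not part of the Kedro source in this diff): with the
# KedroStarterSpec and _get_starters_dict machinery introduced above, a plugin can
# advertise its own starters through the "kedro.starters" entry point group. The
# plugin name `my_kedro_plugin`, the alias and the template URL below are placeholders.
from kedro.framework.cli.starters import KedroStarterSpec

# Object referenced by the plugin's entry point: a list of specs.
starters = [
    KedroStarterSpec(
        alias="my-starter",
        template_path="git+https://github.com/example-org/my-kedro-starters.git",
        directory="my-starter",
    )
]

# In the plugin's pyproject.toml the entry point might be declared as:
#     [project.entry-points."kedro.starters"]
#     starters = "my_kedro_plugin.plugin:starters"
# after which `kedro starter list` groups the alias under the plugin's origin.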
""" with _filter_deprecation_warnings(): - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.main import cookiecutter # for performance reasons try: @@ -272,16 +358,28 @@ def _create_project(template_path: str, cookiecutter_args: Dict[str, str]): ) from exc _clean_pycache(Path(result_path)) + extra_context = cookiecutter_args["extra_context"] + project_name = extra_context.get("project_name", "New Kedro Project") + python_package = extra_context.get( + "python_package", project_name.lower().replace(" ", "_").replace("-", "_") + ) click.secho( - f"\nChange directory to the project generated in {result_path}", - fg="green", + f"\nThe project name '{project_name}' has been applied to: " + f"\n- The project title in {result_path}/README.md " + f"\n- The folder created for your project in {result_path} " + f"\n- The project's python package in {result_path}/src/{python_package}" ) click.secho( "\nA best-practice setup includes initialising git and creating " - "a virtual environment before running ``kedro install`` to install " + "a virtual environment before running 'pip install -r src/requirements.txt' to install " "project-specific dependencies. Refer to the Kedro documentation: " "https://kedro.readthedocs.io/" ) + click.secho( + f"\nChange directory to the project generated in {result_path} by " + f"entering 'cd {result_path}'", + fg="green", + ) def _get_cookiecutter_dir( @@ -291,14 +389,14 @@ def _get_cookiecutter_dir( clones it to ``tmpdir``; if template_path is a file path then directly uses that path without copying anything. """ - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.exceptions import RepositoryCloneFailed, RepositoryNotFound from cookiecutter.repository import determine_repo_dir # for performance reasons try: cookiecutter_dir, _ = determine_repo_dir( template=template_path, - abbreviations=dict(), + abbreviations={}, clone_to_dir=Path(tmpdir).resolve(), checkout=checkout, no_input=True, @@ -312,12 +410,16 @@ def _get_cookiecutter_dir( f" Specified tag {checkout}. The following tags are available: " + ", ".join(_get_available_tags(template_path)) ) - raise KedroCliError(error_message) from exc + official_starters = sorted(_OFFICIAL_STARTER_SPECS) + raise KedroCliError( + f"{error_message}. The aliases for the official Kedro starters are: \n" + f"{yaml.safe_dump(official_starters, sort_keys=False)}" + ) from exc return Path(cookiecutter_dir) -def _get_prompts_required(cookiecutter_dir: Path) -> Optional[Dict[str, Any]]: +def _get_prompts_required(cookiecutter_dir: Path) -> dict[str, Any] | None: """Finds the information a user must supply according to prompts.yml.""" prompts_yml = cookiecutter_dir / "prompts.yml" if not prompts_yml.is_file(): @@ -333,8 +435,8 @@ def _get_prompts_required(cookiecutter_dir: Path) -> Optional[Dict[str, Any]]: def _fetch_config_from_user_prompts( - prompts: Dict[str, Any], cookiecutter_context: OrderedDict -) -> Dict[str, str]: + prompts: dict[str, Any], cookiecutter_context: OrderedDict +) -> dict[str, str]: """Interactively obtains information from user prompts. Args: @@ -345,11 +447,11 @@ def _fetch_config_from_user_prompts( Configuration for starting a new project. This is passed as ``extra_context`` to cookiecutter and will overwrite the cookiecutter.json defaults. 
""" - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.environment import StrictEnvironment from cookiecutter.prompt import read_user_variable, render_variable - config: Dict[str, str] = dict() + config: dict[str, str] = {} for variable_name, prompt_dict in prompts.items(): prompt = _Prompt(**prompt_dict) @@ -357,7 +459,7 @@ def _fetch_config_from_user_prompts( # render the variable on the command line cookiecutter_variable = render_variable( env=StrictEnvironment(context=cookiecutter_context), - raw=cookiecutter_context[variable_name], + raw=cookiecutter_context.get(variable_name), cookiecutter_dict=config, ) @@ -370,7 +472,7 @@ def _fetch_config_from_user_prompts( def _make_cookiecutter_context_for_prompts(cookiecutter_dir: Path): - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.generate import generate_context cookiecutter_context = generate_context(cookiecutter_dir / "cookiecutter.json") @@ -380,7 +482,7 @@ def _make_cookiecutter_context_for_prompts(cookiecutter_dir: Path): class _Prompt: """Represent a single CLI prompt for `kedro new`""" - def __init__(self, *args, **kwargs) -> None: # pylint: disable=unused-argument + def __init__(self, *args, **kwargs) -> None: # noqa: unused-argument try: self.title = kwargs["title"] except KeyError as exc: @@ -402,12 +504,17 @@ def __str__(self) -> str: def validate(self, user_input: str) -> None: """Validate a given prompt value against the regex validator""" if self.regexp and not re.match(self.regexp, user_input): - click.secho(f"`{user_input}` is an invalid value.", fg="red", err=True) + message = f"'{user_input}' is an invalid value for {self.title}." + click.secho(message, fg="red", err=True) click.secho(self.error_message, fg="red", err=True) - raise ValueError(user_input) + raise ValueError(message, self.error_message) + +def _get_available_tags(template_path: str) -> list: + # Not at top level so that kedro CLI works without a working git executable. + # noqa: import-outside-toplevel + import git -def _get_available_tags(template_path: str) -> List: try: tags = git.cmd.Git().ls_remote("--tags", template_path.replace("git+", "")) @@ -423,7 +530,7 @@ def _get_available_tags(template_path: str) -> List: return sorted(unique_tags) -def _validate_config_file(config: Dict[str, str], prompts: Dict[str, Any]): +def _validate_config_file(config: dict[str, str], prompts: dict[str, Any]): """Checks that the configuration file contains all needed variables. Args: @@ -436,7 +543,6 @@ def _validate_config_file(config: Dict[str, str], prompts: Dict[str, Any]): """ if config is None: raise KedroCliError("Config file is empty.") - missing_keys = set(prompts) - set(config) if missing_keys: click.echo(yaml.dump(config, default_flow_style=False)) @@ -444,6 +550,6 @@ def _validate_config_file(config: Dict[str, str], prompts: Dict[str, Any]): if "output_dir" in config and not Path(config["output_dir"]).exists(): raise KedroCliError( - f"`{config['output_dir']}` is not a valid output directory. " + f"'{config['output_dir']}' is not a valid output directory. " "It must be a relative or absolute path to an existing directory." 
) diff --git a/kedro/framework/cli/utils.py b/kedro/framework/cli/utils.py index d890c5e219..3240c3c4ab 100644 --- a/kedro/framework/cli/utils.py +++ b/kedro/framework/cli/utils.py @@ -1,33 +1,8 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Utilities for use with click.""" +from __future__ import annotations + import difflib +import logging import re import shlex import shutil @@ -41,12 +16,13 @@ from importlib import import_module from itertools import chain from pathlib import Path -from typing import Dict, Iterable, List, Mapping, Sequence, Set, Tuple, Union +from typing import Iterable, Sequence import click -import pkg_resources +import importlib_metadata +from omegaconf import OmegaConf -CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]} MAX_SUGGESTIONS = 3 CUTOFF = 0.5 @@ -59,10 +35,13 @@ "line_magic": "kedro.line_magic", "hooks": "kedro.hooks", "cli_hooks": "kedro.cli_hooks", + "starters": "kedro.starters", } +logger = logging.getLogger(__name__) + -def call(cmd: List[str], **kwargs): # pragma: no cover +def call(cmd: list[str], **kwargs): # pragma: no cover """Run a subprocess command and raise if it fails. Args: @@ -73,7 +52,7 @@ def call(cmd: List[str], **kwargs): # pragma: no cover click.exceptions.Exit: If `subprocess.run` returns non-zero code. 
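# Illustrative sketch (not part of the Kedro source in this diff): utils.py now relies
# on importlib_metadata rather than pkg_resources, and ENTRY_POINT_GROUPS above gains a
# "starters" group. The helpers later in this diff discover plugins by selecting on
# those groups; a minimal stand-alone version of that lookup, assuming the
# importlib_metadata backport is installed:
import importlib_metadata

# e.g. "kedro.project_commands", "kedro.hooks" or the new "kedro.starters"
for entry_point in importlib_metadata.entry_points().select(group="kedro.starters"):
    print(entry_point.name, "->", entry_point.module)
    # entry_point.load() imports and returns the referenced object;
    # _safe_load_entry_point wraps this in try/except so that a broken
    # plugin only logs a warning instead of crashing the CLI.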
""" click.echo(" ".join(shlex.quote(c) for c in cmd)) - # pylint: disable=subprocess-run-check + # noqa: subprocess-run-check code = subprocess.run(cmd, **kwargs).returncode if code: raise click.exceptions.Exit(code=code) @@ -102,10 +81,10 @@ def wrapit(func): func = command_with_verbosity( group, name=name, - context_settings=dict( - ignore_unknown_options=True, - help_option_names=[] if forward_help else ["-h", "--help"], - ), + context_settings={ + "ignore_unknown_options": True, + "help_option_names": [] if forward_help else ["-h", "--help"], + }, )(func) return func @@ -133,7 +112,7 @@ def _suggest_cli_command( class CommandCollection(click.CommandCollection): """Modified from the Click one to still run the source groups function.""" - def __init__(self, *groups: Tuple[str, Sequence[click.MultiCommand]]): + def __init__(self, *groups: tuple[str, Sequence[click.MultiCommand]]): self.groups = [ (title, self._merge_same_name_collections(cli_list)) for title, cli_list in groups @@ -160,7 +139,7 @@ def _dedupe_commands(cli_collections: Sequence[click.CommandCollection]): """Deduplicate commands by keeping the ones from the last source in the list. """ - seen_names: Set[str] = set() + seen_names: set[str] = set() for cli_collection in reversed(cli_collections): for cmd_group in reversed(cli_collection.sources): cmd_group.commands = { # type: ignore @@ -180,8 +159,8 @@ def _dedupe_commands(cli_collections: Sequence[click.CommandCollection]): @staticmethod def _merge_same_name_collections(groups: Sequence[click.MultiCommand]): - named_groups: Mapping[str, List[click.MultiCommand]] = defaultdict(list) - helps: Mapping[str, list] = defaultdict(list) + named_groups: defaultdict[str, list[click.MultiCommand]] = defaultdict(list) + helps: defaultdict[str, list] = defaultdict(list) for group in groups: named_groups[group.name].append(group) if group.help: @@ -199,7 +178,7 @@ def _merge_same_name_collections(groups: Sequence[click.MultiCommand]): if cli_list ] - def resolve_command(self, ctx: click.core.Context, args: List): + def resolve_command(self, ctx: click.core.Context, args: list): try: return super().resolve_command(ctx, args) except click.exceptions.UsageError as exc: @@ -222,7 +201,7 @@ def format_commands( group.format_commands(ctx, formatter) -def get_pkg_version(reqs_path: (Union[str, Path]), package_name: str) -> str: +def get_pkg_version(reqs_path: (str | Path), package_name: str) -> str: """Get package version from requirements.txt. 
Args: @@ -238,19 +217,19 @@ def get_pkg_version(reqs_path: (str | Path), package_name: str) -> str: """ reqs_path = Path(reqs_path).absolute() if not reqs_path.is_file(): - raise KedroCliError(f"Given path `{reqs_path}` is not a regular file.") + raise KedroCliError(f"Given path '{reqs_path}' is not a regular file.") pattern = re.compile(package_name + r"([^\w]|$)") - with reqs_path.open("r") as reqs_file: + with reqs_path.open("r", encoding="utf-8") as reqs_file: for req_line in reqs_file: - req_line = req_line.strip() + req_line = req_line.strip() # noqa: redefined-loop-name if pattern.search(req_line): return req_line - raise KedroCliError(f"Cannot find `{package_name}` package in `{reqs_path}`.") + raise KedroCliError(f"Cannot find '{package_name}' package in '{reqs_path}'.") -def _update_verbose_flag(ctx, param, value): # pylint: disable=unused-argument +def _update_verbose_flag(ctx, param, value): # noqa: unused-argument KedroCliError.VERBOSE_ERROR = value @@ -286,7 +265,7 @@ class KedroCliError(click.exceptions.ClickException): def show(self, file=None): if file is None: - # pylint: disable=protected-access + # noqa: protected-access file = click._compat.get_text_stderr() if self.VERBOSE_ERROR: click.secho(traceback.format_exc(), nl=False, fg="yellow") @@ -312,36 +291,55 @@ def _clean_pycache(path: Path): shutil.rmtree(each, ignore_errors=True) -def split_string(ctx, param, value): # pylint: disable=unused-argument +def split_string(ctx, param, value): # noqa: unused-argument """Split string by comma.""" return [item.strip() for item in value.split(",") if item.strip()] +# noqa: unused-argument,missing-param-doc,missing-type-doc +def split_node_names(ctx, param, to_split: str) -> list[str]: + """Split string by comma, ignoring commas enclosed by square parentheses. + This avoids splitting the string of nodes names on commas included in + default node names, which have the pattern + <function_name>([<input_name>,...]) -> [<output_name>,...]) + + Note: + - `to_split` will have such commas if and only if it includes a + default node name. User-defined node names cannot include commas + or square brackets. + - This function will no longer be necessary from Kedro 0.19.*, + in which default node names will no longer contain commas + + Args: + to_split: the string to split safely + + Returns: + A list containing the result of safe-splitting the string. 
+ """ + result = [] + argument, match_state = "", 0 + for char in to_split + ",": + if char == "[": + match_state += 1 + elif char == "]": + match_state -= 1 + if char == "," and match_state == 0 and argument: + argument = argument.strip() + result.append(argument) + argument = "" + else: + argument += char + return result + + def env_option(func_=None, **kwargs): """Add `--env` CLI option to a function.""" - default_args = dict(type=str, default=None, help=ENV_HELP) + default_args = {"type": str, "default": None, "help": ENV_HELP} kwargs = {**default_args, **kwargs} opt = click.option("--env", "-e", **kwargs) return opt(func_) if func_ else opt -def ipython_message(all_kernels=True): - """Show a message saying how we have configured the IPython env.""" - ipy_vars = ["startup_error", "context"] - click.secho("-" * 79, fg="cyan") - click.secho("Starting a Kedro session with the following variables in scope") - click.secho(", ".join(ipy_vars), fg="green") - line_magic = click.style("%reload_kedro", fg="green") - click.secho(f"Use the line magic {line_magic} to refresh them") - click.secho("or to see the error message if they are undefined") - - if not all_kernels: - click.secho("The choice of kernels is limited to the default one.", fg="yellow") - click.secho("(restart with --all-kernels to get access to others)", fg="yellow") - - click.secho("-" * 79, fg="cyan") - - @contextmanager def _filter_deprecation_warnings(): """Temporarily suppress all DeprecationWarnings.""" @@ -355,11 +353,32 @@ def _check_module_importable(module_name: str) -> None: import_module(module_name) except ImportError as exc: raise KedroCliError( - f"Module `{module_name}` not found. Make sure to install required project " - f"dependencies by running the `kedro install` command first." + f"Module '{module_name}' not found. Make sure to install required project " + f"dependencies by running the 'pip install -r src/requirements.txt' command first." ) from exc +def _get_entry_points(name: str) -> importlib_metadata.EntryPoints: + """Get all kedro related entry points""" + return importlib_metadata.entry_points().select(group=ENTRY_POINT_GROUPS[name]) + + +def _safe_load_entry_point( # noqa: inconsistent-return-statements + entry_point, +): + """Load entrypoint safely, if fails it will just skip the entrypoint.""" + try: + return entry_point.load() + except Exception as exc: # noqa: broad-except + logger.warning( + "Failed to load %s commands from %s. Full exception: %s", + entry_point.module, + entry_point, + exc, + ) + return + + def load_entry_points(name: str) -> Sequence[click.MultiCommand]: """Load package entry point commands. @@ -373,23 +392,22 @@ def load_entry_points(name: str) -> Sequence[click.MultiCommand]: List of entry point commands. """ - entry_points = pkg_resources.iter_entry_points(group=ENTRY_POINT_GROUPS[name]) + entry_point_commands = [] - for entry_point in entry_points: - try: - entry_point_commands.append(entry_point.load()) - except Exception as exc: - raise KedroCliError(f"Loading {name} commands from {entry_point}") from exc + for entry_point in _get_entry_points(name): + loaded_entry_point = _safe_load_entry_point(entry_point) + if loaded_entry_point: + entry_point_commands.append(loaded_entry_point) return entry_point_commands -def _config_file_callback(ctx, param, value): # pylint: disable=unused-argument +def _config_file_callback(ctx, param, value): # noqa: unused-argument """CLI callback that replaces command line options with values specified in a config file. 
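# Illustrative sketch (not part of the Kedro source in this diff): split_node_names
# above splits a comma-separated --nodes value while ignoring commas inside square
# brackets, so auto-generated default node names such as "func([in1,in2]) -> [out]"
# survive intact. A self-contained mirror of that logic, for illustration only (the
# real callback also receives click's ctx/param arguments):
from __future__ import annotations


def split_safely(to_split: str) -> list[str]:
    result = []
    argument, depth = "", 0
    for char in to_split + ",":
        if char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
        if char == "," and depth == 0 and argument:
            result.append(argument.strip())
            argument = ""
        else:
            argument += char
    return result


print(split_safely("clean,func([in1,in2]) -> [out],report"))
# ['clean', 'func([in1,in2]) -> [out]', 'report']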
If command line options are passed, they override config file values. """ # for performance reasons - import anyconfig # pylint: disable=import-outside-toplevel + import anyconfig # noqa: import-outside-toplevel ctx.default_map = ctx.default_map or {} section = ctx.info_name @@ -401,20 +419,21 @@ def _config_file_callback(ctx, param, value): # pylint: disable=unused-argument return value -def _reformat_load_versions( # pylint: disable=unused-argument - ctx, param, value -) -> Dict[str, str]: +def _reformat_load_versions(ctx, param, value) -> dict[str, str]: """Reformat data structure from tuple to dictionary for `load-version`, e.g.: ('dataset1:time1', 'dataset2:time2') -> {"dataset1": "time1", "dataset2": "time2"}. """ - load_versions_dict = {} + if param.name == "load_version": + _deprecate_options(ctx, param, value) + load_versions_dict = {} for load_version in value: + load_version = load_version.strip() # noqa: PLW2901 load_version_list = load_version.split(":", 1) - if len(load_version_list) != 2: + if len(load_version_list) != 2: # noqa: PLR2004 raise KedroCliError( - f"Expected the form of `load_version` to be " - f"`dataset_name:YYYY-MM-DDThh.mm.ss.sssZ`," + f"Expected the form of 'load_version' to be " + f"'dataset_name:YYYY-MM-DDThh.mm.ss.sssZ'," f"found {load_version} instead" ) load_versions_dict[load_version_list[0]] = load_version_list[1] @@ -422,75 +441,69 @@ def _reformat_load_versions( # pylint: disable=unused-argument return load_versions_dict -def _try_convert_to_numeric(value): - try: - value = float(value) - except ValueError: - return value - return int(value) if value.is_integer() else value - - def _split_params(ctx, param, value): if isinstance(value, dict): return value - result = {} + dot_list = [] for item in split_string(ctx, param, value): - item = item.split(":", 1) - if len(item) != 2: + equals_idx = item.find("=") + colon_idx = item.find(":") + if equals_idx != -1 and colon_idx != -1 and equals_idx < colon_idx: + # For cases where key-value pair is separated by = and the value contains a colon + # which should not be replaced by = + pass + else: + item = item.replace(":", "=", 1) # noqa: redefined-loop-name + items = item.split("=", 1) + if len(items) != 2: # noqa: PLR2004 ctx.fail( f"Invalid format of `{param.name}` option: " - f"Item `{item[0]}` must contain " - f"a key and a value separated by `:`." + f"Item `{items[0]}` must contain " + f"a key and a value separated by `:` or `=`." ) - key = item[0].strip() + key = items[0].strip() if not key: ctx.fail( f"Invalid format of `{param.name}` option: Parameter key " f"cannot be an empty string." ) - value = item[1].strip() - result[key] = _try_convert_to_numeric(value) - return result - + dot_list.append(item) + conf = OmegaConf.from_dotlist(dot_list) + return OmegaConf.to_container(conf) -def _get_values_as_tuple(values: Iterable[str]) -> Tuple[str, ...]: - return tuple(chain.from_iterable(value.split(",") for value in values)) +def _split_load_versions(ctx, param, value): + lv_tuple = _get_values_as_tuple([value]) + return _reformat_load_versions(ctx, param, lv_tuple) if value else {} -def _get_requirements_in(source_path: Path, create_empty: bool = False) -> Path: - """Get path to project level requirements.in, creating it if required. - Args: - source_path: Path to the project `src` folder. - create_empty: Whether an empty requirements.in file should be created if - requirements.in does not exist and there is also no requirements.txt to - copy requirements from. - - Returns: - Path to requirements.in. 
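# A rough sketch of the parsing approach `_split_params` now delegates to: build a
# dot-list of `key=value` strings and let OmegaConf nest and type-cast them. Requires
# the `omegaconf` package; the parameter names below are made up for illustration.
from omegaconf import OmegaConf

conf = OmegaConf.from_dotlist(
    ["model.learning_rate=0.01", "model.epochs=10", "run_name=test"]
)
print(OmegaConf.to_container(conf))
# {'model': {'learning_rate': 0.01, 'epochs': 10}, 'run_name': 'test'}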
+def _get_values_as_tuple(values: Iterable[str]) -> tuple[str, ...]: + return tuple(chain.from_iterable(value.split(",") for value in values)) - Raises: - FileNotFoundError: If neither requirements.in nor requirements.txt is found. - """ - requirements_in = source_path / "requirements.in" - if requirements_in.is_file(): - return requirements_in - - requirements_txt = source_path / "requirements.txt" - if requirements_txt.is_file(): - click.secho( - "No requirements.in found. Copying contents from requirements.txt..." +def _deprecate_options(ctx, param, value): + deprecated_flag = { + "node_names": "--node", + "tag": "--tag", + "load_version": "--load-version", + } + new_flag = { + "node_names": "--nodes", + "tag": "--tags", + "load_version": "--load-versions", + } + shorthand_flag = { + "node_names": "-n", + "tag": "-t", + "load_version": "-lv", + } + if value: + deprecation_message = ( + f"DeprecationWarning: 'kedro run' flag '{deprecated_flag[param.name]}' is deprecated " + "and will not be available from Kedro 0.19.0. " + f"Use the flag '{new_flag[param.name]}' instead. Shorthand " + f"'{shorthand_flag[param.name]}' will be updated to use " + f"'{new_flag[param.name]}' in Kedro 0.19.0." ) - shutil.copyfile(str(requirements_txt), str(requirements_in)) - return requirements_in - - if create_empty: - click.secho("Creating empty requirements.in...") - requirements_in.touch() - return requirements_in - - raise FileNotFoundError( - "No project requirements.in or requirements.txt found in `/src`. " - "Please create either and try again." - ) + click.secho(deprecation_message, fg="red") + return value diff --git a/kedro/framework/context/__init__.py b/kedro/framework/context/__init__.py index f5288f38fa..41891e5c72 100644 --- a/kedro/framework/context/__init__.py +++ b/kedro/framework/context/__init__.py @@ -1,34 +1,7 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.framework.context`` provides functionality for loading Kedro project context. 
""" -from .context import KedroContext # NOQA -from .context import KedroContextError # NOQA +from .context import KedroContext, KedroContextError + +__all__ = ["KedroContext", "KedroContextError"] diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 45fa66b259..003ff696ae 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -1,68 +1,20 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
"""This module provides context for Kedro project.""" +from __future__ import annotations -import functools import logging from copy import deepcopy from pathlib import Path, PurePosixPath, PureWindowsPath -from typing import Any, Dict, Iterable, Optional, Union +from typing import Any from urllib.parse import urlparse from warnings import warn +from attrs import field, frozen +from pluggy import PluginManager + from kedro.config import ConfigLoader, MissingConfigException -from kedro.framework.hooks import get_hook_manager -from kedro.framework.project import pipelines, settings +from kedro.framework.project import settings from kedro.io import DataCatalog -from kedro.io.core import generate_timestamp -from kedro.pipeline import Pipeline from kedro.pipeline.pipeline import _transcode_split -from kedro.runner.runner import AbstractRunner -from kedro.runner.sequential_runner import SequentialRunner -from kedro.versioning import Journal - - -def _deprecate(version): - """Decorator to deprecate a few of the context's properties.""" - - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - property_name = func.__name__ - warn( - f"Accessing {property_name} via the context will be deprecated in Kedro {version}.", - DeprecationWarning, - ) - return func(*args, **kwargs) - - return wrapper - - return decorator def _is_relative_path(path_string: str) -> bool: @@ -71,12 +23,12 @@ def _is_relative_path(path_string: str) -> bool: Example: :: >>> _is_relative_path("data/01_raw") == True - >>> _is_relative_path("logs/info.log") == True + >>> _is_relative_path("info.log") == True >>> _is_relative_path("/tmp/data/01_raw") == False - >>> _is_relative_path(r"C:\\logs\\info.log") == False - >>> _is_relative_path(r"\\logs\\'info.log") == False - >>> _is_relative_path("c:/logs/info.log") == False - >>> _is_relative_path("s3://logs/info.log") == False + >>> _is_relative_path(r"C:\\info.log") == False + >>> _is_relative_path(r"\\'info.log") == False + >>> _is_relative_path("c:/info.log") == False + >>> _is_relative_path("s3://info.log") == False Args: path_string: The path string to check. @@ -101,8 +53,8 @@ def _is_relative_path(path_string: str) -> bool: def _convert_paths_to_absolute_posix( - project_path: Path, conf_dictionary: Dict[str, Any] -) -> Dict[str, Any]: + project_path: Path, conf_dictionary: dict[str, Any] +) -> dict[str, Any]: """Turn all relative paths inside ``conf_dictionary`` into absolute paths by appending them to ``project_path`` and convert absolute Windows paths to POSIX format. This is a hack to make sure that we don't have to change user's working directory for logging and datasets to @@ -116,13 +68,13 @@ def _convert_paths_to_absolute_posix( >>> conf_dictionary={ >>> "handlers": { >>> "info_file_handler": { - >>> "filename": "logs/info.log" + >>> "filename": "info.log" >>> } >>> } >>> } >>> ) >>> print(conf['handlers']['info_file_handler']['filename']) - "/path/to/my/project/logs/info.log" + "/path/to/my/project/info.log" Args: project_path: The root directory to prepend to relative path to make absolute path. @@ -141,7 +93,6 @@ def _convert_paths_to_absolute_posix( conf_keys_with_filepath = ("filename", "filepath", "path") for conf_key, conf_value in conf_dictionary.items(): - # if the conf_value is another dictionary, absolutify its paths first. 
if isinstance(conf_value, dict): conf_dictionary[conf_key] = _convert_paths_to_absolute_posix( @@ -168,152 +119,77 @@ def _convert_paths_to_absolute_posix( return conf_dictionary -def _validate_layers_for_transcoding(catalog: DataCatalog) -> None: - """Check that transcoded names that correspond to - the same dataset also belong to the same layer. - """ +def _validate_transcoded_datasets(catalog: DataCatalog): + """Validates transcoded datasets are correctly named - def _find_conflicts(): - base_names_to_layer = {} - for current_layer, dataset_names in catalog.layers.items(): - for name in dataset_names: - base_name, _ = _transcode_split(name) - known_layer = base_names_to_layer.setdefault(base_name, current_layer) - if current_layer != known_layer: - yield name - else: - base_names_to_layer[base_name] = current_layer - - conflicting_datasets = sorted(_find_conflicts()) - if conflicting_datasets: - error_str = ", ".join(conflicting_datasets) - raise ValueError( - f"Transcoded datasets should have the same layer. Mismatch found for: {error_str}" - ) + Args: + catalog (DataCatalog): The catalog object containing the + datasets to be validated. + Raises: + ValueError: If a dataset name does not conform to the expected + transcoding naming conventions,a ValueError is raised by the + `_transcode_split` function. -class KedroContext: - """``KedroContext`` is the base class which holds the configuration and - Kedro's main functionality. """ + # noqa: protected-access + for dataset_name in catalog._data_sets.keys(): + _transcode_split(dataset_name) - _CONF_SOURCE = "conf" - """CONF_SOURCE: Name of root directory containing project configuration. - Default name is "conf".""" - - def __init__( - self, - package_name: str, - project_path: Union[Path, str], - env: str = None, - extra_params: Dict[str, Any] = None, - ): - """Create a context object by providing the root of a Kedro project and - the environment configuration subfolders - (see ``kedro.config.ConfigLoader``) - - Raises: - KedroContextError: If there is a mismatch - between Kedro project version and package version. - - Args: - package_name: Package name for the Kedro project the context is - created for. - project_path: Project path to define the context for. - env: Optional argument for configuration default environment to be used - for running the pipeline. If not specified, it defaults to "local". - extra_params: Optional dictionary containing extra project parameters. - If specified, will update (and therefore take precedence over) - the parameters retrieved from the project configuration. - """ - self._project_path = Path(project_path).expanduser().resolve() - self._package_name = package_name - - self._env = env - self._extra_params = deepcopy(extra_params) - - @property # type: ignore - @_deprecate(version="0.18.0") - def CONF_SOURCE(self) -> str: # pylint: disable=invalid-name - """Deprecated in favour of settings.CONF_SOURCE - - Returns: - The root directory of the configuration directory of the project. - Raises: - DeprecationWarning - """ - return self._CONF_SOURCE - - @CONF_SOURCE.setter # type: ignore - @_deprecate(version="0.18.0") - def CONF_SOURCE(self, value: str) -> None: # pylint: disable=invalid-name - """Deprecated in favour of settings.CONF_SOURCE - Raises: - DeprecationWarning - """ - self._CONF_SOURCE = value # pylint: disable=invalid-name - - @property # type: ignore - def env(self) -> Optional[str]: - """Property for the current Kedro environment. - Returns: - Name of the current Kedro environment. 
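# An illustrative reduction (not Kedro's actual helper) of the idea behind
# `_convert_paths_to_absolute_posix`: prepend the project path to relative entries and
# normalise to POSIX form so catalog paths work regardless of the working directory.
from pathlib import Path, PurePosixPath

def _absolutify(project_path: Path, filepath: str) -> str:
    if Path(filepath).is_absolute():
        return filepath
    return (PurePosixPath(project_path.as_posix()) / filepath).as_posix()

print(_absolutify(Path("/path/to/project"), "data/01_raw/cars.csv"))
# /path/to/project/data/01_raw/cars.csv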
- - """ - return self._env +def _update_nested_dict(old_dict: dict[Any, Any], new_dict: dict[Any, Any]) -> None: + """Update a nested dict with values of new_dict. - @property # type: ignore - @_deprecate(version="0.18.0") - def package_name(self) -> str: - """Property for Kedro project package name. + Args: + old_dict: dict to be updated + new_dict: dict to use for updating old_dict - Returns: - Name of Kedro project package. + """ + for key, value in new_dict.items(): + if key not in old_dict: + old_dict[key] = value + elif isinstance(old_dict[key], dict) and isinstance(value, dict): + _update_nested_dict(old_dict[key], value) + else: + old_dict[key] = value - """ - return self._package_name - @property # type: ignore - @_deprecate(version="0.18.0") - def pipeline(self) -> Pipeline: - """Read-only property for an instance of Pipeline. +def _expand_full_path(project_path: str | Path) -> Path: + return Path(project_path).expanduser().resolve() - Returns: - Default pipeline. - Raises: - KedroContextError: If the `__default__` pipeline is not - defined by `register_pipelines`. - """ - try: - return pipelines["__default__"] - except KeyError as exc: # pragma: no cover - raise KedroContextError( - "Failed to find the pipeline named '__default__'. " - "It needs to be generated and returned " - "by the 'register_pipelines' function." - ) from exc - - @property # type: ignore - @_deprecate(version="0.18.0") - def pipelines(self) -> Dict[str, Pipeline]: - """Read-only property for an instance of Pipeline. +@frozen +class KedroContext: + """``KedroContext`` is the base class which holds the configuration and + Kedro's main functionality. + """ - Returns: - A dictionary of defined pipelines. - """ - return dict(pipelines) + _package_name: str + project_path: Path = field(converter=_expand_full_path) + config_loader: ConfigLoader + _hook_manager: PluginManager + env: str | None = None + _extra_params: dict[str, Any] | None = field(default=None, converter=deepcopy) - @property - def project_path(self) -> Path: - """Read-only property containing Kedro's root project directory. + """Create a context object by providing the root of a Kedro project and + the environment configuration subfolders (see ``kedro.config.ConfigLoader``) - Returns: - Project directory. + Raises: + KedroContextError: If there is a mismatch + between Kedro project version and package version. - """ - return self._project_path + Args: + package_name: Package name for the Kedro project the context is + created for. + project_path: Project path to define the context for. + config_loader: Kedro's ``ConfigLoader`` for loading the configuration files. + hook_manager: The ``PluginManager`` to activate hooks, supplied by the session. + env: Optional argument for configuration default environment to be used + for running the pipeline. If not specified, it defaults to "local". + extra_params: Optional dictionary containing extra project parameters. + If specified, will update (and therefore take precedence over) + the parameters retrieved from the project configuration. + """ @property def catalog(self) -> DataCatalog: @@ -328,7 +204,7 @@ def catalog(self) -> DataCatalog: return self._get_catalog() @property - def params(self) -> Dict[str, Any]: + def params(self) -> dict[str, Any]: """Read-only property referring to Kedro's parameters for this context. Returns: @@ -336,21 +212,17 @@ def params(self) -> Dict[str, Any]: extra parameters passed at initialization. 
""" try: - # '**/parameters*' reads modular pipeline configs - params = self.config_loader.get( - "parameters*", "parameters*/**", "**/parameters*" - ) + params = self.config_loader["parameters"] except MissingConfigException as exc: warn(f"Parameters not found in your Kedro project config.\n{str(exc)}") params = {} - params.update(self._extra_params or {}) + _update_nested_dict(params, self._extra_params or {}) return params def _get_catalog( self, save_version: str = None, - journal: Journal = None, - load_versions: Dict[str, str] = None, + load_versions: dict[str, str] = None, ) -> DataCatalog: """A hook for changing the creation of a DataCatalog instance. @@ -361,7 +233,7 @@ def _get_catalog( """ # '**/catalog*' reads modular pipeline configs - conf_catalog = self.config_loader.get("catalog*", "catalog*/**", "**/catalog*") + conf_catalog = self.config_loader["catalog"] # turn relative paths in conf_catalog into absolute paths # before initializing the catalog conf_catalog = _convert_paths_to_absolute_posix( @@ -369,90 +241,27 @@ def _get_catalog( ) conf_creds = self._get_config_credentials() - hook_manager = get_hook_manager() - catalog = hook_manager.hook.register_catalog( # pylint: disable=no-member + catalog = settings.DATA_CATALOG_CLASS.from_config( catalog=conf_catalog, credentials=conf_creds, load_versions=load_versions, save_version=save_version, - journal=journal, ) - if not isinstance(catalog, DataCatalog): - raise KedroContextError( - f"Expected an instance of `DataCatalog`, " - f"got `{type(catalog).__name__}` instead." - ) feed_dict = self._get_feed_dict() catalog.add_feed_dict(feed_dict) - if catalog.layers: - _validate_layers_for_transcoding(catalog) - hook_manager = get_hook_manager() - hook_manager.hook.after_catalog_created( # pylint: disable=no-member + _validate_transcoded_datasets(catalog) + self._hook_manager.hook.after_catalog_created( catalog=catalog, conf_catalog=conf_catalog, conf_creds=conf_creds, feed_dict=feed_dict, save_version=save_version, load_versions=load_versions, - run_id=self.run_id or save_version, ) return catalog - @property # type: ignore - @_deprecate(version="0.18.0") - def io(self) -> DataCatalog: - """Read-only alias property referring to Kedro's ``DataCatalog`` for this - context. - - Returns: - DataCatalog defined in `catalog.yml`. - Raises: - KedroContextError: Incorrect ``DataCatalog`` registered for the project. - - """ - # pylint: disable=invalid-name - return self.catalog - - def _get_config_loader(self) -> ConfigLoader: - """A hook for changing the creation of a ConfigLoader instance. - - Returns: - Instance of `ConfigLoader` created by `settings.py`. - Raises: - KedroContextError: Incorrect ``ConfigLoader`` registered for the project. - - """ - try: - return settings.CONFIG_LOADER_CLASS( - conf_source=str(self.project_path / settings.CONF_SOURCE), - env=self.env, - runtime_params=self._extra_params, - **settings.CONFIG_LOADER_ARGS, - ) - - except TypeError as exc: - raise KedroContextError( - f"Expected an instance of `ConfigLoader`, " - f"got `{settings.CONFIG_LOADER_CLASS}` of class " - f"`{type(settings.CONFIG_LOADER_CLASS)}` instead.\n" - f"The provided `CONFIG_LOADER_ARGS were: {settings.CONFIG_LOADER_ARGS}" - ) from exc - - @property - def config_loader(self) -> ConfigLoader: - """Read-only property referring to Kedro's ``ConfigLoader`` for this - context. - - Returns: - Instance of `ConfigLoader`. - Raises: - KedroContextError: Incorrect ``ConfigLoader`` registered for the project. 
- - """ - return self._get_config_loader() - - def _get_feed_dict(self) -> Dict[str, Any]: + def _get_feed_dict(self) -> dict[str, Any]: """Get parameters and return the feed dictionary.""" params = self.params feed_dict = {"parameters": params} @@ -472,7 +281,6 @@ def _add_param_to_feed_dict(param_name, param_value): """ key = f"params:{param_name}" feed_dict[key] = param_value - if isinstance(param_value, dict): for key, val in param_value.items(): _add_param_to_feed_dict(f"{param_name}.{key}", val) @@ -482,164 +290,17 @@ def _add_param_to_feed_dict(param_name, param_value): return feed_dict - def _get_config_credentials(self) -> Dict[str, Any]: + def _get_config_credentials(self) -> dict[str, Any]: """Getter for credentials specified in credentials directory.""" try: - conf_creds = self.config_loader.get( - "credentials*", "credentials*/**", "**/credentials*" - ) + conf_creds = self.config_loader["credentials"] except MissingConfigException as exc: - warn(f"Credentials not found in your Kedro project config.\n{str(exc)}") + logging.getLogger(__name__).debug( + "Credentials not found in your Kedro project config.\n %s", str(exc) + ) conf_creds = {} return conf_creds - @property - def run_id(self) -> Union[None, str]: - """Unique identifier for a run / journal record, defaults to None. - If `run_id` is None, `save_version` will be used instead. - """ - return self._get_run_id() - - def run( # pylint: disable=too-many-arguments,too-many-locals - self, - tags: Iterable[str] = None, - runner: AbstractRunner = None, - node_names: Iterable[str] = None, - from_nodes: Iterable[str] = None, - to_nodes: Iterable[str] = None, - from_inputs: Iterable[str] = None, - to_outputs: Iterable[str] = None, - load_versions: Dict[str, str] = None, - pipeline_name: str = None, - ) -> Dict[str, Any]: - """Runs the pipeline with a specified runner. - - Args: - tags: An optional list of node tags which should be used to - filter the nodes of the ``Pipeline``. If specified, only the nodes - containing *any* of these tags will be run. - runner: An optional parameter specifying the runner that you want to run - the pipeline with. - node_names: An optional list of node names which should be used to - filter the nodes of the ``Pipeline``. If specified, only the nodes - with these names will be run. - from_nodes: An optional list of node names which should be used as a - starting point of the new ``Pipeline``. - to_nodes: An optional list of node names which should be used as an - end point of the new ``Pipeline``. - from_inputs: An optional list of input datasets which should be used as a - starting point of the new ``Pipeline``. - to_outputs: An optional list of output datasets which should be used as an - end point of the new ``Pipeline``. - load_versions: An optional flag to specify a particular dataset version timestamp - to load. - pipeline_name: Name of the ``Pipeline`` to execute. - Defaults to "__default__". - Raises: - KedroContextError: If the resulting ``Pipeline`` is empty - or incorrect tags are provided. - Exception: Any uncaught exception will be re-raised - after being passed to``on_pipeline_error``. - Returns: - Any node outputs that cannot be processed by the ``DataCatalog``. - These are returned in a dictionary, where the keys are defined - by the node outputs. 
- """ - warn( - "`kedro.framework.context.KedroContext.run` is now deprecated in favour of " - "`KedroSession.run` and will be removed in Kedro 0.18.0.", - DeprecationWarning, - ) - # Report project name - logging.info("** Kedro project %s", self.project_path.name) - - name = pipeline_name or "__default__" - - try: - pipeline = pipelines[name] - except KeyError as exc: - raise KedroContextError( - f"Failed to find the pipeline named '{name}'. " - f"It needs to be generated and returned " - f"by the 'register_pipelines' function." - ) from exc - - filtered_pipeline = pipeline.filter( - tags=tags, - from_nodes=from_nodes, - to_nodes=to_nodes, - node_names=node_names, - from_inputs=from_inputs, - to_outputs=to_outputs, - ) - - save_version = self._get_save_version() - run_id = self.run_id or save_version - - record_data = { - "run_id": run_id, - "project_path": str(self.project_path), - "env": self.env, - "tags": tags, - "from_nodes": from_nodes, - "to_nodes": to_nodes, - "node_names": node_names, - "from_inputs": from_inputs, - "to_outputs": to_outputs, - "load_versions": load_versions, - "pipeline_name": pipeline_name, - "extra_params": self._extra_params, - } - journal = Journal(record_data) - - catalog = self._get_catalog( - save_version=save_version, journal=journal, load_versions=load_versions - ) - - # Run the runner - runner = runner or SequentialRunner() - hook_manager = get_hook_manager() - hook_manager.hook.before_pipeline_run( # pylint: disable=no-member - run_params=record_data, pipeline=filtered_pipeline, catalog=catalog - ) - - try: - run_result = runner.run(filtered_pipeline, catalog, run_id) - except Exception as exc: - hook_manager.hook.on_pipeline_error( # pylint: disable=no-member - error=exc, - run_params=record_data, - pipeline=filtered_pipeline, - catalog=catalog, - ) - raise exc - - hook_manager.hook.after_pipeline_run( # pylint: disable=no-member - run_params=record_data, - run_result=run_result, - pipeline=filtered_pipeline, - catalog=catalog, - ) - return run_result - - def _get_run_id( # pylint: disable=no-self-use - self, *args, **kwargs # pylint: disable=unused-argument - ) -> Union[None, str]: - """A hook for generating a unique identifier for a - run / journal record, defaults to None. - If None, `save_version` will be used instead. - """ - return None - - def _get_save_version( # pylint: disable=no-self-use - self, *args, **kwargs # pylint: disable=unused-argument - ) -> str: - """Generate unique ID for dataset versioning, defaults to timestamp. - `save_version` MUST be something that can be ordered, in order to - easily determine the latest version. - """ - return generate_timestamp() - class KedroContextError(Exception): """Error occurred when loading project and running context pipeline.""" diff --git a/kedro/framework/hooks/__init__.py b/kedro/framework/hooks/__init__.py index f0fcf4aaba..8ce7a9b695 100644 --- a/kedro/framework/hooks/__init__.py +++ b/kedro/framework/hooks/__init__.py @@ -1,30 +1,5 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """``kedro.framework.hooks`` provides primitives to use hooks to extend KedroContext's behaviour""" -from .manager import get_hook_manager # NOQA -from .markers import hook_impl # NOQA +from .manager import _create_hook_manager +from .markers import hook_impl + +__all__ = ["_create_hook_manager", "hook_impl"] diff --git a/kedro/framework/hooks/manager.py b/kedro/framework/hooks/manager.py index 001fff87cd..13a8e5a8b2 100644 --- a/kedro/framework/hooks/manager.py +++ b/kedro/framework/hooks/manager.py @@ -1,34 +1,6 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """This module provides an utility function to retrieve the global hook_manager singleton in a Kedro's execution process. 
""" -# pylint: disable=global-statement,invalid-name import logging from typing import Any, Iterable @@ -38,35 +10,29 @@ from .specs import ( DataCatalogSpecs, DatasetSpecs, + KedroContextSpecs, NodeSpecs, PipelineSpecs, - RegistrationSpecs, ) -_hook_manager = None - _PLUGIN_HOOKS = "kedro.hooks" # entry-point to load hooks from for installed plugins +logger = logging.getLogger(__name__) + def _create_hook_manager() -> PluginManager: """Create a new PluginManager instance and register Kedro's hook specs.""" manager = PluginManager(HOOK_NAMESPACE) + manager.trace.root.setwriter(logger.debug) + manager.enable_tracing() manager.add_hookspecs(NodeSpecs) manager.add_hookspecs(PipelineSpecs) manager.add_hookspecs(DataCatalogSpecs) - manager.add_hookspecs(RegistrationSpecs) manager.add_hookspecs(DatasetSpecs) + manager.add_hookspecs(KedroContextSpecs) return manager -def get_hook_manager(): - """Create or return the global _hook_manager singleton instance.""" - global _hook_manager - if _hook_manager is None: - _hook_manager = _create_hook_manager() - return _hook_manager - - def _register_hooks(hook_manager: PluginManager, hooks: Iterable[Any]) -> None: """Register all hooks as specified in ``hooks`` with the global ``hook_manager``. @@ -96,10 +62,13 @@ def _register_hooks_setuptools( """ already_registered = hook_manager.get_plugins() - found = hook_manager.load_setuptools_entrypoints(_PLUGIN_HOOKS) + # Method name is misleading: + # entry points are standard and don't require setuptools, + # see https://packaging.python.org/en/latest/specifications/entry-points/ + hook_manager.load_setuptools_entrypoints(_PLUGIN_HOOKS) disabled_plugins = set(disabled_plugins) - # Get list of plugin/distinfo tuples for all setuptools registered plugins. + # Get list of plugin/distinfo tuples for all registered plugins. plugininfo = hook_manager.list_plugin_distinfo() plugin_names = set() disabled_plugin_names = set() @@ -110,20 +79,33 @@ def _register_hooks_setuptools( # name and not `entry_point` name. Also, we log project names with # version for which hooks were registered. hook_manager.unregister(plugin=plugin) - found -= 1 disabled_plugin_names.add(f"{dist.project_name}-{dist.version}") elif plugin not in already_registered: plugin_names.add(f"{dist.project_name}-{dist.version}") if disabled_plugin_names: - logging.info( + logger.debug( "Hooks are disabled for plugin(s): %s", ", ".join(sorted(disabled_plugin_names)), ) if plugin_names: - logging.info( + logger.debug( "Registered hooks from %d installed plugin(s): %s", - found, + len(plugin_names), ", ".join(sorted(plugin_names)), ) + + +class _NullPluginManager: + """This class creates an empty ``hook_manager`` that will ignore all calls to hooks, + allowing the runner to function if no ``hook_manager`` has been instantiated.""" + + def __init__(self, *args, **kwargs): + pass + + def __getattr__(self, name): + return self + + def __call__(self, *args, **kwargs): + pass diff --git a/kedro/framework/hooks/markers.py b/kedro/framework/hooks/markers.py index cc9d2bf207..b45638ebed 100644 --- a/kedro/framework/hooks/markers.py +++ b/kedro/framework/hooks/markers.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """This module provides markers to declare Kedro's hook specs and implementations. For more information, please see [Pluggy's documentation](https://pluggy.readthedocs.io/en/stable/#marking-hooks). diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index 9c1fc3e55b..aa10ab7276 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -1,41 +1,15 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """A module containing specifications for all callable hooks in the Kedro's execution timeline. 
For more information about these specifications, please visit [Pluggy's documentation](https://pluggy.readthedocs.io/en/stable/#specs) """ -from typing import Any, Dict, Optional +from __future__ import annotations + +from typing import Any +from kedro.framework.context import KedroContext from kedro.io import DataCatalog from kedro.pipeline import Pipeline from kedro.pipeline.node import Node -from kedro.versioning import Journal from .markers import hook_spec @@ -44,15 +18,14 @@ class DataCatalogSpecs: """Namespace that defines all specifications for a data catalog's lifecycle hooks.""" @hook_spec - def after_catalog_created( # pylint: disable=too-many-arguments + def after_catalog_created( # noqa: too-many-arguments self, catalog: DataCatalog, - conf_catalog: Dict[str, Any], - conf_creds: Dict[str, Any], - feed_dict: Dict[str, Any], + conf_catalog: dict[str, Any], + conf_creds: dict[str, Any], + feed_dict: dict[str, Any], save_version: str, - load_versions: Dict[str, str], - run_id: str, + load_versions: dict[str, str], ) -> None: """Hooks to be invoked after a data catalog is created. It receives the ``catalog`` as well as @@ -67,7 +40,6 @@ def after_catalog_created( # pylint: disable=too-many-arguments for all datasets in the catalog. load_versions: The load_versions used in ``load`` operations for each dataset in the catalog. - run_id: The id of the run for which the catalog is loaded. """ pass @@ -76,14 +48,14 @@ class NodeSpecs: """Namespace that defines all specifications for a node's lifecycle hooks.""" @hook_spec - def before_node_run( # pylint: disable=too-many-arguments + def before_node_run( # noqa: too-many-arguments self, node: Node, catalog: DataCatalog, - inputs: Dict[str, Any], + inputs: dict[str, Any], is_async: bool, - run_id: str, - ) -> Optional[Dict[str, Any]]: + session_id: str, + ) -> dict[str, Any] | None: """Hook to be invoked before a node runs. The arguments received are the same as those used by ``kedro.runner.run_node`` @@ -94,7 +66,7 @@ def before_node_run( # pylint: disable=too-many-arguments The keys are dataset names and the values are the actual loaded input data, not the dataset instance. is_async: Whether the node was run in ``async`` mode. - run_id: The id of the run. + session_id: The id of the session. Returns: Either None or a dictionary mapping dataset name(s) to new value(s). @@ -104,14 +76,14 @@ def before_node_run( # pylint: disable=too-many-arguments pass @hook_spec - def after_node_run( # pylint: disable=too-many-arguments + def after_node_run( # noqa: too-many-arguments self, node: Node, catalog: DataCatalog, - inputs: Dict[str, Any], - outputs: Dict[str, Any], + inputs: dict[str, Any], + outputs: dict[str, Any], is_async: bool, - run_id: str, + session_id: str, ) -> None: """Hook to be invoked after a node runs. The arguments received are the same as those used by ``kedro.runner.run_node`` @@ -127,19 +99,19 @@ def after_node_run( # pylint: disable=too-many-arguments The keys are dataset names and the values are the actual computed output data, not the dataset instance. is_async: Whether the node was run in ``async`` mode. - run_id: The id of the run. + session_id: The id of the session. """ pass @hook_spec - def on_node_error( # pylint: disable=too-many-arguments + def on_node_error( # noqa: too-many-arguments self, error: Exception, node: Node, catalog: DataCatalog, - inputs: Dict[str, Any], + inputs: dict[str, Any], is_async: bool, - run_id: str, + session_id: str, ): """Hook to be invoked if a node run throws an uncaught error. 
The signature of this error hook should match the signature of ``before_node_run`` @@ -153,7 +125,7 @@ def on_node_error( # pylint: disable=too-many-arguments The keys are dataset names and the values are the actual loaded input data, not the dataset instance. is_async: Whether the node was run in ``async`` mode. - run_id: The id of the run. + session_id: The id of the session. """ pass @@ -163,16 +135,16 @@ class PipelineSpecs: @hook_spec def before_pipeline_run( - self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog + self, run_params: dict[str, Any], pipeline: Pipeline, catalog: DataCatalog ) -> None: """Hook to be invoked before a pipeline runs. Args: run_params: The params used to run the pipeline. - Should be identical to the data logged by Journal with the following schema:: + Should have the following schema:: { - "run_id": str + "session_id": str "project_path": str, "env": str, "kedro_version": str, @@ -183,8 +155,10 @@ def before_pipeline_run( "from_inputs": Optional[List[str]], "to_outputs": Optional[List[str]], "load_versions": Optional[List[str]], - "pipeline_name": str, "extra_params": Optional[Dict[str, Any]] + "pipeline_name": str, + "namespace": Optional[str], + "runner": str, } pipeline: The ``Pipeline`` that will be run. @@ -195,8 +169,8 @@ def before_pipeline_run( @hook_spec def after_pipeline_run( self, - run_params: Dict[str, Any], - run_result: Dict[str, Any], + run_params: dict[str, Any], + run_result: dict[str, Any], pipeline: Pipeline, catalog: DataCatalog, ) -> None: @@ -204,10 +178,10 @@ def after_pipeline_run( Args: run_params: The params used to run the pipeline. - Should be identical to the data logged by Journal with the following schema:: + Should have the following schema:: { - "run_id": str + "session_id": str "project_path": str, "env": str, "kedro_version": str, @@ -218,8 +192,10 @@ def after_pipeline_run( "from_inputs": Optional[List[str]], "to_outputs": Optional[List[str]], "load_versions": Optional[List[str]], - "pipeline_name": str, "extra_params": Optional[Dict[str, Any]] + "pipeline_name": str, + "namespace": Optional[str], + "runner": str, } run_result: The output of ``Pipeline`` run. @@ -232,7 +208,7 @@ def after_pipeline_run( def on_pipeline_error( self, error: Exception, - run_params: Dict[str, Any], + run_params: dict[str, Any], pipeline: Pipeline, catalog: DataCatalog, ): @@ -243,10 +219,10 @@ def on_pipeline_error( Args: error: The uncaught exception thrown during the pipeline run. run_params: The params used to run the pipeline. - Should be identical to the data logged by Journal with the following schema:: + Should have the following schema:: { - "run_id": str + "session_id": str "project_path": str, "env": str, "kedro_version": str, @@ -257,9 +233,12 @@ def on_pipeline_error( "from_inputs": Optional[List[str]], "to_outputs": Optional[List[str]], "load_versions": Optional[List[str]], - "pipeline_name": str, "extra_params": Optional[Dict[str, Any]] + "pipeline_name": str, + "namespace": Optional[str], + "runner": str, } + pipeline: The ``Pipeline`` that will was run. catalog: The ``DataCatalog`` used during the run. """ @@ -270,75 +249,61 @@ class DatasetSpecs: """Namespace that defines all specifications for a dataset's lifecycle hooks.""" @hook_spec - def before_dataset_loaded(self, dataset_name: str) -> None: + def before_dataset_loaded(self, dataset_name: str, node: Node) -> None: """Hook to be invoked before a dataset is loaded from the catalog. 
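# A hedged sketch of how a project-side plugin might implement the node-lifecycle specs
# above. The class and logic are invented; only the spec names and the `session_id`
# argument come from this file. Returning a dict from `before_node_run` overrides inputs.
from kedro.framework.hooks import hook_impl

class TimingHooks:
    @hook_impl
    def before_node_run(self, node, catalog, inputs, is_async, session_id):
        print(f"[{session_id}] starting {node.name} with inputs {list(inputs)}")
        return None  # or e.g. {"some_dataset": replacement_value} to override an input

    @hook_impl
    def after_node_run(self, node, outputs, session_id):
        print(f"[{session_id}] finished {node.name} -> {list(outputs)}")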
Args: dataset_name: name of the dataset to be loaded from the catalog. - + node: The ``Node`` to run. """ pass @hook_spec - def after_dataset_loaded(self, dataset_name: str, data: Any) -> None: + def after_dataset_loaded(self, dataset_name: str, data: Any, node: Node) -> None: """Hook to be invoked after a dataset is loaded from the catalog. Args: dataset_name: name of the dataset that was loaded from the catalog. data: the actual data that was loaded from the catalog. - + node: The ``Node`` to run. """ pass @hook_spec - def before_dataset_saved(self, dataset_name: str, data: Any) -> None: + def before_dataset_saved(self, dataset_name: str, data: Any, node: Node) -> None: """Hook to be invoked before a dataset is saved to the catalog. Args: dataset_name: name of the dataset to be saved to the catalog. data: the actual data to be saved to the catalog. - + node: The ``Node`` that ran. """ pass @hook_spec - def after_dataset_saved(self, dataset_name: str, data: Any) -> None: + def after_dataset_saved(self, dataset_name: str, data: Any, node: Node) -> None: """Hook to be invoked after a dataset is saved in the catalog. Args: dataset_name: name of the dataset that was saved to the catalog. data: the actual data that was saved to the catalog. + node: The ``Node`` that ran. """ pass -class RegistrationSpecs: - """Namespace that defines all specifications for hooks registering - library components with a Kedro project. - """ +class KedroContextSpecs: + """Namespace that defines all specifications for a Kedro context's lifecycle hooks.""" @hook_spec - def register_pipelines(self) -> Dict[str, Pipeline]: - """Hook to be invoked to register a project's pipelines. - - Returns: - A mapping from a pipeline name to a ``Pipeline`` object. - - """ - pass - - @hook_spec(firstresult=True) - def register_catalog( # pylint: disable=too-many-arguments + def after_context_created( self, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: - """Hook to be invoked to register a project's data catalog. + context: KedroContext, + ) -> None: + """Hooks to be invoked after a `KedroContext` is created. This is the earliest + hook triggered within a Kedro run. The `KedroContext` stores useful information + such as `credentials`, `config_loader` and `env`. - Returns: - An instance of a ``DataCatalog``. + Args: + context: The context that was created. """ - pass diff --git a/kedro/framework/project/__init__.py b/kedro/framework/project/__init__.py index c61471d9a3..ea7369cadf 100644 --- a/kedro/framework/project/__init__.py +++ b/kedro/framework/project/__init__.py @@ -1,45 +1,31 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
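# Likewise hypothetical: a plugin hooking into the new `after_context_created` spec
# defined above, which fires as soon as the `KedroContext` exists and exposes
# attributes such as `context.env` and `context.config_loader`.
from kedro.framework.hooks import hook_impl

class ContextLoggingHooks:
    @hook_impl
    def after_context_created(self, context):
        print(f"Kedro context created for env: {context.env!r}")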
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """``kedro.framework.project`` module provides utitlity to configure a Kedro project and access its settings.""" -# pylint: disable=redefined-outer-name,unused-argument,global-statement +# noqa: redefined-outer-name,unused-argument,global-statement +from __future__ import annotations + import importlib +import logging.config import operator +import os +import traceback +import types +import warnings +from collections import UserDict from collections.abc import MutableMapping -from typing import Dict, Optional -from warnings import warn +from pathlib import Path +from typing import Any +import importlib_resources +import yaml from dynaconf import LazySettings from dynaconf.validator import ValidationError, Validator -from kedro.framework.hooks import get_hook_manager -from kedro.framework.hooks.manager import _register_hooks, _register_hooks_setuptools -from kedro.pipeline import Pipeline +from kedro.pipeline import Pipeline, pipeline + +IMPORT_ERROR_MESSAGE = ( + "An error occurred while importing the '{module}' module. Nothing " + "defined therein will be returned by 'find_pipelines'.\n\n{tb_exc}" +) def _get_default_class(class_import_path): @@ -54,17 +40,46 @@ def validator_func(settings, validators): class _IsSubclassValidator(Validator): """A validator to check if the supplied setting value is a subclass of the default class""" - def _validate_items(self, settings, env=None): - super()._validate_items(settings, env) + def validate(self, settings, *args, **kwargs): + super().validate(settings, *args, **kwargs) default_class = self.default(settings, self) for name in self.names: setting_value = getattr(settings, name) if not issubclass(setting_value, default_class): raise ValidationError( - f"Invalid value `{setting_value.__module__}.{setting_value.__qualname__}` " - f"received for setting `{name}`. It must be a subclass of " - f"`{default_class.__module__}.{default_class.__qualname__}`." + f"Invalid value '{setting_value.__module__}.{setting_value.__qualname__}' " + f"received for setting '{name}'. It must be a subclass of " + f"'{default_class.__module__}.{default_class.__qualname__}'." + ) + + +class _HasSharedParentClassValidator(Validator): + """A validator to check that the parent of the default class is an ancestor of + the settings value.""" + + def validate(self, settings, *args, **kwargs): + super().validate(settings, *args, **kwargs) + + default_class = self.default(settings, self) + for name in self.names: + setting_value = getattr(settings, name) + # In the case of ConfigLoader, default_class.mro() will be: + # [kedro.config.config.ConfigLoader, + # kedro.config.abstract_config.AbstractConfigLoader, + # abc.ABC, + # object] + # We pick out the direct parent and check if it's in any of the ancestors of + # the supplied setting_value. 
This assumes that the direct parent is + # the abstract class that must be inherited from. + # A more general check just for a shared ancestor would be: + # set(default_class.mro()) & set(setting_value.mro()) - {abc.ABC, object} + default_class_parent = default_class.mro()[1] + if default_class_parent not in setting_value.mro(): + raise ValidationError( + f"Invalid value '{setting_value.__module__}.{setting_value.__qualname__}' " + f"received for setting '{name}'. It must be a subclass of " + f"'{default_class_parent.__module__}.{default_class_parent.__qualname__}'." ) @@ -76,7 +91,7 @@ class _ProjectSettings(LazySettings): _CONF_SOURCE = Validator("CONF_SOURCE", default="conf") _HOOKS = Validator("HOOKS", default=tuple()) - _CONTEXT_CLASS = Validator( + _CONTEXT_CLASS = _IsSubclassValidator( "CONTEXT_CLASS", default=_get_default_class("kedro.framework.context.KedroContext"), ) @@ -86,13 +101,15 @@ class _ProjectSettings(LazySettings): ) _SESSION_STORE_ARGS = Validator("SESSION_STORE_ARGS", default={}) _DISABLE_HOOKS_FOR_PLUGINS = Validator("DISABLE_HOOKS_FOR_PLUGINS", default=tuple()) - _CONFIG_LOADER_CLASS = Validator( + _CONFIG_LOADER_CLASS = _HasSharedParentClassValidator( "CONFIG_LOADER_CLASS", default=_get_default_class("kedro.config.ConfigLoader") ) _CONFIG_LOADER_ARGS = Validator("CONFIG_LOADER_ARGS", default={}) + _DATA_CATALOG_CLASS = _IsSubclassValidator( + "DATA_CATALOG_CLASS", default=_get_default_class("kedro.io.DataCatalog") + ) def __init__(self, *args, **kwargs): - kwargs.update( validators=[ self._CONF_SOURCE, @@ -103,6 +120,7 @@ def __init__(self, *args, **kwargs): self._DISABLE_HOOKS_FOR_PLUGINS, self._CONFIG_LOADER_CLASS, self._CONFIG_LOADER_ARGS, + self._DATA_CATALOG_CLASS, ] ) super().__init__(*args, **kwargs) @@ -112,7 +130,8 @@ def _load_data_wrapper(func): """Wrap a method in _ProjectPipelines so that data is loaded on first access. Taking inspiration from dynaconf.utils.functional.new_method_proxy """ - # pylint: disable=protected-access + + # noqa: protected-access def inner(self, *args, **kwargs): self._load_data() return func(self._content, *args, **kwargs) @@ -122,15 +141,25 @@ def inner(self, *args, **kwargs): class _ProjectPipelines(MutableMapping): """A read-only lazy dictionary-like object to hold the project pipelines. - On configure it will store the pipelines module. - On first data access, e.g. through __getitem__, it will load the registered pipelines and merge - them with pipelines defined from hooks. + When configured, it stores the pipelines module. + On first data access, e.g. through __getitem__, it will load the registered pipelines. + + This object is initialized lazily for a few reasons: + + 1. To support a unified way of importing via `from kedro.framework.project import pipelines`. + The pipelines object is initialized lazily since the framework doesn't have knowledge about + the project until `bootstrap_project` is run. + 2. To speed up Kedro CLI performance. Loading the pipelines incurs overhead, as all related + modules need to be imported. + 3. To ensure Kedro CLI remains functional when pipelines are broken. During development, broken + pipelines are common, but they shouldn't prevent other parts of Kedro CLI from functioning + properly (e.g. `kedro -h`).
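# The mro()-based check used by `_HasSharedParentClassValidator` above, reduced to a
# runnable toy example with invented classes: anything inheriting from the default
# class's direct parent is accepted, anything unrelated is rejected.
class AbstractLoader: ...
class DefaultLoader(AbstractLoader): ...
class CustomLoader(AbstractLoader): ...   # accepted: shares the AbstractLoader ancestor
class Unrelated: ...                      # rejected

default_parent = DefaultLoader.mro()[1]      # -> AbstractLoader
print(default_parent in CustomLoader.mro())  # True
print(default_parent in Unrelated.mro())     # False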
""" def __init__(self) -> None: - self._pipelines_module: Optional[str] = None + self._pipelines_module: str | None = None self._is_data_loaded = False - self._content: Dict[str, Pipeline] = {} + self._content: dict[str, Pipeline] = {} @staticmethod def _get_pipelines_registry_callable(pipelines_module: str): @@ -139,56 +168,29 @@ def _get_pipelines_registry_callable(pipelines_module: str): return register_pipelines def _load_data(self): - """Lazily read pipelines defined in the pipelines registry module""" + """Lazily read pipelines defined in the pipelines registry module.""" # If the pipelines dictionary has not been configured with a pipelines module # or if data has been loaded if self._pipelines_module is None or self._is_data_loaded: return - try: - register_pipelines = self._get_pipelines_registry_callable( - self._pipelines_module - ) - except (ModuleNotFoundError, AttributeError) as exc: - # for backwards compatibility with templates < 0.17.2 - # where no pipelines_registry is defined - if self._pipelines_module in str(exc): # pragma: no cover - project_pipelines = {} - else: - raise - else: - project_pipelines = register_pipelines() - - hook_manager = get_hook_manager() - pipelines_dicts = ( - hook_manager.hook.register_pipelines() # pylint: disable=no-member + register_pipelines = self._get_pipelines_registry_callable( + self._pipelines_module ) - for pipeline_collection in pipelines_dicts: - duplicate_keys = pipeline_collection.keys() & project_pipelines.keys() - if duplicate_keys: - warn( - f"Found duplicate pipeline entries. " - f"The following will be overwritten: {', '.join(duplicate_keys)}" - ) - project_pipelines.update(pipeline_collection) + project_pipelines = register_pipelines() self._content = project_pipelines self._is_data_loaded = True - def configure(self, pipelines_module: str) -> None: + def configure(self, pipelines_module: str | None = None) -> None: """Configure the pipelines_module to load the pipelines dictionary. - Reset the data loading state so that after every `configure` call, + Reset the data loading state so that after every ``configure`` call, data are reloaded. """ - self._clear(pipelines_module) - - def _clear(self, pipelines_module: str) -> None: - """Helper method to clear the pipelines so new content will be reloaded - next time data is accessed. Useful for testing purpose. - """ - self._is_data_loaded = False self._pipelines_module = pipelines_module + self._is_data_loaded = False + self._content = {} # Dict-like interface __getitem__ = _load_data_wrapper(operator.getitem) @@ -196,45 +198,183 @@ def _clear(self, pipelines_module: str) -> None: __delitem__ = _load_data_wrapper(operator.delitem) __iter__ = _load_data_wrapper(iter) __len__ = _load_data_wrapper(len) + keys = _load_data_wrapper(operator.methodcaller("keys")) + values = _load_data_wrapper(operator.methodcaller("values")) + items = _load_data_wrapper(operator.methodcaller("items")) # Presentation methods __repr__ = _load_data_wrapper(repr) __str__ = _load_data_wrapper(str) +class _ProjectLogging(UserDict): + # noqa: super-init-not-called + def __init__(self): + """Initialise project logging. 
The path to logging configuration is given in + environment variable KEDRO_LOGGING_CONFIG (defaults to default_logging.yml).""" + path = os.environ.get( + "KEDRO_LOGGING_CONFIG", Path(__file__).parent / "default_logging.yml" + ) + logging_config = Path(path).read_text(encoding="utf-8") + self.configure(yaml.safe_load(logging_config)) + + def configure(self, logging_config: dict[str, Any]) -> None: + """Configure project logging using ``logging_config`` (e.g. from project + logging.yml). We store this in the UserDict data so that it can be reconfigured + in _bootstrap_subprocess. + """ + logging.config.dictConfig(logging_config) + self.data = logging_config + + def set_project_logging(self, package_name: str): + """Add the project level logging to the loggers upon provision of a package name. + Checks if project logger already exists to prevent overwriting, if none exists + it defaults to setting project logs at INFO level.""" + if package_name not in self.data["loggers"]: + self.data["loggers"][package_name] = {"level": "INFO"} + self.configure(self.data) + + PACKAGE_NAME = None +LOGGING = _ProjectLogging() settings = _ProjectSettings() pipelines = _ProjectPipelines() -def _validate_module(settings_module): - """Eagerly validate that the module is importable. - This ensures that the settings module is syntactically - correct so that any import errors are surfaced early. - """ - importlib.import_module(settings_module) - - def configure_project(package_name: str): """Configure a Kedro project by populating its settings with values defined in user's settings.py and pipeline_registry.py. """ settings_module = f"{package_name}.settings" - _validate_module(settings_module) settings.configure(settings_module) - # set up all hooks so we can discover all pipelines - hook_manager = get_hook_manager() - _register_hooks(hook_manager, settings.HOOKS) - _register_hooks_setuptools(hook_manager, settings.DISABLE_HOOKS_FOR_PLUGINS) - pipelines_module = f"{package_name}.pipeline_registry" pipelines.configure(pipelines_module) # Once the project is successfully configured once, store PACKAGE_NAME as a - # global variable to make it easily accessible. This is used by ParallelRunner on - # Windows, as package_name is required every time a new subprocess is spawned. - global PACKAGE_NAME + # global variable to make it easily accessible. This is used by validate_settings() + # below, and also by ParallelRunner on Windows, as package_name is required every + # time a new subprocess is spawned. + global PACKAGE_NAME # noqa: PLW0603 PACKAGE_NAME = package_name + + if PACKAGE_NAME: + LOGGING.set_project_logging(PACKAGE_NAME) + + +def configure_logging(logging_config: dict[str, Any]) -> None: + """Configure logging according to ``logging_config`` dictionary.""" + LOGGING.configure(logging_config) + + +def validate_settings(): + """Eagerly validate that the settings module is importable. This is desirable to + surface any syntax or import errors early. In particular, without eagerly importing + the settings module, dynaconf would silence any import error (e.g. missing + dependency, missing/mislabelled pipeline), and users would instead get a cryptic + error message ``Expected an instance of `ConfigLoader`, got `NoneType` instead``. + More info on the dynaconf issue: https://github.com/rochacbruno/dynaconf/issues/460 + """ + if PACKAGE_NAME is None: + raise ValueError( + "Package name not found. Make sure you have configured the project using " + "'bootstrap_project'. 
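A minimal sketch of reconfiguring logging at runtime through the configure_logging helper added above; the handler and level choices are illustrative. Pointing the KEDRO_LOGGING_CONFIG environment variable at a YAML file achieves the same before Kedro starts.

# Sketch: pass a dictConfig-style mapping to configure_logging(), mirroring
# what _ProjectLogging.configure() does internally.
from kedro.framework.project import LOGGING, configure_logging

my_logging = {
    "version": 1,
    "disable_existing_loggers": False,
    "handlers": {"console": {"class": "logging.StreamHandler"}},
    "loggers": {"kedro": {"level": "DEBUG"}},
    "root": {"handlers": ["console"]},
}
configure_logging(my_logging)
assert LOGGING["loggers"]["kedro"]["level"] == "DEBUG"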
This should happen automatically if you are using " + "Kedro command line interface." + ) + + importlib.import_module(f"{PACKAGE_NAME}.settings") + + +def _create_pipeline(pipeline_module: types.ModuleType) -> Pipeline | None: + if not hasattr(pipeline_module, "create_pipeline"): + warnings.warn( + f"The '{pipeline_module.__name__}' module does not " + f"expose a 'create_pipeline' function, so no pipelines " + f"defined therein will be returned by 'find_pipelines'." + ) + return None + + obj = getattr(pipeline_module, "create_pipeline")() + if not isinstance(obj, Pipeline): + warnings.warn( + f"Expected the 'create_pipeline' function in the " + f"'{pipeline_module.__name__}' module to return a " + f"'Pipeline' object, got '{type(obj).__name__}' " + f"instead. Nothing defined therein will be returned by " + f"'find_pipelines'." + ) + return None + + return obj + + +def find_pipelines() -> dict[str, Pipeline]: # noqa: PLR0912 + """Automatically find modular pipelines having a ``create_pipeline`` + function. By default, projects created using Kedro 0.18.3 and higher + call this function to autoregister pipelines upon creation/addition. + + Projects that require more fine-grained control can still define the + pipeline registry without calling this function. Alternatively, they + can modify the mapping generated by the ``find_pipelines`` function. + + For more information on the pipeline registry and autodiscovery, see + https://kedro.readthedocs.io/en/latest/nodes_and_pipelines/pipeline_registry.html + + Returns: + A generated mapping from pipeline names to ``Pipeline`` objects. + + Warns: + UserWarning: When a module does not expose a ``create_pipeline`` + function, the ``create_pipeline`` function does not return a + ``Pipeline`` object, or if the module import fails up front. + """ + pipeline_obj = None + + # Handle the simplified project structure found in several starters. + pipeline_module_name = f"{PACKAGE_NAME}.pipeline" + try: + pipeline_module = importlib.import_module(pipeline_module_name) + except Exception as exc: # noqa: broad-except + if str(exc) != f"No module named '{pipeline_module_name}'": + warnings.warn( + IMPORT_ERROR_MESSAGE.format( + module=pipeline_module_name, tb_exc=traceback.format_exc() + ) + ) + else: + pipeline_obj = _create_pipeline(pipeline_module) + + pipelines_dict = {"__default__": pipeline_obj or pipeline([])} + + # Handle the case that a project doesn't have a pipelines directory. 
+ try: + pipelines_package = importlib_resources.files(f"{PACKAGE_NAME}.pipelines") + except ModuleNotFoundError as exc: + if str(exc) == f"No module named '{PACKAGE_NAME}.pipelines'": + return pipelines_dict + + for pipeline_dir in pipelines_package.iterdir(): + if not pipeline_dir.is_dir(): + continue + + pipeline_name = pipeline_dir.name + if pipeline_name == "__pycache__": + continue + + pipeline_module_name = f"{PACKAGE_NAME}.pipelines.{pipeline_name}" + try: + pipeline_module = importlib.import_module(pipeline_module_name) + except: # noqa: bare-except # noqa: E722 + warnings.warn( + IMPORT_ERROR_MESSAGE.format( + module=pipeline_module_name, tb_exc=traceback.format_exc() + ) + ) + continue + + pipeline_obj = _create_pipeline(pipeline_module) + if pipeline_obj is not None: + pipelines_dict[pipeline_name] = pipeline_obj + return pipelines_dict diff --git a/kedro/framework/project/default_logging.yml b/kedro/framework/project/default_logging.yml new file mode 100644 index 0000000000..87fae8a25c --- /dev/null +++ b/kedro/framework/project/default_logging.yml @@ -0,0 +1,18 @@ +version: 1 + +disable_existing_loggers: False + +handlers: + rich: + class: kedro.logging.RichHandler + rich_tracebacks: True + # Advance options for customisation. + # See https://docs.kedro.org/en/stable/logging/logging.html#project-side-logging-configuration + # tracebacks_show_locals: False + +loggers: + kedro: + level: INFO + +root: + handlers: [rich] diff --git a/kedro/framework/session/__init__.py b/kedro/framework/session/__init__.py index d740abdb73..b195660d3c 100644 --- a/kedro/framework/session/__init__.py +++ b/kedro/framework/session/__init__.py @@ -1,32 +1,6 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.framework.session`` provides access to KedroSession responsible for project lifecycle. 
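For context, find_pipelines() discovers any module under <package_name>.pipelines (or the top-level <package_name>.pipeline module) that exposes a create_pipeline function returning a Pipeline. A sketch of such a module, with hypothetical node and dataset names, e.g. placed at <package_name>/pipelines/data_processing/__init__.py:

# Sketch of a pipeline module discoverable by find_pipelines().
from kedro.pipeline import Pipeline, node, pipeline


def _clean(raw):
    return raw  # placeholder node function


def create_pipeline(**kwargs) -> Pipeline:
    return pipeline(
        [node(_clean, inputs="raw_data", outputs="clean_data", name="clean")]
    )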
""" -from .session import KedroSession, get_current_session # NOQA +from .session import KedroSession + +__all__ = ["KedroSession"] diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py index 2a8fee6d2a..f076b01e3c 100644 --- a/kedro/framework/session/session.py +++ b/kedro/framework/session/session.py @@ -1,169 +1,141 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=invalid-name,global-statement """This module implements Kedro session responsible for project lifecycle.""" +from __future__ import annotations +import getpass import logging import logging.config import os import subprocess +import sys import traceback from copy import deepcopy from pathlib import Path -from typing import Any, Dict, Iterable, Optional, Union +from typing import Any, Iterable import click from kedro import __version__ as kedro_version +from kedro.config import ConfigLoader from kedro.framework.context import KedroContext -from kedro.framework.context.context import _convert_paths_to_absolute_posix -from kedro.framework.hooks import get_hook_manager -from kedro.framework.project import configure_project, pipelines, settings +from kedro.framework.hooks import _create_hook_manager +from kedro.framework.hooks.manager import _register_hooks, _register_hooks_setuptools +from kedro.framework.project import pipelines, settings, validate_settings from kedro.framework.session.store import BaseSessionStore from kedro.io.core import generate_timestamp from kedro.runner import AbstractRunner, SequentialRunner -_active_session = None - -def get_current_session(silent: bool = False) -> Optional["KedroSession"]: - """Fetch the active ``KedroSession`` instance. - - Args: - silent: Indicates to suppress the error if no active session was found. - - Raises: - RuntimeError: If no active session was found and `silent` is False. - - Returns: - KedroSession instance. 
- - """ - if not _active_session and not silent: - raise RuntimeError("There is no active Kedro session.") - - return _active_session - - -def _activate_session(session: "KedroSession", force: bool = False) -> None: - global _active_session - - if _active_session and not force and session is not _active_session: - raise RuntimeError( - "Cannot activate the session as another active session already exists." - ) - - _active_session = session - - -def _deactivate_session() -> None: - global _active_session - _active_session = None - - -def _describe_git(project_path: Path) -> Dict[str, Dict[str, Any]]: +def _describe_git(project_path: Path) -> dict[str, dict[str, Any]]: project_path = str(project_path) - try: res = subprocess.check_output( - ["git", "rev-parse", "--short", "HEAD"], cwd=project_path + ["git", "rev-parse", "--short", "HEAD"], + cwd=project_path, + stderr=subprocess.STDOUT, + ) + git_data: dict[str, Any] = {"commit_sha": res.decode().strip()} + git_status_res = subprocess.check_output( + ["git", "status", "--short"], + cwd=project_path, + stderr=subprocess.STDOUT, ) + git_data["dirty"] = bool(git_status_res.decode().strip()) + # `subprocess.check_output()` raises `NotADirectoryError` on Windows - except (subprocess.CalledProcessError, FileNotFoundError, NotADirectoryError): - logging.getLogger(__name__).warning("Unable to git describe %s", project_path) + except Exception: # noqa: broad-except + logger = logging.getLogger(__name__) + logger.debug("Unable to git describe %s", project_path) + logger.debug(traceback.format_exc()) return {} - git_data = {"commit_sha": res.decode().strip()} # type: Dict[str, Any] - - res = subprocess.check_output(["git", "status", "--short"], cwd=project_path) - git_data["dirty"] = bool(res.decode().strip()) - return {"git": git_data} -def _jsonify_cli_context(ctx: click.core.Context) -> Dict[str, Any]: +def _jsonify_cli_context(ctx: click.core.Context) -> dict[str, Any]: return { "args": ctx.args, "params": ctx.params, "command_name": ctx.command.name, - "command_path": ctx.command_path, + "command_path": " ".join(["kedro"] + sys.argv[1:]), } +class KedroSessionError(Exception): + """``KedroSessionError`` raised by ``KedroSession`` + in the case that multiple runs are attempted in one session. + """ + + pass + + +# noqa: too-many-instance-attributes class KedroSession: """``KedroSession`` is the object that is responsible for managing the lifecycle - of a Kedro run. - - Use `KedroSession.create("")` as + of a Kedro run. Use `KedroSession.create()` as a context manager to construct a new KedroSession with session data provided (see the example below). - - Use `KedroSession(session_id=)` to instantiate an existing session with a given - ID. + + Example: :: >>> from kedro.framework.session import KedroSession - >>> - >>> with KedroSession.create("") as session: + >>> from kedro.framework.startup import bootstrap_project + >>> from pathlib import Path + + >>> # If you are creating a session outside of a Kedro project (i.e. not using + >>> # `kedro run` or `kedro jupyter`), you need to run `bootstrap_project` to + >>> # let Kedro find your configuration. 
+ >>> bootstrap_project(Path("")) + >>> with KedroSession.create() as session: >>> session.run() - >>> + """ - def __init__( + def __init__( # noqa: too-many-arguments self, session_id: str, package_name: str = None, - project_path: Union[Path, str] = None, + project_path: Path | str | None = None, save_on_close: bool = False, + conf_source: str | None = None, ): self._project_path = Path(project_path or Path.cwd()).resolve() self.session_id = session_id self.save_on_close = save_on_close self._package_name = package_name self._store = self._init_store() + self._run_called = False + + hook_manager = _create_hook_manager() + _register_hooks(hook_manager, settings.HOOKS) + _register_hooks_setuptools(hook_manager, settings.DISABLE_HOOKS_FOR_PLUGINS) + self._hook_manager = hook_manager + + self._conf_source = conf_source or str( + self._project_path / settings.CONF_SOURCE + ) @classmethod - def create( # pylint: disable=too-many-arguments + def create( # noqa: too-many-arguments cls, package_name: str = None, - project_path: Union[Path, str] = None, + project_path: Path | str | None = None, save_on_close: bool = True, env: str = None, - extra_params: Dict[str, Any] = None, - ) -> "KedroSession": + extra_params: dict[str, Any] = None, + conf_source: str | None = None, + ) -> KedroSession: """Create a new instance of ``KedroSession`` with the session data. Args: package_name: Package name for the Kedro project the session is - created for. + created for. The package_name argument will be removed in Kedro `0.19.0`. project_path: Path to the project root directory. Default is current working directory Path.cwd(). save_on_close: Whether or not to save the session when it's closed. + conf_source: Path to a directory containing configuration env: Environment for the KedroContext. extra_params: Optional dictionary containing extra project parameters for underlying KedroContext. If specified, will update (and therefore @@ -173,28 +145,22 @@ def create( # pylint: disable=too-many-arguments Returns: A new ``KedroSession`` instance. """ - - # this is to make sure that for workflows that manually create session - # without going through one of our known entrypoints, e.g. some plugins like kedro-airflow, - # the project is still properly configured. This is for backward compatibility - # and should be removed in 0.18. - if package_name is not None: - configure_project(package_name) + validate_settings() session = cls( package_name=package_name, project_path=project_path, session_id=generate_timestamp(), save_on_close=save_on_close, + conf_source=conf_source, ) # have to explicitly type session_data otherwise mypy will complain # possibly related to this: https://github.com/python/mypy/issues/1430 - session_data: Dict[str, Any] = { + session_data: dict[str, Any] = { "package_name": session._package_name, "project_path": session._project_path, "session_id": session.session_id, - **_describe_git(session._project_path), } ctx = click.get_current_context(silent=True) @@ -208,30 +174,18 @@ def create( # pylint: disable=too-many-arguments if extra_params: session_data["extra_params"] = extra_params + try: + session_data["username"] = getpass.getuser() + except Exception as exc: # noqa: broad-except + logging.getLogger(__name__).debug( + "Unable to get username. 
Full exception: %s", exc + ) + + session_data.update(**_describe_git(session._project_path)) session._store.update(session_data) - # we need a ConfigLoader registered in order to be able to set up logging - session._setup_logging() return session - def _get_logging_config(self) -> Dict[str, Any]: - context = self.load_context() - - conf_logging = context.config_loader.get( - "logging*", "logging*/**", "**/logging*" - ) - # turn relative paths in logging config into absolute path - # before initialising loggers - conf_logging = _convert_paths_to_absolute_posix( - project_path=self._project_path, conf_dictionary=conf_logging - ) - return conf_logging - - def _setup_logging(self) -> None: - """Register logging specified in logging directory.""" - conf_logging = self._get_logging_config() - logging.config.dictConfig(conf_logging) - def _init_store(self) -> BaseSessionStore: store_class = settings.SESSION_STORE_CLASS classpath = f"{store_class.__module__}.{store_class.__qualname__}" @@ -244,11 +198,11 @@ def _init_store(self) -> BaseSessionStore: except TypeError as err: raise ValueError( f"\n{err}.\nStore config must only contain arguments valid " - f"for the constructor of `{classpath}`." + f"for the constructor of '{classpath}'." ) from err except Exception as err: raise ValueError( - f"\n{err}.\nFailed to instantiate session store of type `{classpath}`." + f"\n{err}.\nFailed to instantiate session store of type '{classpath}'." ) from err def _log_exception(self, exc_type, exc_value, exc_tb): @@ -263,7 +217,11 @@ def _log_exception(self, exc_type, exc_value, exc_tb): self._store["exception"] = exc_data @property - def store(self) -> Dict[str, Any]: + def _logger(self) -> logging.Logger: + return logging.getLogger(__name__) + + @property + def store(self) -> dict[str, Any]: """Return a copy of internal store.""" return dict(self._store) @@ -271,16 +229,34 @@ def load_context(self) -> KedroContext: """An instance of the project context.""" env = self.store.get("env") extra_params = self.store.get("extra_params") + config_loader = self._get_config_loader() context_class = settings.CONTEXT_CLASS context = context_class( package_name=self._package_name, project_path=self._project_path, + config_loader=config_loader, env=env, extra_params=extra_params, + hook_manager=self._hook_manager, ) + self._hook_manager.hook.after_context_created(context=context) + return context + def _get_config_loader(self) -> ConfigLoader: + """An instance of the config loader.""" + env = self.store.get("env") + extra_params = self.store.get("extra_params") + + config_loader_class = settings.CONFIG_LOADER_CLASS + return config_loader_class( + conf_source=self._conf_source, + env=env, + runtime_params=extra_params, + **settings.CONFIG_LOADER_ARGS, + ) + def close(self): """Close the current session and save its store to disk if `save_on_close` attribute is True. 
@@ -288,12 +264,7 @@ def close(self): if self.save_on_close: self._store.save() - if get_current_session(silent=True) is self: - _deactivate_session() - def __enter__(self): - if get_current_session(silent=True) is not self: - _activate_session(self) return self def __exit__(self, exc_type, exc_value, tb_): @@ -301,7 +272,7 @@ def __exit__(self, exc_type, exc_value, tb_): self._log_exception(exc_type, exc_value, tb_) self.close() - def run( # pylint: disable=too-many-arguments,too-many-locals + def run( # noqa: too-many-arguments,too-many-locals self, pipeline_name: str = None, tags: Iterable[str] = None, @@ -311,8 +282,9 @@ def run( # pylint: disable=too-many-arguments,too-many-locals to_nodes: Iterable[str] = None, from_inputs: Iterable[str] = None, to_outputs: Iterable[str] = None, - load_versions: Dict[str, str] = None, - ) -> Dict[str, Any]: + load_versions: dict[str, str] = None, + namespace: str = None, + ) -> dict[str, Any]: """Runs the pipeline with a specified runner. Args: @@ -335,21 +307,31 @@ def run( # pylint: disable=too-many-arguments,too-many-locals used as an end point of the new ``Pipeline``. load_versions: An optional flag to specify a particular dataset version timestamp to load. + namespace: The namespace of the nodes that is being run. Raises: ValueError: If the named or `__default__` pipeline is not defined by `register_pipelines`. Exception: Any uncaught exception during the run will be re-raised after being passed to ``on_pipeline_error`` hook. + KedroSessionError: If more than one run is attempted to be executed during + a single session. Returns: Any node outputs that cannot be processed by the ``DataCatalog``. These are returned in a dictionary, where the keys are defined by the node outputs. """ - # pylint: disable=protected-access,no-member # Report project name - logging.info("** Kedro project %s", self._project_path.name) + self._logger.info("Kedro project %s", self._project_path.name) + + if self._run_called: + raise KedroSessionError( + "A run has already been completed as part of the" + " active KedroSession. KedroSession has a 1-1 mapping with" + " runs, and thus only one run should be executed per session." + ) - save_version = run_id = self.store["session_id"] + session_id = self.store["session_id"] + save_version = session_id extra_params = self.store.get("extra_params") or {} context = self.load_context() @@ -371,10 +353,11 @@ def run( # pylint: disable=too-many-arguments,too-many-locals node_names=node_names, from_inputs=from_inputs, to_outputs=to_outputs, + node_namespace=namespace, ) record_data = { - "run_id": run_id, + "session_id": session_id, "project_path": self._project_path.as_posix(), "env": context.env, "kedro_version": kedro_version, @@ -387,21 +370,32 @@ def run( # pylint: disable=too-many-arguments,too-many-locals "load_versions": load_versions, "extra_params": extra_params, "pipeline_name": pipeline_name, + "namespace": namespace, + "runner": getattr(runner, "__name__", str(runner)), } - catalog = context._get_catalog( - save_version=save_version, load_versions=load_versions + catalog = context._get_catalog( # noqa: protected-access + save_version=save_version, + load_versions=load_versions, ) # Run the runner + hook_manager = self._hook_manager runner = runner or SequentialRunner() - hook_manager = get_hook_manager() - hook_manager.hook.before_pipeline_run( # pylint: disable=no-member + if not isinstance(runner, AbstractRunner): + raise KedroSessionError( + "KedroSession expect an instance of Runner instead of a class." 
+ "Have you forgotten the `()` at the end of the statement?" + ) + hook_manager.hook.before_pipeline_run( run_params=record_data, pipeline=filtered_pipeline, catalog=catalog ) try: - run_result = runner.run(filtered_pipeline, catalog, run_id) + run_result = runner.run( + filtered_pipeline, catalog, hook_manager, session_id + ) + self._run_called = True except Exception as error: hook_manager.hook.on_pipeline_error( error=error, diff --git a/kedro/framework/session/shelvestore.py b/kedro/framework/session/shelvestore.py new file mode 100644 index 0000000000..3bf34157bc --- /dev/null +++ b/kedro/framework/session/shelvestore.py @@ -0,0 +1,45 @@ +"""This module implements a dict-like store object used to persist Kedro sessions. +This module is separated from store.py to ensure it's only imported when exported explicitly. +""" +from __future__ import annotations + +import dbm +import shelve +from multiprocessing import Lock +from pathlib import Path +from typing import Any + +from .store import BaseSessionStore + + +class ShelveStore(BaseSessionStore): + """Stores the session data on disk using `shelve` package. + This is an example of how to persist data on disk.""" + + _lock = Lock() + + @property + def _location(self) -> Path: + return Path(self._path).expanduser().resolve() / self._session_id / "store" + + def read(self) -> dict[str, Any]: + """Read the data from disk using `shelve` package.""" + data: dict[str, Any] = {} + try: + with shelve.open(str(self._location), flag="r") as _sh: # nosec + data = dict(_sh) + except dbm.error: + pass + return data + + def save(self) -> None: + """Save the data on disk using `shelve` package.""" + location = self._location + location.parent.mkdir(parents=True, exist_ok=True) + + with self._lock, shelve.open(str(location)) as _sh: # nosec + keys_to_del = _sh.keys() - self.data.keys() + for key in keys_to_del: + del _sh[key] + + _sh.update(self.data) diff --git a/kedro/framework/session/store.py b/kedro/framework/session/store.py index 988824624d..6aee727528 100644 --- a/kedro/framework/session/store.py +++ b/kedro/framework/session/store.py @@ -1,38 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
-# -# See the License for the specific language governing permissions and -# limitations under the License. """This module implements a dict-like store object used to persist Kedro sessions.""" -import dbm +from __future__ import annotations + import logging -import shelve from collections import UserDict -from multiprocessing import Lock -from pathlib import Path -from typing import Any, Dict +from typing import Any class BaseSessionStore(UserDict): @@ -50,53 +21,21 @@ def __init__(self, path: str, session_id: str): def _logger(self) -> logging.Logger: return logging.getLogger(__name__) - def read(self) -> Dict[str, Any]: + def read(self) -> dict[str, Any]: """Read the data from the session store. Returns: A mapping containing the session store data. """ - self._logger.info( - "`read()` not implemented for `%s`. Assuming empty store.", + self._logger.debug( + "'read()' not implemented for '%s'. Assuming empty store.", self.__class__.__name__, ) return {} def save(self): """Persist the session store""" - self._logger.info( - "`save()` not implemented for `%s`. Skipping the step.", + self._logger.debug( + "'save()' not implemented for '%s'. Skipping the step.", self.__class__.__name__, ) - - -class ShelveStore(BaseSessionStore): - """Stores the session data on disk using `shelve` package.""" - - _lock = Lock() - - @property - def _location(self) -> Path: - return Path(self._path).expanduser().resolve() / self._session_id / "store" - - def read(self) -> Dict[str, Any]: - """Read the data from disk using `shelve` package.""" - data = {} # type: Dict[str, Any] - try: - with shelve.open(str(self._location), flag="r") as _sh: # nosec - data = dict(_sh) - except dbm.error: - pass - return data - - def save(self) -> None: - """Save the data on disk using `shelve` package.""" - location = self._location - location.parent.mkdir(parents=True, exist_ok=True) - - with self._lock, shelve.open(str(location)) as _sh: # nosec - keys_to_del = _sh.keys() - self.data.keys() - for key in keys_to_del: - del _sh[key] - - _sh.update(self.data) diff --git a/kedro/framework/startup.py b/kedro/framework/startup.py index 428433556a..287999125a 100644 --- a/kedro/framework/startup.py +++ b/kedro/framework/startup.py @@ -1,34 +1,7 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
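BaseSessionStore is deliberately minimal: read() returns an empty mapping and save() is a no-op, both now logged at debug level. A sketch of a custom store that persists the session data as JSON, using only the attributes visible above; the class itself is hypothetical.

# Sketch: a JSON-backed session store built on BaseSessionStore.
from __future__ import annotations

import json
from pathlib import Path
from typing import Any

from kedro.framework.session.store import BaseSessionStore


class JSONStore(BaseSessionStore):
    @property
    def _location(self) -> Path:
        # _path and _session_id follow the base class constructor shown above.
        return Path(self._path).expanduser().resolve() / self._session_id / "store.json"

    def read(self) -> dict[str, Any]:
        try:
            return json.loads(self._location.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return {}

    def save(self) -> None:
        self._location.parent.mkdir(parents=True, exist_ok=True)
        # default=str covers non-JSON values such as Path objects in the store.
        self._location.write_text(json.dumps(self.data, default=str), encoding="utf-8")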
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """This module provides metadata for a Kedro project.""" import os import sys +import warnings from pathlib import Path from typing import NamedTuple, Union @@ -49,13 +22,14 @@ class ProjectMetadata(NamedTuple): project_path: Path project_version: str source_dir: Path + kedro_init_version: str -def _version_mismatch_error(project_version) -> str: +def _version_mismatch_error(kedro_init_version) -> str: return ( - f"Your Kedro project version {project_version} does not match Kedro package " + f"Your Kedro project version {kedro_init_version} does not match Kedro package " f"version {kedro_version} you are running. Make sure to update your project " - f"template. See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md " + f"template. See https://github.com/kedro-org/kedro/blob/main/RELEASE.md " f"for how to migrate your Kedro project." ) @@ -64,17 +38,15 @@ def _is_project(project_path: Union[str, Path]) -> bool: metadata_file = Path(project_path).expanduser().resolve() / _PYPROJECT if not metadata_file.is_file(): return False + try: - metadata_dict = anyconfig.load(metadata_file) - if "tool" in metadata_dict and "kedro" in metadata_dict["tool"]: - return True - except Exception: # pylint: disable=broad-except + return "[tool.kedro]" in metadata_file.read_text(encoding="utf-8") + except Exception: # noqa: broad-except return False - return False def _get_project_metadata(project_path: Union[str, Path]) -> ProjectMetadata: - """Read project metadata from `/pyproject.toml` config file, + """Read project metadata from `/pyproject.toml` config file, under the `[tool.kedro]` section. Args: @@ -97,7 +69,7 @@ def _get_project_metadata(project_path: Union[str, Path]) -> ProjectMetadata: f"Could not find the project configuration file '{_PYPROJECT}' in {project_path}. " f"If you have created your project with Kedro " f"version <0.17.0, make sure to update your project template. " - f"See https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md" + f"See https://github.com/kedro-org/kedro/blob/main/RELEASE.md" f"#migration-guide-from-kedro-016-to-kedro-0170 " f"for how to migrate your Kedro project." ) @@ -116,21 +88,39 @@ def _get_project_metadata(project_path: Union[str, Path]) -> ProjectMetadata: f"configuration parameters." ) from exc - mandatory_keys = ["package_name", "project_name", "project_version"] + mandatory_keys = ["package_name", "project_name"] missing_keys = [key for key in mandatory_keys if key not in metadata_dict] if missing_keys: raise RuntimeError(f"Missing required keys {missing_keys} from '{_PYPROJECT}'.") + # Temporary solution to keep project_version backwards compatible to be removed in 0.19.0 + if "project_version" in metadata_dict: + warnings.warn( + "project_version in pyproject.toml is deprecated, use kedro_init_version instead", + DeprecationWarning, + ) + metadata_dict["kedro_init_version"] = metadata_dict["project_version"] + elif "kedro_init_version" in metadata_dict: + metadata_dict["project_version"] = metadata_dict["kedro_init_version"] + else: + raise RuntimeError( + f"Missing required key kedro_init_version from '{_PYPROJECT}'." 
+ ) + + mandatory_keys.append("kedro_init_version") # check the match for major and minor version (skip patch version) - if metadata_dict["project_version"].split(".")[:2] != kedro_version.split(".")[:2]: - raise ValueError(_version_mismatch_error(metadata_dict["project_version"])) + if ( + metadata_dict["kedro_init_version"].split(".")[:2] + != kedro_version.split(".")[:2] + ): + raise ValueError(_version_mismatch_error(metadata_dict["kedro_init_version"])) source_dir = Path(metadata_dict.get("source_dir", "src")).expanduser() source_dir = (project_path / source_dir).resolve() metadata_dict["source_dir"] = source_dir metadata_dict["config_file"] = pyproject_toml metadata_dict["project_path"] = project_path - metadata_dict.pop("pipeline", {}) # don't include micro-packaging specs + metadata_dict.pop("micropkg", {}) # don't include micro-packaging specs try: return ProjectMetadata(**metadata_dict) @@ -170,9 +160,9 @@ def _add_src_to_path(source_dir: Path, project_path: Path) -> None: if str(source_dir) not in sys.path: sys.path.insert(0, str(source_dir)) - python_path = os.getenv("PYTHONPATH") or "" + python_path = os.getenv("PYTHONPATH", "") if str(source_dir) not in python_path: - sep = ";" if python_path else "" + sep = os.pathsep if python_path else "" os.environ["PYTHONPATH"] = f"{str(source_dir)}{sep}{python_path}" diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index be3c51f381..0755af906c 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -1,47 +1,60 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.io`` provides functionality to read and write to a -number of data sets. At core of the library is ``AbstractDataSet`` -which allows implementation of various ``AbstractDataSet``s. +number of data sets. At the core of the library is the ``AbstractDataSet`` class. 
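The PYTHONPATH change above replaces a hard-coded ";" with os.pathsep; a small illustration of why that matters across platforms (the source directory here is hypothetical):

# PYTHONPATH entries must be joined with the platform separator:
# ":" on POSIX, ";" on Windows.
import os

existing = os.environ.get("PYTHONPATH", "")
src = "/projects/my-kedro-project/src"  # hypothetical source directory
os.environ["PYTHONPATH"] = src + (os.pathsep + existing if existing else "")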
""" +from __future__ import annotations + +from .cached_dataset import CachedDataSet, CachedDataset +from .core import ( + AbstractDataSet, + AbstractVersionedDataSet, + DatasetAlreadyExistsError, + DatasetError, + DatasetNotFoundError, + Version, +) +from .data_catalog import DataCatalog +from .lambda_dataset import LambdaDataSet, LambdaDataset +from .memory_dataset import MemoryDataSet, MemoryDataset +from .partitioned_dataset import ( + IncrementalDataSet, + IncrementalDataset, + PartitionedDataSet, + PartitionedDataset, +) + +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +DataSetError: type[DatasetError] +DataSetNotFoundError: type[DatasetNotFoundError] +DataSetAlreadyExistsError: type[DatasetAlreadyExistsError] + + +def __getattr__(name): + import kedro.io.core # noqa: import-outside-toplevel + + if name in (kedro.io.core._DEPRECATED_ERROR_CLASSES): # noqa: protected-access + return getattr(kedro.io.core, name) + raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") + -from .cached_dataset import CachedDataSet # NOQA -from .core import AbstractDataSet # NOQA -from .core import AbstractVersionedDataSet # NOQA -from .core import DataSetAlreadyExistsError # NOQA -from .core import DataSetError # NOQA -from .core import DataSetNotFoundError # NOQA -from .core import Version # NOQA -from .data_catalog import DataCatalog # NOQA -from .data_catalog_with_default import DataCatalogWithDefault # NOQA -from .lambda_data_set import LambdaDataSet # NOQA -from .memory_data_set import MemoryDataSet # NOQA -from .partitioned_data_set import IncrementalDataSet # NOQA -from .partitioned_data_set import PartitionedDataSet # NOQA -from .transformers import AbstractTransformer # NOQA +__all__ = [ + "AbstractDataSet", + "AbstractVersionedDataSet", + "CachedDataSet", + "CachedDataset", + "DataCatalog", + "DataSetAlreadyExistsError", + "DatasetAlreadyExistsError", + "DataSetError", + "DatasetError", + "DataSetNotFoundError", + "DatasetNotFoundError", + "IncrementalDataSet", + "IncrementalDataset", + "LambdaDataSet", + "LambdaDataset", + "MemoryDataSet", + "MemoryDataset", + "PartitionedDataSet", + "PartitionedDataset", + "Version", +] diff --git a/kedro/io/cached_dataset.py b/kedro/io/cached_dataset.py index eca610773c..d3aee1a39e 100644 --- a/kedro/io/cached_dataset.py +++ b/kedro/io/cached_dataset.py @@ -1,71 +1,55 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """ -This module contains ``CachedDataSet``, a dataset wrapper which caches in memory the data saved, +This module contains ``CachedDataset``, a dataset wrapper which caches in memory the data saved, so that the user avoids io operations with slow storage media """ +from __future__ import annotations + import logging -from typing import Any, Dict, Union +import warnings +from typing import Any from kedro.io.core import VERSIONED_FLAG_KEY, AbstractDataSet, Version -from kedro.io.memory_data_set import MemoryDataSet +from kedro.io.memory_dataset import MemoryDataset + +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +CachedDataSet: type[CachedDataset] -class CachedDataSet(AbstractDataSet): - """``CachedDataSet`` is a dataset wrapper which caches in memory the data saved, +class CachedDataset(AbstractDataSet): + """``CachedDataset`` is a dataset wrapper which caches in memory the data saved, so that the user avoids io operations with slow storage media. - You can also specify a ``CachedDataSet`` in catalog.yml: + You can also specify a ``CachedDataset`` in catalog.yml: :: >>> test_ds: - >>> type: CachedDataSet + >>> type: CachedDataset >>> versioned: true >>> dataset: - >>> type: pandas.CSVDataSet + >>> type: pandas.CSVDataset >>> filepath: example.csv Please note that if your dataset is versioned, this should be indicated in the wrapper class as shown above. """ + # this dataset cannot be used with ``ParallelRunner``, + # therefore it has the attribute ``_SINGLE_PROCESS = True`` + # for parallelism please consider ``ThreadRunner`` instead + _SINGLE_PROCESS = True + def __init__( self, - dataset: Union[AbstractDataSet, Dict], + dataset: AbstractDataSet | dict, version: Version = None, copy_mode: str = None, + metadata: dict[str, Any] = None, ): - """Creates a new instance of ``CachedDataSet`` pointing to the + """Creates a new instance of ``CachedDataset`` pointing to the provided Python object. Args: - dataset: A Kedro DataSet object or a dictionary to cache. + dataset: A Kedro Dataset object or a dictionary to cache. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -73,6 +57,8 @@ def __init__( copy_mode: The copy mode used to copy the data. Possible values are: "deepcopy", "copy" and "assign". If not provided, it is inferred based on the data type. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: ValueError: If the provided dataset is not a valid dict/YAML @@ -84,10 +70,11 @@ def __init__( self._dataset = dataset else: raise ValueError( - "The argument type of `dataset` should be either a dict/YAML " + "The argument type of 'dataset' should be either a dict/YAML " "representation of the dataset, or the actual dataset object." 
) - self._cache = MemoryDataSet(copy_mode=copy_mode) + self._cache = MemoryDataset(copy_mode=copy_mode) + self.metadata = metadata def _release(self) -> None: self._cache.release() @@ -98,7 +85,7 @@ def _from_config(config, version): if VERSIONED_FLAG_KEY in config: raise ValueError( "Cached datasets should specify that they are versioned in the " - "`CachedDataSet`, not in the wrapped dataset." + "'CachedDataset', not in the wrapped dataset." ) if version: config[VERSIONED_FLAG_KEY] = True @@ -107,10 +94,10 @@ def _from_config(config, version): ) return AbstractDataSet.from_config("_cached", config) - def _describe(self) -> Dict[str, Any]: + def _describe(self) -> dict[str, Any]: return { - "dataset": self._dataset._describe(), # pylint: disable=protected-access - "cache": self._cache._describe(), # pylint: disable=protected-access + "dataset": self._dataset._describe(), # noqa: protected-access + "cache": self._cache._describe(), # noqa: protected-access } def _load(self): @@ -134,3 +121,16 @@ def __getstate__(self): logging.getLogger(__name__).warning("%s: clearing cache to pickle.", str(self)) self._cache.release() return self.__dict__ + + +def __getattr__(name): + if name == "CachedDataSet": + alias = CachedDataset + warnings.warn( + f"{repr(name)} has been renamed to {repr(alias.__name__)}, " + f"and the alias will be removed in Kedro 0.19.0", + DeprecationWarning, + stacklevel=2, + ) + return alias + raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git a/kedro/io/core.py b/kedro/io/core.py index 85f09ced33..f608f10840 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1,34 +1,7 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """This module provides a set of classes which underpin the data loading and saving functionality provided by ``kedro.io``. 
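Beyond the catalog.yml form shown in the docstring, a CachedDataset can also be built directly in Python. This sketch wraps a pandas CSVDataSet purely as an example; it assumes the pandas datasets extra is installed and an example file path.

# Sketch: wrap an existing dataset so repeated loads come from memory.
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io import CachedDataset

cached = CachedDataset(dataset=CSVDataSet(filepath="data/01_raw/example.csv"))
df = cached.load()   # first load hits the file
df = cached.load()   # subsequent loads come from the in-memory cache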
""" +from __future__ import annotations import abc import copy @@ -41,7 +14,7 @@ from glob import iglob from operator import attrgetter from pathlib import Path, PurePath, PurePosixPath -from typing import Any, Callable, Dict, List, Optional, Tuple, Type +from typing import Any, Callable, Generic, TypeVar from urllib.parse import urlsplit from cachetools import Cache, cachedmethod @@ -49,18 +22,21 @@ from kedro.utils import load_obj -warnings.simplefilter("default", DeprecationWarning) - VERSION_FORMAT = "%Y-%m-%dT%H.%M.%S.%fZ" VERSIONED_FLAG_KEY = "versioned" VERSION_KEY = "version" HTTP_PROTOCOLS = ("http", "https") PROTOCOL_DELIMITER = "://" -CLOUD_PROTOCOLS = ("s3", "gcs", "gs", "adl", "abfs") +CLOUD_PROTOCOLS = ("s3", "s3n", "s3a", "gcs", "gs", "adl", "abfs", "abfss", "gdrive") + +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +DataSetError: type[DatasetError] +DataSetNotFoundError: type[DatasetNotFoundError] +DataSetAlreadyExistsError: type[DatasetAlreadyExistsError] -class DataSetError(Exception): - """``DataSetError`` raised by ``AbstractDataSet`` implementations +class DatasetError(Exception): + """``DatasetError`` raised by ``AbstractDataSet`` implementations in case of failure of input/output methods. ``AbstractDataSet`` implementations should provide instructive @@ -70,23 +46,43 @@ class DataSetError(Exception): pass -class DataSetNotFoundError(DataSetError): - """``DataSetNotFoundError`` raised by ``DataCatalog`` class in case of +class DatasetNotFoundError(DatasetError): + """``DatasetNotFoundError`` raised by ``DataCatalog`` class in case of trying to use a non-existing data set. """ pass -class DataSetAlreadyExistsError(DataSetError): - """``DataSetAlreadyExistsError`` raised by ``DataCatalog`` class in case +class DatasetAlreadyExistsError(DatasetError): + """``DatasetAlreadyExistsError`` raised by ``DataCatalog`` class in case of trying to add a data set which already exists in the ``DataCatalog``. """ pass -class VersionNotFoundError(DataSetError): +_DEPRECATED_ERROR_CLASSES = { + "DataSetError": DatasetError, + "DataSetNotFoundError": DatasetNotFoundError, + "DataSetAlreadyExistsError": DatasetAlreadyExistsError, +} + + +def __getattr__(name): + if name in _DEPRECATED_ERROR_CLASSES: + alias = _DEPRECATED_ERROR_CLASSES[name] + warnings.warn( + f"{repr(name)} has been renamed to {repr(alias.__name__)}, " + f"and the alias will be removed in Kedro 0.19.0", + DeprecationWarning, + stacklevel=2, + ) + return alias + raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") + + +class VersionNotFoundError(DatasetError): """``VersionNotFoundError`` raised by ``AbstractVersionedDataSet`` implementations in case of no load versions available for the data set. """ @@ -94,7 +90,11 @@ class VersionNotFoundError(DataSetError): pass -class AbstractDataSet(abc.ABC): +_DI = TypeVar("_DI") +_DO = TypeVar("_DO") + + +class AbstractDataSet(abc.ABC, Generic[_DI, _DO]): """``AbstractDataSet`` is the base class for all data set implementations. All data set implementations should extend this abstract class and implement the methods marked as abstract. 
@@ -109,7 +109,7 @@ class AbstractDataSet(abc.ABC): >>> from kedro.io import AbstractDataSet >>> >>> - >>> class MyOwnDataSet(AbstractDataSet): + >>> class MyOwnDataset(AbstractDataSet[pd.DataFrame, pd.DataFrame]): >>> def __init__(self, filepath, param1, param2=True): >>> self._filepath = PurePosixPath(filepath) >>> self._param1 = param1 @@ -131,7 +131,7 @@ class AbstractDataSet(abc.ABC): :: my_dataset: - type: .MyOwnDataSet + type: .MyOwnDataset filepath: data/01_raw/my_data.csv param1: # param1 is a required argument # param2 will be True by default @@ -139,12 +139,12 @@ class AbstractDataSet(abc.ABC): @classmethod def from_config( - cls: Type, + cls: type, name: str, - config: Dict[str, Any], + config: dict[str, Any], load_version: str = None, save_version: str = None, - ) -> "AbstractDataSet": + ) -> AbstractDataSet: """Create a data set instance using the configuration provided. Args: @@ -161,7 +161,7 @@ def from_config( An instance of an ``AbstractDataSet`` subclass. Raises: - DataSetError: When the function fails to create the data set + DatasetError: When the function fails to create the data set from its config. """ @@ -170,22 +170,22 @@ def from_config( config, load_version, save_version ) except Exception as exc: - raise DataSetError( - "An exception occurred when parsing config " - "for DataSet `{}`:\n{}".format(name, str(exc)) + raise DatasetError( + f"An exception occurred when parsing config " + f"for dataset '{name}':\n{str(exc)}" ) from exc try: data_set = class_obj(**config) # type: ignore except TypeError as err: - raise DataSetError( - f"\n{err}.\nDataSet '{name}' must only contain arguments valid for the " - f"constructor of `{class_obj.__module__}.{class_obj.__qualname__}`." + raise DatasetError( + f"\n{err}.\nDataset '{name}' must only contain arguments valid for the " + f"constructor of '{class_obj.__module__}.{class_obj.__qualname__}'." ) from err except Exception as err: - raise DataSetError( - f"\n{err}.\nFailed to instantiate DataSet '{name}' " - f"of type `{class_obj.__module__}.{class_obj.__qualname__}`." + raise DatasetError( + f"\n{err}.\nFailed to instantiate dataset '{name}' " + f"of type '{class_obj.__module__}.{class_obj.__qualname__}'." ) from err return data_set @@ -193,14 +193,14 @@ def from_config( def _logger(self) -> logging.Logger: return logging.getLogger(__name__) - def load(self) -> Any: + def load(self) -> _DO: """Loads data by delegation to the provided load method. Returns: Data returned by the provided load method. Raises: - DataSetError: When underlying load method raises error. + DatasetError: When underlying load method raises error. """ @@ -208,47 +208,47 @@ def load(self) -> Any: try: return self._load() - except DataSetError: + except DatasetError: raise except Exception as exc: # This exception handling is by design as the composed data sets # can throw any type of exception. - message = "Failed while loading data from data set {}.\n{}".format( - str(self), str(exc) + message = ( + f"Failed while loading data from data set {str(self)}.\n{str(exc)}" ) - raise DataSetError(message) from exc + raise DatasetError(message) from exc - def save(self, data: Any) -> None: + def save(self, data: _DI) -> None: """Saves data by delegation to the provided save method. Args: data: the value to be saved by provided save method. Raises: - DataSetError: when underlying save method raises error. + DatasetError: when underlying save method raises error. FileNotFoundError: when save method got file instead of dir, on Windows. 
NotADirectoryError: when save method got file instead of dir, on Unix. """ if data is None: - raise DataSetError("Saving `None` to a `DataSet` is not allowed") + raise DatasetError("Saving 'None' to a 'Dataset' is not allowed") try: self._logger.debug("Saving %s", str(self)) self._save(data) - except DataSetError: + except DatasetError: raise except (FileNotFoundError, NotADirectoryError): raise except Exception as exc: message = f"Failed while saving data to data set {str(self)}.\n{str(exc)}" - raise DataSetError(message) from exc + raise DatasetError(message) from exc def __str__(self): def _to_str(obj, is_root=False): """Returns a string representation where - 1. The root level (i.e. the DataSet.__init__ arguments) are - formatted like DataSet(key=value). + 1. The root level (i.e. the Dataset.__init__ arguments) are + formatted like Dataset(key=value). 2. Dictionaries have the keys alphabetically sorted recursively. 3. None values are not shown. """ @@ -272,24 +272,24 @@ def _to_str(obj, is_root=False): return f"{type(self).__name__}({_to_str(self._describe(), True)})" @abc.abstractmethod - def _load(self) -> Any: + def _load(self) -> _DO: raise NotImplementedError( - "`{}` is a subclass of AbstractDataSet and" - "it must implement the `_load` method".format(self.__class__.__name__) + f"'{self.__class__.__name__}' is a subclass of AbstractDataSet and " + f"it must implement the '_load' method" ) @abc.abstractmethod - def _save(self, data: Any) -> None: + def _save(self, data: _DI) -> None: raise NotImplementedError( - "`{}` is a subclass of AbstractDataSet and" - "it must implement the `_save` method".format(self.__class__.__name__) + f"'{self.__class__.__name__}' is a subclass of AbstractDataSet and " + f"it must implement the '_save' method" ) @abc.abstractmethod - def _describe(self) -> Dict[str, Any]: + def _describe(self) -> dict[str, Any]: raise NotImplementedError( - "`{}` is a subclass of AbstractDataSet and" - "it must implement the `_describe` method".format(self.__class__.__name__) + f"'{self.__class__.__name__}' is a subclass of AbstractDataSet and " + f"it must implement the '_describe' method" ) def exists(self) -> bool: @@ -300,21 +300,21 @@ def exists(self) -> bool: Flag indicating whether the output already exists. Raises: - DataSetError: when underlying exists method raises error. + DatasetError: when underlying exists method raises error. """ try: self._logger.debug("Checking whether target of %s exists", str(self)) return self._exists() except Exception as exc: - message = "Failed during exists check for data set {}.\n{}".format( - str(self), str(exc) + message = ( + f"Failed during exists check for data set {str(self)}.\n{str(exc)}" ) - raise DataSetError(message) from exc + raise DatasetError(message) from exc def _exists(self) -> bool: self._logger.warning( - "`exists()` not implemented for `%s`. Assuming output does not exist.", + "'exists()' not implemented for '%s'. Assuming output does not exist.", self.__class__.__name__, ) return False @@ -323,7 +323,7 @@ def release(self) -> None: """Release any cached data. Raises: - DataSetError: when underlying release method raises error. + DatasetError: when underlying release method raises error. 
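For reference, the from_config/load/save machinery above is what the DataCatalog goes through when it builds datasets from catalog entries. A minimal sketch, assuming the pandas datasets extra and an example file path; any failure surfaces as a DatasetError, as described above.

# Sketch: build a dataset instance from a catalog-style config dict.
from kedro.io import AbstractDataSet

config = {"type": "pandas.CSVDataSet", "filepath": "data/01_raw/example.csv"}
dataset = AbstractDataSet.from_config("example_csv", config)
data = dataset.load()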
""" try: @@ -331,12 +331,12 @@ def release(self) -> None: self._release() except Exception as exc: message = f"Failed during release for data set {str(self)}.\n{str(exc)}" - raise DataSetError(message) from exc + raise DatasetError(message) from exc def _release(self) -> None: pass - def _copy(self, **overwrite_params) -> "AbstractDataSet": + def _copy(self, **overwrite_params) -> AbstractDataSet: dataset_copy = copy.deepcopy(self) for name, value in overwrite_params.items(): setattr(dataset_copy, name, value) @@ -365,18 +365,21 @@ class Version(namedtuple("Version", ["load", "save"])): _CONSISTENCY_WARNING = ( - "Save version `{}` did not match load version `{}` for {}. This is strongly " - "discouraged due to inconsistencies it may cause between `save` and " - "`load` operations. Please refrain from setting exact load version for " + "Save version '{}' did not match load version '{}' for {}. This is strongly " + "discouraged due to inconsistencies it may cause between 'save' and " + "'load' operations. Please refrain from setting exact load version for " "intermediate data sets where possible to avoid this warning." ) -_DEFAULT_PACKAGES = ["kedro.io.", "kedro.extras.datasets.", ""] +# `kedro_datasets` is probed before `kedro.extras.datasets`, +# hence the DeprecationWarning will not be shown +# if the dataset is available in the former +_DEFAULT_PACKAGES = ["kedro.io.", "kedro_datasets.", "kedro.extras.datasets.", ""] def parse_dataset_definition( - config: Dict[str, Any], load_version: str = None, save_version: str = None -) -> Tuple[Type[AbstractDataSet], Dict[str, Any]]: + config: dict[str, Any], load_version: str = None, save_version: str = None +) -> tuple[type[AbstractDataSet], dict[str, Any]]: """Parse and instantiate a dataset class using the configuration provided. Args: @@ -390,7 +393,7 @@ def parse_dataset_definition( if versioning was not enabled. Raises: - DataSetError: If the function fails to parse the configuration provided. + DatasetError: If the function fails to parse the configuration provided. Returns: 2-tuple: (Dataset class object, configuration dictionary) @@ -399,49 +402,53 @@ def parse_dataset_definition( config = copy.deepcopy(config) if "type" not in config: - raise DataSetError("`type` is missing from DataSet catalog configuration") + raise DatasetError("'type' is missing from dataset catalog configuration") class_obj = config.pop("type") if isinstance(class_obj, str): if len(class_obj.strip(".")) != len(class_obj): - raise DataSetError( - "`type` class path does not support relative " + raise DatasetError( + "'type' class path does not support relative " "paths or paths ending with a dot." ) - class_paths = (prefix + class_obj for prefix in _DEFAULT_PACKAGES) trials = (_load_obj(class_path) for class_path in class_paths) try: class_obj = next(obj for obj in trials if obj is not None) except StopIteration as exc: - raise DataSetError( - f"Class `{class_obj}` not found or one of its dependencies" + raise DatasetError( + f"Class '{class_obj}' not found or one of its dependencies " f"has not been installed." ) from exc if not issubclass(class_obj, AbstractDataSet): - raise DataSetError( - f"DataSet type `{class_obj.__module__}.{class_obj.__qualname__}` " - f"is invalid: all data set types must extend `AbstractDataSet`." + raise DatasetError( + f"Dataset type '{class_obj.__module__}.{class_obj.__qualname__}' " + f"is invalid: all data set types must extend 'AbstractDataSet'." 
) if VERSION_KEY in config: # remove "version" key so that it's not passed # to the "unversioned" data set constructor message = ( - "`%s` attribute removed from data set configuration since it is a " + "'%s' attribute removed from data set configuration since it is a " "reserved word and cannot be directly specified" ) logging.getLogger(__name__).warning(message, VERSION_KEY) del config[VERSION_KEY] - if config.pop(VERSIONED_FLAG_KEY, False): # data set is versioned + + # dataset is either versioned explicitly by the user or versioned is set to true by default + # on the dataset + if config.pop(VERSIONED_FLAG_KEY, False) or getattr( + class_obj, VERSIONED_FLAG_KEY, False + ): config[VERSION_KEY] = Version(load_version, save_version) return class_obj, config -def _load_obj(class_path: str) -> Optional[object]: +def _load_obj(class_path: str) -> object | None: mod_path, _, class_name = class_path.rpartition(".") try: available_classes = load_obj(f"{mod_path}.__all__") @@ -458,11 +465,11 @@ def _load_obj(class_path: str) -> Optional[object]: return None except AttributeError as exc: if available_classes and class_name in available_classes: - raise DataSetError( + raise DatasetError( f"{exc} Please see the documentation on how to " f"install relevant dependencies for {class_path}:\n" f"https://kedro.readthedocs.io/en/stable/" - f"04_kedro_project_setup/01_dependencies.html" + f"kedro_project_setup/dependencies.html" ) from exc return None @@ -474,7 +481,7 @@ def _local_exists(filepath: str) -> bool: # SKIP_IF_NO_SPARK return filepath.exists() or any(par.is_file() for par in filepath.parents) -class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): +class AbstractVersionedDataSet(AbstractDataSet[_DI, _DO], abc.ABC): """ ``AbstractVersionedDataSet`` is the base class for all versioned data set implementations. All data sets that implement versioning should extend this @@ -488,7 +495,7 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): >>> from kedro.io import AbstractVersionedDataSet >>> >>> - >>> class MyOwnDataSet(AbstractVersionedDataSet): + >>> class MyOwnDataset(AbstractVersionedDataSet): >>> def __init__(self, filepath, version, param1, param2=True): >>> super().__init__(PurePosixPath(filepath), version) >>> self._param1 = param1 @@ -513,7 +520,7 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): :: my_dataset: - type: .MyOwnDataSet + type: .MyOwnDataset filepath: data/01_raw/my_data.csv versioned: true param1: # param1 is a required argument @@ -523,9 +530,9 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): def __init__( self, filepath: PurePosixPath, - version: Optional[Version], + version: Version | None, exists_function: Callable[[str], bool] = None, - glob_function: Callable[[str], List[str]] = None, + glob_function: Callable[[str], list[str]] = None, ): """Creates a new instance of ``AbstractVersionedDataSet``. @@ -558,20 +565,26 @@ def _fetch_latest_load_version(self) -> str: most_recent = next( (path for path in version_paths if self._exists_function(path)), None ) - + protocol = getattr(self, "_protocol", None) if not most_recent: - raise VersionNotFoundError(f"Did not find any versions for {self}") - + if protocol in CLOUD_PROTOCOLS: + message = ( + f"Did not find any versions for {self}. This could be " + f"due to insufficient permission." 
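The new `getattr(class_obj, VERSIONED_FLAG_KEY, False)` branch means a dataset ends up versioned either because the YAML sets `versioned: true` or because the class itself carries a truthy `versioned` attribute; in both cases the parsed config gains a `Version(load_version, save_version)` entry. A stripped-down sketch of that branch (`AlwaysVersionedDataset` is a hypothetical stand-in, not a real Kedro class):

from kedro.io.core import Version


class AlwaysVersionedDataset:        # hypothetical class with a class-level default
    versioned = True


config = {"filepath": "data/01_raw/companies.csv"}
if config.pop("versioned", False) or getattr(AlwaysVersionedDataset, "versioned", False):
    config["version"] = Version(load=None, save=None)

print(config)   # {'filepath': 'data/01_raw/companies.csv', 'version': Version(load=None, save=None)}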
+ ) + else: + message = f"Did not find any versions for {self}" + raise VersionNotFoundError(message) return PurePath(most_recent).parent.name # 'key' is set to prevent cache key overlapping for load and save: # https://cachetools.readthedocs.io/en/stable/#cachetools.cachedmethod @cachedmethod(cache=attrgetter("_version_cache"), key=partial(hashkey, "save")) - def _fetch_latest_save_version(self) -> str: # pylint: disable=no-self-use + def _fetch_latest_save_version(self) -> str: # noqa: no-self-use """Generate and cache the current save version""" return generate_timestamp() - def resolve_load_version(self) -> Optional[str]: + def resolve_load_version(self) -> str | None: """Compute the version the dataset should be loaded with.""" if not self._version: return None @@ -587,7 +600,7 @@ def _get_load_path(self) -> PurePosixPath: load_version = self.resolve_load_version() return self._get_versioned_path(load_version) # type: ignore - def resolve_save_version(self) -> Optional[str]: + def resolve_save_version(self) -> str | None: """Compute the version the dataset should be saved with.""" if not self._version: return None @@ -604,9 +617,9 @@ def _get_save_path(self) -> PurePosixPath: versioned_path = self._get_versioned_path(save_version) # type: ignore if self._exists_function(str(versioned_path)): - raise DataSetError( - "Save path `{}` for {} must not exist if versioning " - "is enabled.".format(versioned_path, str(self)) + raise DatasetError( + f"Save path '{versioned_path}' for {str(self)} must not exist if " + f"versioning is enabled." ) return versioned_path @@ -614,11 +627,10 @@ def _get_save_path(self) -> PurePosixPath: def _get_versioned_path(self, version: str) -> PurePosixPath: return self._filepath / version / self._filepath.name - def load(self) -> Any: - self.resolve_load_version() # Make sure last load version is set + def load(self) -> _DO: # noqa: useless-parent-delegation return super().load() - def save(self, data: Any) -> None: + def save(self, data: _DI) -> None: self._version_cache.clear() save_version = self.resolve_save_version() # Make sure last save version is set try: @@ -626,16 +638,16 @@ def save(self, data: Any) -> None: except (FileNotFoundError, NotADirectoryError) as err: # FileNotFoundError raised in Win, NotADirectoryError raised in Unix _default_version = "YYYY-MM-DDThh.mm.ss.sssZ" - raise DataSetError( - f"Cannot save versioned dataset `{self._filepath.name}` to " - f"`{self._filepath.parent.as_posix()}` because a file with the same " + raise DatasetError( + f"Cannot save versioned dataset '{self._filepath.name}' to " + f"'{self._filepath.parent.as_posix()}' because a file with the same " f"name already exists in the directory. This is likely because " f"versioning was enabled on a dataset already saved previously. Either " - f"remove `{self._filepath.name}` from the directory or manually " + f"remove '{self._filepath.name}' from the directory or manually " f"convert it into a versioned dataset by placing it in a versioned " f"directory (e.g. with default versioning format " - f"`{self._filepath.as_posix()}/{_default_version}/{self._filepath.name}" - f"`)." + f"'{self._filepath.as_posix()}/{_default_version}/{self._filepath.name}" + f"')." ) from err load_version = self.resolve_load_version() @@ -652,7 +664,7 @@ def exists(self) -> bool: Flag indicating whether the output already exists. Raises: - DataSetError: when underlying exists method raises error. + DatasetError: when underlying exists method raises error. 
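All of the version-resolution helpers above hinge on the directory layout `<filepath>/<version>/<filename>`: a save version is simply a `generate_timestamp()` string, and `_get_versioned_path` nests it between the configured path and the file name. Recreating that layout with the same primitives (the path is illustrative):

from pathlib import PurePosixPath

from kedro.io.core import generate_timestamp

filepath = PurePosixPath("data/01_raw/companies.csv")
save_version = generate_timestamp()      # e.g. '2023-05-04T12.30.15.123Z'
versioned_path = filepath / save_version / filepath.name
print(versioned_path)                    # data/01_raw/companies.csv/<timestamp>/companies.csv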
""" self._logger.debug("Checking whether target of %s exists", str(self)) @@ -661,17 +673,17 @@ def exists(self) -> bool: except VersionNotFoundError: return False except Exception as exc: # SKIP_IF_NO_SPARK - message = "Failed during exists check for data set {}.\n{}".format( - str(self), str(exc) + message = ( + f"Failed during exists check for data set {str(self)}.\n{str(exc)}" ) - raise DataSetError(message) from exc + raise DatasetError(message) from exc def _release(self) -> None: super()._release() self._version_cache.clear() -def _parse_filepath(filepath: str) -> Dict[str, str]: +def _parse_filepath(filepath: str) -> dict[str, str]: """Split filepath on protocol and path. Based on `fsspec.utils.infer_storage_options`. Args: @@ -696,42 +708,47 @@ def _parse_filepath(filepath: str) -> Dict[str, str]: if protocol == "file": windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path) if windows_path: - path = "{}:{}".format(*windows_path.groups()) + path = ":".join(windows_path.groups()) options = {"protocol": protocol, "path": path} - if parsed_path.netloc: - if protocol in CLOUD_PROTOCOLS: - host_with_port = parsed_path.netloc.rsplit("@", 1)[-1] - host = host_with_port.rsplit(":", 1)[0] - options["path"] = host + options["path"] + if parsed_path.netloc and protocol in CLOUD_PROTOCOLS: + host_with_port = parsed_path.netloc.rsplit("@", 1)[-1] + host = host_with_port.rsplit(":", 1)[0] + options["path"] = host + options["path"] + # Azure Data Lake Storage Gen2 URIs can store the container name in the + # 'username' field of a URL (@ syntax), so we need to add it to the path + if protocol == "abfss" and parsed_path.username: + options["path"] = parsed_path.username + "@" + options["path"] return options -def get_protocol_and_path(filepath: str, version: Version = None) -> Tuple[str, str]: +def get_protocol_and_path(filepath: str, version: Version = None) -> tuple[str, str]: """Parses filepath on protocol and path. + .. warning:: + Versioning is not supported for HTTP protocols. + Args: - filepath: raw filepath e.g.: `gcs://bucket/test.json`. + filepath: raw filepath e.g.: ``gcs://bucket/test.json``. version: instance of ``kedro.io.core.Version`` or None. Returns: Protocol and path. Raises: - DataSetError: when protocol is http(s) and version is not None. - Note: HTTP(s) dataset doesn't support versioning. + DatasetError: when protocol is http(s) and version is not None. """ options_dict = _parse_filepath(filepath) path = options_dict["path"] protocol = options_dict["protocol"] if protocol in HTTP_PROTOCOLS: - if version: - raise DataSetError( - "HTTP(s) DataSet doesn't support versioning. " - "Please remove version flag from the dataset configuration." + if version is not None: + raise DatasetError( + "Versioning is not supported for HTTP protocols. " + "Please remove the `versioned` flag from the dataset configuration." ) path = path.split(PROTOCOL_DELIMITER, 1)[-1] @@ -758,6 +775,6 @@ def validate_on_forbidden_chars(**kwargs): """Validate that string values do not include white-spaces or ;""" for key, value in kwargs.items(): if " " in value or ";" in value: - raise DataSetError( - f"Neither white-space nor semicolon are allowed in `{key}`." + raise DatasetError( + f"Neither white-space nor semicolon are allowed in '{key}'." 
) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 8f40970885..785a979a7c 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -1,66 +1,41 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """``DataCatalog`` stores instances of ``AbstractDataSet`` implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to instantiate it with a dictionary of data sets. Then it will act as a single point of reference for your calls, relaying load and save functions to the underlying data sets. """ +from __future__ import annotations + import copy import difflib import logging import re -import warnings from collections import defaultdict -from functools import partial -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Type, Union -from warnings import warn +from typing import Any, Dict, Iterable + +from parse import parse from kedro.io.core import ( AbstractDataSet, AbstractVersionedDataSet, - DataSetAlreadyExistsError, - DataSetError, - DataSetNotFoundError, + DatasetAlreadyExistsError, + DatasetError, + DatasetNotFoundError, Version, generate_timestamp, ) -from kedro.io.memory_data_set import MemoryDataSet -from kedro.io.transformers import AbstractTransformer -from kedro.versioning import Journal +from kedro.io.memory_dataset import MemoryDataset + +Patterns = Dict[str, Dict[str, Any]] CATALOG_KEY = "catalog" CREDENTIALS_KEY = "credentials" +WORDS_REGEX_PATTERN = re.compile(r"\W+") def _get_credentials( - credentials_name: str, credentials: Dict[str, Any] -) -> Dict[str, Any]: + credentials_name: str, credentials: dict[str, Any] +) -> dict[str, Any]: """Return a set of credentials from the provided credentials dict. Args: @@ -87,8 +62,8 @@ def _get_credentials( def _resolve_credentials( - config: Dict[str, Any], credentials: Dict[str, Any] -) -> Dict[str, Any]: + config: dict[str, Any], credentials: dict[str, Any] +) -> dict[str, Any]: """Return the dataset configuration where credentials are resolved using credentials dictionary provided. 
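`_resolve_credentials` replaces the string value of a dataset's `credentials` key with the matching entry from the credentials dictionary, so secrets never have to appear in the catalog itself. A plain-dictionary sketch of the effect, not a call to the private helper (all names and values are made up):

ds_config = {
    "type": "pandas.CSVDataSet",
    "filepath": "s3://aws-bucket-name/boats.csv",
    "credentials": "boats_credentials",
}
credentials = {
    "boats_credentials": {"key": "...", "secret": "..."},
}

# After resolution the dataset config carries the credentials mapping itself:
resolved = {**ds_config, "credentials": credentials[ds_config["credentials"]]}
print(resolved["credentials"])   # {'key': '...', 'secret': '...'}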
@@ -120,22 +95,36 @@ def _sub_nonword_chars(data_set_name: str) -> str: Returns: The name used in `DataCatalog.datasets`. """ - return re.sub(r"\W+", "__", data_set_name) + return re.sub(WORDS_REGEX_PATTERN, "__", data_set_name) class _FrozenDatasets: - """Helper class to access underlying loaded datasets""" + """Helper class to access underlying loaded datasets.""" - def __init__(self, datasets): - # Non-word characters in dataset names are replaced with `__` - # for easy access to transcoded/prefixed datasets. - datasets = {_sub_nonword_chars(key): value for key, value in datasets.items()} - self.__dict__.update(**datasets) + def __init__( + self, + *datasets_collections: _FrozenDatasets | dict[str, AbstractDataSet], + ): + """Return a _FrozenDatasets instance from some datasets collections. + Each collection could either be another _FrozenDatasets or a dictionary. + """ + for collection in datasets_collections: + if isinstance(collection, _FrozenDatasets): + self.__dict__.update(collection.__dict__) + else: + # Non-word characters in dataset names are replaced with `__` + # for easy access to transcoded/prefixed datasets. + self.__dict__.update( + { + _sub_nonword_chars(dataset_name): dataset + for dataset_name, dataset in collection.items() + } + ) # Don't allow users to add/change attributes on the fly def __setattr__(self, key, value): msg = "Operation not allowed! " - if key in self.__dict__.keys(): + if key in self.__dict__: msg += "Please change datasets through configuration." else: msg += "Please use DataCatalog.add() instead." @@ -151,15 +140,14 @@ class DataCatalog: to the underlying data sets. """ - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, - data_sets: Dict[str, AbstractDataSet] = None, - feed_dict: Dict[str, Any] = None, - transformers: Dict[str, List[AbstractTransformer]] = None, - default_transformers: List[AbstractTransformer] = None, - journal: Journal = None, - layers: Dict[str, Set[str]] = None, + data_sets: dict[str, AbstractDataSet] = None, + feed_dict: dict[str, Any] = None, + layers: dict[str, set[str]] = None, + dataset_patterns: Patterns = None, + load_versions: dict[str, str] = None, + save_version: str = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataSet`` implementations to provide ``load`` and ``save`` capabilities from @@ -171,18 +159,19 @@ def __init__( Args: data_sets: A dictionary of data set names and data set instances. feed_dict: A feed dict with data to be added in memory. - transformers: A dictionary of lists of transformers to be applied - to the data sets. - default_transformers: A list of transformers to be applied to any - new data sets. - journal: Instance of Journal. layers: A dictionary of data set layers. It maps a layer name to a set of data set names, according to the data engineering convention. For more details, see - https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#what-is-data-engineering-convention - Raises: - DataSetNotFoundError: When transformers are passed for a non - existent data set. + https://docs.kedro.org/en/stable/resources/glossary.html#layers-data-engineering-convention + dataset_patterns: A dictionary of data set factory patterns + and corresponding data set configuration + load_versions: A mapping between data set names and versions + to load. Has no effect on data sets without enabled versioning. + save_version: Version string to be used for ``save`` operations + by all data sets with enabled versioning. 
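`_FrozenDatasets` is what backs `catalog.datasets.<name>` attribute access: non-word characters in dataset names (for example the `@` of transcoded entries) are replaced with `__`, and the resulting attributes cannot be reassigned afterwards. The substitution itself is just the compiled regex shown above:

import re

WORDS_REGEX_PATTERN = re.compile(r"\W+")


def _sub_nonword_chars(data_set_name: str) -> str:
    return re.sub(WORDS_REGEX_PATTERN, "__", data_set_name)


print(_sub_nonword_chars("companies@spark"))    # companies__spark
print(_sub_nonword_chars("france.companies"))   # france__companies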
It must: a) be a + case-insensitive string that conforms with operating system + filename limitations, b) always return the latest version when + sorted in lexicographical order. Example: :: @@ -197,12 +186,12 @@ def __init__( self._data_sets = dict(data_sets or {}) self.datasets = _FrozenDatasets(self._data_sets) self.layers = layers + # Keep a record of all patterns in the catalog. + # {dataset pattern name : dataset pattern body} + self._dataset_patterns = dataset_patterns or {} + self._load_versions = load_versions or {} + self._save_version = save_version - self._transformers = {k: list(v) for k, v in (transformers or {}).items()} - self._default_transformers = list(default_transformers or []) - self._check_and_normalize_transformers() - self._journal = journal - # import the feed dict if feed_dict: self.add_feed_dict(feed_dict) @@ -210,32 +199,14 @@ def __init__( def _logger(self): return logging.getLogger(__name__) - def _check_and_normalize_transformers(self): - data_sets = self._data_sets.keys() - transformers = self._transformers.keys() - excess_transformers = transformers - data_sets - missing_transformers = data_sets - transformers - - if excess_transformers: - raise DataSetNotFoundError( - "Unexpected transformers for missing data_sets {}".format( - ", ".join(excess_transformers) - ) - ) - - for data_set_name in missing_transformers: - self._transformers[data_set_name] = list(self._default_transformers) - - # pylint: disable=too-many-arguments @classmethod def from_config( - cls: Type, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]] = None, - load_versions: Dict[str, str] = None, + cls, + catalog: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] = None, + load_versions: dict[str, str] = None, save_version: str = None, - journal: Journal = None, - ) -> "DataCatalog": + ) -> DataCatalog: """Create a ``DataCatalog`` instance from configuration. This is a factory method used to provide developers with a way to instantiate ``DataCatalog`` with configuration parsed from configuration files. @@ -259,15 +230,16 @@ class to be loaded is specified with the key ``type`` and their case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when sorted in lexicographical order. - journal: Instance of Journal. Returns: An instantiated ``DataCatalog`` containing all specified data sets, created and ready to use. Raises: - DataSetError: When the method fails to create any of the data + DatasetError: When the method fails to create any of the data sets from their config. + DatasetNotFoundError: When `load_versions` refers to a dataset that doesn't + exist in the catalog. 
Example: :: @@ -283,7 +255,7 @@ class to be loaded is specified with the key ``type`` and their >>> "boats": { >>> "type": "pandas.CSVDataSet", >>> "filepath": "s3://aws-bucket-name/boats.csv", - >>> "credentials": "boats_credentials" + >>> "credentials": "boats_credentials", >>> "save_args": { >>> "index": False >>> } @@ -305,63 +277,169 @@ class to be loaded is specified with the key ``type`` and their >>> catalog.save("boats", df) """ data_sets = {} + dataset_patterns = {} catalog = copy.deepcopy(catalog) or {} credentials = copy.deepcopy(credentials) or {} - run_id = journal.run_id if journal else None - save_version = save_version or run_id or generate_timestamp() + save_version = save_version or generate_timestamp() load_versions = copy.deepcopy(load_versions) or {} + layers: dict[str, set[str]] = defaultdict(set) + + for ds_name, ds_config in catalog.items(): + ds_config = _resolve_credentials( # noqa: redefined-loop-name + ds_config, credentials + ) + if cls._is_pattern(ds_name): + # Add each factory to the dataset_patterns dict. + dataset_patterns[ds_name] = ds_config - missing_keys = load_versions.keys() - catalog.keys() + else: + ds_layer = ds_config.pop("layer", None) + if ds_layer is not None: + layers[ds_layer].add(ds_name) + data_sets[ds_name] = AbstractDataSet.from_config( + ds_name, ds_config, load_versions.get(ds_name), save_version + ) + dataset_layers = layers or None + sorted_patterns = cls._sort_patterns(dataset_patterns) + missing_keys = [ + key + for key in load_versions.keys() + if not (key in catalog or cls._match_pattern(sorted_patterns, key)) + ] if missing_keys: - warn( - f"`load_versions` keys [{', '.join(sorted(missing_keys))}] " + raise DatasetNotFoundError( + f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " f"are not found in the catalog." ) - layers = defaultdict(set) # type: Dict[str, Set[str]] - for ds_name, ds_config in catalog.items(): - ds_layer = ds_config.pop("layer", None) - if ds_layer is not None: - layers[ds_layer].add(ds_name) + return cls( + data_sets=data_sets, + layers=dataset_layers, + dataset_patterns=sorted_patterns, + load_versions=load_versions, + save_version=save_version, + ) - ds_config = _resolve_credentials(ds_config, credentials) - data_sets[ds_name] = AbstractDataSet.from_config( - ds_name, ds_config, load_versions.get(ds_name), save_version - ) + @staticmethod + def _is_pattern(pattern: str): + """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" + return "{" in pattern + + @staticmethod + def _match_pattern(data_set_patterns: Patterns, data_set_name: str) -> str | None: + """Match a dataset name against patterns in a dictionary containing patterns""" + matches = ( + pattern + for pattern in data_set_patterns.keys() + if parse(pattern, data_set_name) + ) + return next(matches, None) - dataset_layers = layers or None - return cls(data_sets=data_sets, journal=journal, layers=dataset_layers) + @classmethod + def _sort_patterns(cls, data_set_patterns: Patterns) -> dict[str, dict[str, Any]]: + """Sort a dictionary of dataset patterns according to parsing rules - + 1. Decreasing specificity (number of characters outside the curly brackets) + 2. Decreasing number of placeholders (number of curly bracket pairs) + 3. 
Alphabetically + """ + sorted_keys = sorted( + data_set_patterns, + key=lambda pattern: ( + -(cls._specificity(pattern)), + -pattern.count("{"), + pattern, + ), + ) + return {key: data_set_patterns[key] for key in sorted_keys} + + @staticmethod + def _specificity(pattern: str) -> int: + """Helper function to check the length of exactly matched characters not inside brackets + Example - + specificity("{namespace}.companies") = 10 + specificity("{namespace}.{dataset}") = 1 + specificity("france.companies") = 16 + """ + # Remove all the placeholders from the pattern and count the number of remaining chars + result = re.sub(r"\{.*?\}", "", pattern) + return len(result) def _get_dataset( - self, data_set_name: str, version: Version = None + self, data_set_name: str, version: Version = None, suggest: bool = True ) -> AbstractDataSet: - if data_set_name not in self._data_sets: - error_msg = f"DataSet '{data_set_name}' not found in the catalog" - - matches = difflib.get_close_matches(data_set_name, self._data_sets.keys()) - if matches: - suggestions = ", ".join(matches) # type: ignore - error_msg += f" - did you mean one of these instead: {suggestions}" + matched_pattern = self._match_pattern(self._dataset_patterns, data_set_name) + if data_set_name not in self._data_sets and matched_pattern: + # If the dataset is a patterned dataset, materialise it and add it to + # the catalog + data_set_config = self._resolve_config(data_set_name, matched_pattern) + ds_layer = data_set_config.pop("layer", None) + if ds_layer: + self.layers = self.layers or {} + self.layers.setdefault(ds_layer, set()).add(data_set_name) + data_set = AbstractDataSet.from_config( + data_set_name, + data_set_config, + self._load_versions.get(data_set_name), + self._save_version, + ) + if self._specificity(matched_pattern) == 0: + self._logger.warning( + "Config from the dataset factory pattern '%s' in the catalog will be used to " + "override the default MemoryDataset creation for the dataset '%s'", + matched_pattern, + data_set_name, + ) - raise DataSetNotFoundError(error_msg) + self.add(data_set_name, data_set) + if data_set_name not in self._data_sets: + error_msg = f"Dataset '{data_set_name}' not found in the catalog" + # Flag to turn on/off fuzzy-matching which can be time consuming and + # slow down plugins like `kedro-viz` + if suggest: + matches = difflib.get_close_matches( + data_set_name, self._data_sets.keys() + ) + if matches: + suggestions = ", ".join(matches) + error_msg += f" - did you mean one of these instead: {suggestions}" + raise DatasetNotFoundError(error_msg) data_set = self._data_sets[data_set_name] if version and isinstance(data_set, AbstractVersionedDataSet): # we only want to return a similar-looking dataset, # not modify the one stored in the current catalog - data_set = data_set._copy( # pylint: disable=protected-access - _version=version - ) + data_set = data_set._copy(_version=version) # noqa: protected-access return data_set - def _get_transformed_dataset_function( - self, data_set_name: str, operation: str, data_set: AbstractDataSet - ) -> Callable: - func = getattr(data_set, operation) - for transformer in reversed(self._transformers[data_set_name]): - func = partial(getattr(transformer, operation), data_set_name, func) - return func + def __contains__(self, data_set_name): + """Check if an item is in the catalog as a materialised dataset or pattern""" + matched_pattern = self._match_pattern(self._dataset_patterns, data_set_name) + if data_set_name in self._data_sets or matched_pattern: + return 
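`_match_pattern`, `_sort_patterns` and `_specificity` together decide which dataset factory pattern wins: a name matches a pattern when `parse(pattern, name)` succeeds, and candidate patterns are ordered by more literal characters outside the curly brackets first, then more placeholders, then alphabetically. A self-contained sketch of that logic with an illustrative pattern list (it relies only on the `parse` package imported above):

import re

from parse import parse


def specificity(pattern: str) -> int:
    # characters left after removing every {placeholder}
    return len(re.sub(r"\{.*?\}", "", pattern))


patterns = ["{namespace}.{dataset}", "{namespace}.companies", "france.companies"]
ordered = sorted(patterns, key=lambda p: (-specificity(p), -p.count("{"), p))
print(ordered)
# ['france.companies', '{namespace}.companies', '{namespace}.{dataset}']

print(next(p for p in ordered if parse(p, "italy.companies")))   # {namespace}.companies
print(next(p for p in ordered if parse(p, "italy.shuttles")))    # {namespace}.{dataset}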
True + return False + + def _resolve_config( + self, + data_set_name: str, + matched_pattern: str, + ) -> dict[str, Any]: + """Get resolved AbstractDataSet from a factory config""" + result = parse(matched_pattern, data_set_name) + config_copy = copy.deepcopy(self._dataset_patterns[matched_pattern]) + # Resolve the factory config for the dataset + for key, value in config_copy.items(): + if isinstance(value, Iterable) and "}" in value: + # result.named: gives access to all dict items in the match result. + # format_map fills in dict values into a string with {...} placeholders + # of the same key name. + try: + config_copy[key] = str(value).format_map(result.named) + except KeyError as exc: + raise DatasetError( + f"Unable to resolve '{key}' for the pattern '{matched_pattern}'" + ) from exc + return config_copy def load(self, name: str, version: str = None) -> Any: """Loads a registered data set. @@ -375,7 +453,7 @@ def load(self, name: str, version: str = None) -> Any: The loaded data as configured. Raises: - DataSetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a data set with the given name has not yet been registered. Example: @@ -395,21 +473,14 @@ def load(self, name: str, version: str = None) -> Any: dataset = self._get_dataset(name, version=load_version) self._logger.info( - "Loading data from `%s` (%s)...", name, type(dataset).__name__ + "Loading data from [dark_orange]%s[/dark_orange] (%s)...", + name, + type(dataset).__name__, + extra={"markup": True}, ) - func = self._get_transformed_dataset_function(name, "load", dataset) - result = func() - - version = ( - dataset.resolve_load_version() - if isinstance(dataset, AbstractVersionedDataSet) - else None - ) + result = dataset.load() - # Log only if versioning is enabled for the data set - if self._journal and version: - self._journal.log_catalog(name, "load", version) return result def save(self, name: str, data: Any) -> None: @@ -421,7 +492,7 @@ def save(self, name: str, data: Any) -> None: data set. Raises: - DataSetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a data set with the given name has not yet been registered. Example: @@ -443,20 +514,14 @@ def save(self, name: str, data: Any) -> None: """ dataset = self._get_dataset(name) - self._logger.info("Saving data to `%s` (%s)...", name, type(dataset).__name__) - - func = self._get_transformed_dataset_function(name, "save", dataset) - func(data) - - version = ( - dataset.resolve_save_version() - if isinstance(dataset, AbstractVersionedDataSet) - else None + self._logger.info( + "Saving data to [dark_orange]%s[/dark_orange] (%s)...", + name, + type(dataset).__name__, + extra={"markup": True}, ) - # Log only if versioning is enabled for the data set - if self._journal and version: - self._journal.log_catalog(name, "save", version) + dataset.save(data) def exists(self, name: str) -> bool: """Checks whether registered data set exists by calling its `exists()` @@ -472,7 +537,7 @@ def exists(self, name: str) -> bool: """ try: dataset = self._get_dataset(name) - except DataSetNotFoundError: + except DatasetNotFoundError: return False return dataset.exists() @@ -483,7 +548,7 @@ def release(self, name: str): name: A data set to be checked. Raises: - DataSetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a data set with the given name has not yet been registered. """ dataset = self._get_dataset(name) @@ -499,11 +564,11 @@ def add( registered yet. 
data_set: A data set object to be associated with the given data set name. - replace: Specifies whether to replace an existing ``DataSet`` + replace: Specifies whether to replace an existing dataset with the same name is allowed. Raises: - DataSetAlreadyExistsError: When a data set with the same name + DatasetAlreadyExistsError: When a data set with the same name has already been registered. Example: @@ -519,28 +584,27 @@ def add( """ if data_set_name in self._data_sets: if replace: - self._logger.warning("Replacing DataSet '%s'", data_set_name) + self._logger.warning("Replacing dataset '%s'", data_set_name) else: - raise DataSetAlreadyExistsError( - f"DataSet '{data_set_name}' has already been registered" + raise DatasetAlreadyExistsError( + f"Dataset '{data_set_name}' has already been registered" ) self._data_sets[data_set_name] = data_set - self._transformers[data_set_name] = list(self._default_transformers) - self.datasets = _FrozenDatasets(self._data_sets) + self.datasets = _FrozenDatasets(self.datasets, {data_set_name: data_set}) def add_all( - self, data_sets: Dict[str, AbstractDataSet], replace: bool = False + self, data_sets: dict[str, AbstractDataSet], replace: bool = False ) -> None: """Adds a group of new data sets to the ``DataCatalog``. Args: - data_sets: A dictionary of ``DataSet`` names and data set + data_sets: A dictionary of dataset names and dataset instances. - replace: Specifies whether to replace an existing ``DataSet`` + replace: Specifies whether to replace an existing dataset with the same name is allowed. Raises: - DataSetAlreadyExistsError: When a data set with the same name + DatasetAlreadyExistsError: When a data set with the same name has already been registered. Example: @@ -563,13 +627,13 @@ def add_all( for name, data_set in data_sets.items(): self.add(name, data_set, replace) - def add_feed_dict(self, feed_dict: Dict[str, Any], replace: bool = False) -> None: - """Adds instances of ``MemoryDataSet``, containing the data provided + def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: + """Adds instances of ``MemoryDataset``, containing the data provided through feed_dict. Args: feed_dict: A feed dict with data to be added in memory. - replace: Specifies whether to replace an existing ``DataSet`` + replace: Specifies whether to replace an existing dataset with the same name is allowed. Example: @@ -592,55 +656,13 @@ def add_feed_dict(self, feed_dict: Dict[str, Any], replace: bool = False) -> Non if isinstance(feed_dict[data_set_name], AbstractDataSet): data_set = feed_dict[data_set_name] else: - data_set = MemoryDataSet(data=feed_dict[data_set_name]) + data_set = MemoryDataset(data=feed_dict[data_set_name]) self.add(data_set_name, data_set, replace) - def add_transformer( - self, - transformer: AbstractTransformer, - data_set_names: Union[str, Iterable[str]] = None, - ): - """Add a ``DataSet`` Transformer to the``DataCatalog``. - Transformers can modify the way Data Sets are loaded and saved. - - Args: - transformer: The transformer instance to add. - data_set_names: The Data Sets to add the transformer to. - Or None to add the transformer to all Data Sets. - Raises: - DataSetNotFoundError: When a transformer is being added to a non - existent data set. - TypeError: When transformer isn't an instance of ``AbstractTransformer`` - """ - - warnings.warn( - "The transformer API will be deprecated in Kedro 0.18.0." - "Please use Dataset Hooks to customise the load and save methods." 
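`add_feed_dict` wraps any plain Python object in a `MemoryDataset` before registering it, while objects that are already `AbstractDataSet` instances are added unchanged. A short sketch of both paths (the dataset names are made up):

from kedro.io import DataCatalog, MemoryDataset

catalog = DataCatalog()
catalog.add_feed_dict({
    "params:threshold": 0.5,                # wrapped in MemoryDataset(data=0.5)
    "cached_rows": MemoryDataset([1, 2]),   # already a dataset, registered as-is
})
print(catalog.list())                       # ['params:threshold', 'cached_rows']
print(catalog.load("params:threshold"))     # 0.5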
- "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/04_hooks.html", - DeprecationWarning, - ) - - if not isinstance(transformer, AbstractTransformer): - raise TypeError( - "Object of type {} is not an instance of AbstractTransformer".format( - type(transformer) - ) - ) - if data_set_names is None: - self._default_transformers.append(transformer) - data_set_names = self._transformers.keys() - elif isinstance(data_set_names, str): - data_set_names = [data_set_names] - for data_set_name in data_set_names: - if data_set_name not in self._data_sets: - raise DataSetNotFoundError(f"No data set called {data_set_name}") - self._transformers[data_set_name].append(transformer) - - def list(self, regex_search: Optional[str] = None) -> List[str]: + def list(self, regex_search: str | None = None) -> list[str]: """ - List of all ``DataSet`` names registered in the catalog. + List of all dataset names registered in the catalog. This can be filtered by providing an optional regular expression which will only return matching keys. @@ -648,7 +670,7 @@ def list(self, regex_search: Optional[str] = None) -> List[str]: regex_search: An optional regular expression which can be provided to limit the data sets returned by a particular pattern. Returns: - A list of ``DataSet`` names available which match the + A list of dataset names available which match the `regex_search` criteria (if provided). All data set names are returned by default. @@ -671,7 +693,7 @@ def list(self, regex_search: Optional[str] = None) -> List[str]: return list(self._data_sets.keys()) if not regex_search.strip(): - logging.warning("The empty string will not match any data sets") + self._logger.warning("The empty string will not match any data sets") return [] try: @@ -679,11 +701,11 @@ def list(self, regex_search: Optional[str] = None) -> List[str]: except re.error as exc: raise SyntaxError( - f"Invalid regular expression provided: `{regex_search}`" + f"Invalid regular expression provided: '{regex_search}'" ) from exc return [dset_name for dset_name in self._data_sets if pattern.search(dset_name)] - def shallow_copy(self) -> "DataCatalog": + def shallow_copy(self) -> DataCatalog: """Returns a shallow copy of the current object. Returns: @@ -691,25 +713,17 @@ def shallow_copy(self) -> "DataCatalog": """ return DataCatalog( data_sets=self._data_sets, - transformers=self._transformers, - default_transformers=self._default_transformers, - journal=self._journal, layers=self.layers, + dataset_patterns=self._dataset_patterns, + load_versions=self._load_versions, + save_version=self._save_version, ) def __eq__(self, other): - return ( - self._data_sets, - self._transformers, - self._default_transformers, - self._journal, - self.layers, - ) == ( + return (self._data_sets, self.layers, self._dataset_patterns) == ( other._data_sets, - other._transformers, - other._default_transformers, - other._journal, other.layers, + other._dataset_patterns, ) def confirm(self, name: str) -> None: @@ -718,13 +732,13 @@ def confirm(self, name: str) -> None: Args: name: Name of the dataset. Raises: - DataSetError: When the dataset does not have `confirm` method. + DatasetError: When the dataset does not have `confirm` method. 
""" - self._logger.info("Confirming DataSet '%s'", name) + self._logger.info("Confirming dataset '%s'", name) data_set = self._get_dataset(name) if hasattr(data_set, "confirm"): data_set.confirm() # type: ignore else: - raise DataSetError(f"DataSet '{name}' does not have 'confirm' method") + raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") diff --git a/kedro/io/data_catalog_with_default.py b/kedro/io/data_catalog_with_default.py deleted file mode 100644 index ee3296bc14..0000000000 --- a/kedro/io/data_catalog_with_default.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""A ``DataCatalog`` with a default ``DataSet`` implementation for any data set -which is not registered in the catalog. -""" -from typing import Any, Callable, Dict, Optional - -from kedro.io.core import AbstractDataSet -from kedro.io.data_catalog import DataCatalog -from kedro.versioning import Journal - - -class DataCatalogWithDefault(DataCatalog): - """A ``DataCatalog`` with a default ``DataSet`` implementation for any - data set which is not registered in the catalog. - """ - - def __init__( - self, - data_sets: Dict[str, AbstractDataSet] = None, - default: Callable[[str], AbstractDataSet] = None, - remember: bool = False, - ): - """A ``DataCatalog`` with a default ``DataSet`` implementation for any - data set which is not registered in the catalog. - - Args: - data_sets: A dictionary of data set names and data set instances. - default: A callable which accepts a single argument of type string, - the key of the data set, and returns an ``AbstractDataSet``. - ``load`` and ``save`` calls on data sets which are not - registered to the catalog will be delegated to this - ``AbstractDataSet``. - remember: If True, then store in the catalog any - ``AbstractDataSet``s provided by the ``default`` callable - argument. Useful when one want to transition from a - ``DataCatalogWithDefault`` to a ``DataCatalog``: just call - ``DataCatalogWithDefault.to_yaml``, after all required data - sets have been saved/loaded, and use the generated YAML file - with a new ``DataCatalog``. 
- Raises: - TypeError: If default is not a callable. - - Example: - :: - - >>> from kedro.extras.datasets.pandas import CSVDataSet - >>> - >>> def default_data_set(name): - >>> return CSVDataSet(filepath='data/01_raw/' + name) - >>> - >>> io = DataCatalog(data_sets={}, - >>> default=default_data_set) - >>> - >>> # load the file in data/raw/cars.csv - >>> df = io.load("cars.csv") - """ - super().__init__(data_sets) - - if not callable(default): - raise TypeError( - "Default must be a callable with a single input " - "string argument: the key of the requested data " - "set." - ) - self._default = default - self._remember = remember - - def load(self, name: str, version: str = None) -> Any: - """Loads a registered data set - - Args: - name: A data set to be loaded. - version: Optional version to be loaded. - - - Returns: - The loaded data as configured. - - Raises: - DataSetNotFoundError: When a data set with the given name - has not yet been registered. - - """ - data_set = self._data_sets.get(name, self._default(name)) - - if self._remember and name not in self._data_sets: - self._data_sets[name] = data_set - - return data_set.load() - - def save(self, name: str, data: Any): - """Save data to a registered data set. - - Args: - name: A data set to be saved to. - data: A data object to be saved as configured in the registered - data set. - - Raises: - DataSetNotFoundError: When a data set with the given name - has not yet been registered. - - """ - data_set = self._data_sets.get(name, self._default(name)) - - if self._remember and name not in self._data_sets: - self._data_sets[name] = data_set - - data_set.save(data) - - # pylint: disable=too-many-arguments - @classmethod - def from_config( - cls, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]] = None, - load_versions: Dict[str, str] = None, - save_version: str = None, - journal: Journal = None, - ): - """To create a ``DataCatalogWithDefault`` from configuration, please - use: - :: - - >>> DataCatalogWithDefault.from_data_catalog( - >>> DataCatalog.from_config(catalog, credentials)) - - Args: - catalog: See ``DataCatalog.from_config`` - credentials: See ``DataCatalog.from_config`` - load_versions: See ``DataCatalog.from_config`` - save_version: See ``DataCatalog.from_config`` - journal: See ``DataCatalog.from_config`` - - Raises: - ValueError: If you try to instantiate a ``DataCatalogWithDefault`` - directly with this method. - - """ - raise ValueError( - "Cannot instantiate a `DataCatalogWithDefault` " - "directly from configuration files. Please use" - "``DataCatalogWithDefault.from_data_catalog(" - "DataCatalog.from_config(catalog, " - "credentials, journal))" - ) - - @classmethod - def from_data_catalog( - cls, data_catalog: DataCatalog, default: Callable[[str], AbstractDataSet] - ) -> "DataCatalogWithDefault": - """Convenience factory method to create a ``DataCatalogWithDefault`` - from a ``DataCatalog`` - - A ``DataCatalog`` with a default ``DataSet`` implementation for any - data set which is not registered in the catalog. - - Args: - data_catalog: The ``DataCatalog`` to convert to a - ``DataCatalogWithDefault``. - default: A callable which accepts a single argument of type string, - the key of the data set, and returns an ``AbstractDataSet``. - ``load`` and ``save`` calls on data sets which are not - registered to the catalog will be delegated to this - ``AbstractDataSet``. 
- - Returns: - A new ``DataCatalogWithDefault`` which contains all the - ``AbstractDataSets`` from the provided data-catalog. - - """ - # pylint: disable=protected-access - return cls({**data_catalog._data_sets}, default) - - def shallow_copy(self) -> "DataCatalogWithDefault": # pragma: no cover - """Returns a shallow copy of the current object. - Returns: - Copy of the current object. - """ - return DataCatalogWithDefault({**self._data_sets}, self._default) diff --git a/kedro/io/lambda_data_set.py b/kedro/io/lambda_data_set.py deleted file mode 100644 index 2dd1977f94..0000000000 --- a/kedro/io/lambda_data_set.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -"""``LambdaDataSet`` is an implementation of ``AbstractDataSet`` which allows for -providing custom load, save, and exists methods without extending -``AbstractDataSet``. -""" -from typing import Any, Callable, Dict, Optional - -from kedro.io.core import AbstractDataSet, DataSetError - - -class LambdaDataSet(AbstractDataSet): - """``LambdaDataSet`` loads and saves data to a data set. - It relies on delegating to specific implementation such as csv, sql, etc. - - ``LambdaDataSet`` class captures Exceptions while performing operations on - composed ``DataSet`` implementations. The composed data set is - responsible for providing information on how to resolve the issue when - possible. This information should be available through str(error). - - Example: - :: - - >>> from kedro.io import LambdaDataSet - >>> import pandas as pd - >>> - >>> file_name = "test.csv" - >>> def load() -> pd.DataFrame: - >>> raise FileNotFoundError("'{}' csv file not found." 
- >>> .format(file_name)) - >>> data_set = LambdaDataSet(load, None) - """ - - def _describe(self) -> Dict[str, Any]: - def _to_str(func): - if not func: - return None - try: - return f"<{func.__module__}.{func.__name__}>" - except AttributeError: # pragma: no cover - return str(func) - - descr = { - "load": _to_str(self.__load), - "save": _to_str(self.__save), - "exists": _to_str(self.__exists), - "release": _to_str(self.__release), - } - - return descr - - def _save(self, data: Any) -> None: - if not self.__save: - raise DataSetError( - "Cannot save to data set. No `save` function " - "provided when LambdaDataSet was created." - ) - self.__save(data) - - def _load(self) -> Any: - if not self.__load: - raise DataSetError( - "Cannot load data set. No `load` function " - "provided when LambdaDataSet was created." - ) - return self.__load() - - def _exists(self) -> bool: - if not self.__exists: - return super()._exists() - return self.__exists() - - def _release(self) -> None: - if not self.__release: - super()._release() - else: - self.__release() - - def __init__( - self, - load: Optional[Callable[[], Any]], - save: Optional[Callable[[Any], None]], - exists: Callable[[], bool] = None, - release: Callable[[], None] = None, - ): - """Creates a new instance of ``LambdaDataSet`` with references to the - required input/output data set methods. - - Args: - load: Method to load data from a data set. - save: Method to save data to a data set. - exists: Method to check whether output data already exists. - release: Method to release any cached information. - - Raises: - DataSetError: If a method is specified, but is not a Callable. - - """ - - for name, value in [ - ("load", load), - ("save", save), - ("exists", exists), - ("release", release), - ]: - if value is not None and not callable(value): - raise DataSetError( - "`{}` function for LambdaDataSet must be a Callable. " - "Object of type `{}` provided instead.".format( - name, value.__class__.__name__ - ) - ) - - self.__load = load - self.__save = save - self.__exists = exists - self.__release = release diff --git a/kedro/io/lambda_dataset.py b/kedro/io/lambda_dataset.py new file mode 100644 index 0000000000..b2cca48921 --- /dev/null +++ b/kedro/io/lambda_dataset.py @@ -0,0 +1,136 @@ +"""``LambdaDataset`` is an implementation of ``AbstractDataSet`` which allows for +providing custom load, save, and exists methods without extending +``AbstractDataSet``. +""" +from __future__ import annotations + +import warnings +from typing import Any, Callable + +from kedro.io.core import AbstractDataSet, DatasetError + +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +LambdaDataSet: type[LambdaDataset] + + +class LambdaDataset(AbstractDataSet): + """``LambdaDataset`` loads and saves data to a data set. + It relies on delegating to specific implementation such as csv, sql, etc. + + ``LambdaDataset`` class captures Exceptions while performing operations on + composed ``Dataset`` implementations. The composed data set is + responsible for providing information on how to resolve the issue when + possible. This information should be available through str(error). + + Example: + :: + + >>> from kedro.io import LambdaDataset + >>> import pandas as pd + >>> + >>> file_name = "test.csv" + >>> def load() -> pd.DataFrame: + >>> raise FileNotFoundError("'{}' csv file not found." 
+ >>> .format(file_name)) + >>> data_set = LambdaDataset(load, None) + """ + + def _describe(self) -> dict[str, Any]: + def _to_str(func): + if not func: + return None + try: + return f"<{func.__module__}.{func.__name__}>" + except AttributeError: # pragma: no cover + return str(func) + + descr = { + "load": _to_str(self.__load), + "save": _to_str(self.__save), + "exists": _to_str(self.__exists), + "release": _to_str(self.__release), + } + + return descr + + def _save(self, data: Any) -> None: + if not self.__save: + raise DatasetError( + "Cannot save to data set. No 'save' function " + "provided when LambdaDataset was created." + ) + self.__save(data) + + def _load(self) -> Any: + if not self.__load: + raise DatasetError( + "Cannot load data set. No 'load' function " + "provided when LambdaDataset was created." + ) + return self.__load() + + def _exists(self) -> bool: + if not self.__exists: + return super()._exists() + return self.__exists() + + def _release(self) -> None: + if not self.__release: + super()._release() + else: + self.__release() + + def __init__( # noqa: too-many-arguments + self, + load: Callable[[], Any] | None, + save: Callable[[Any], None] | None, + exists: Callable[[], bool] = None, + release: Callable[[], None] = None, + metadata: dict[str, Any] = None, + ): + """Creates a new instance of ``LambdaDataset`` with references to the + required input/output data set methods. + + Args: + load: Method to load data from a data set. + save: Method to save data to a data set. + exists: Method to check whether output data already exists. + release: Method to release any cached information. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + + Raises: + DatasetError: If a method is specified, but is not a Callable. + + """ + + for name, value in [ + ("load", load), + ("save", save), + ("exists", exists), + ("release", release), + ]: + if value is not None and not callable(value): + raise DatasetError( + f"'{name}' function for LambdaDataset must be a Callable. " + f"Object of type '{value.__class__.__name__}' provided instead." + ) + + self.__load = load + self.__save = save + self.__exists = exists + self.__release = release + self.metadata = metadata + + +def __getattr__(name): + if name == "LambdaDataSet": + alias = LambdaDataset + warnings.warn( + f"{repr(name)} has been renamed to {repr(alias.__name__)}, " + f"and the alias will be removed in Kedro 0.19.0", + DeprecationWarning, + stacklevel=2, + ) + return alias + raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git a/kedro/io/memory_data_set.py b/kedro/io/memory_dataset.py similarity index 57% rename from kedro/io/memory_data_set.py rename to kedro/io/memory_dataset.py index 27f9defe03..1dc5ded1b0 100644 --- a/kedro/io/memory_data_set.py +++ b/kedro/io/memory_dataset.py @@ -1,55 +1,32 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
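The module-level `__getattr__` added above (PEP 562) is how the old `LambdaDataSet` spelling keeps working while emitting a `DeprecationWarning`; the same pattern is repeated for `MemoryDataSet` below. A stripped-down sketch of the mechanism with placeholder names (`OldName`/`NewName` and the module name are hypothetical):

# deprecation_sketch.py -- the alias only kicks in when another module
# imports this one and looks up the old attribute on it.
import warnings


class NewName:
    pass


def __getattr__(name):
    if name == "OldName":
        warnings.warn(
            f"{name!r} has been renamed to 'NewName' and the alias "
            f"will be removed in a future release",
            DeprecationWarning,
            stacklevel=2,
        )
        return NewName
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")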
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``MemoryDataSet`` is a data set implementation which handles in-memory data. +"""``MemoryDataset`` is a data set implementation which handles in-memory data. """ +from __future__ import annotations import copy -from typing import Any, Dict +import warnings +from typing import Any -from kedro.io.core import AbstractDataSet, DataSetError +from kedro.io.core import AbstractDataSet, DatasetError _EMPTY = object() +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +MemoryDataSet: type[MemoryDataset] -class MemoryDataSet(AbstractDataSet): - """``MemoryDataSet`` loads and saves data from/to an in-memory + +class MemoryDataset(AbstractDataSet): + """``MemoryDataset`` loads and saves data from/to an in-memory Python object. Example: :: - >>> from kedro.io import MemoryDataSet + >>> from kedro.io import MemoryDataset >>> import pandas as pd >>> >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) - >>> data_set = MemoryDataSet(data=data) + >>> data_set = MemoryDataset(data=data) >>> >>> loaded_data = data_set.load() >>> assert loaded_data.equals(data) @@ -61,8 +38,10 @@ class MemoryDataSet(AbstractDataSet): """ - def __init__(self, data: Any = _EMPTY, copy_mode: str = None): - """Creates a new instance of ``MemoryDataSet`` pointing to the + def __init__( + self, data: Any = _EMPTY, copy_mode: str = None, metadata: dict[str, Any] = None + ): + """Creates a new instance of ``MemoryDataset`` pointing to the provided Python object. Args: @@ -70,15 +49,18 @@ def __init__(self, data: Any = _EMPTY, copy_mode: str = None): copy_mode: The copy mode used to copy the data. Possible values are: "deepcopy", "copy" and "assign". If not provided, it is inferred based on the data type. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" self._data = _EMPTY self._copy_mode = copy_mode + self.metadata = metadata if data is not _EMPTY: self._save(data) def _load(self) -> Any: if self._data is _EMPTY: - raise DataSetError("Data for MemoryDataSet has not been saved yet.") + raise DatasetError("Data for MemoryDataset has not been saved yet.") copy_mode = self._copy_mode or _infer_copy_mode(self._data) data = _copy_with_mode(self._data, copy_mode=copy_mode) @@ -94,12 +76,12 @@ def _exists(self) -> bool: def _release(self) -> None: self._data = _EMPTY - def _describe(self) -> Dict[str, Any]: + def _describe(self) -> dict[str, Any]: if self._data is not _EMPTY: - return dict(data=f"<{type(self._data).__name__}>") + return {"data": f"<{type(self._data).__name__}>"} # the string representation of datasets leaves out __init__ # arguments that are empty/None, equivalent here is _EMPTY - return dict(data=None) # pragma: no cover + return {"data": None} # pragma: no cover def _infer_copy_mode(data: Any) -> str: @@ -111,7 +93,7 @@ def _infer_copy_mode(data: Any) -> str: Returns: One of "copy", "assign" or "deepcopy" as the copy mode to use. """ - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel try: import pandas as pd except ImportError: # pragma: no cover @@ -139,7 +121,7 @@ def _copy_with_mode(data: Any, copy_mode: str) -> Any: copy_mode: The copy mode to use, one of "deepcopy", "copy" and "assign". Raises: - DataSetError: If copy_mode is specified, but isn't valid + DatasetError: If copy_mode is specified, but isn't valid (i.e: not one of deepcopy, copy, assign) Returns: @@ -152,10 +134,22 @@ def _copy_with_mode(data: Any, copy_mode: str) -> Any: elif copy_mode == "assign": copied_data = data else: - raise DataSetError( - "Invalid copy mode: {}. Possible values are: deepcopy, copy, assign.".format( - copy_mode - ) + raise DatasetError( + f"Invalid copy mode: {copy_mode}. " + f"Possible values are: deepcopy, copy, assign." ) return copied_data + + +def __getattr__(name): + if name == "MemoryDataSet": + alias = MemoryDataset + warnings.warn( + f"{repr(name)} has been renamed to {repr(alias.__name__)}, " + f"and the alias will be removed in Kedro 0.19.0", + DeprecationWarning, + stacklevel=2, + ) + return alias + raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git a/kedro/io/partitioned_data_set.py b/kedro/io/partitioned_dataset.py similarity index 69% rename from kedro/io/partitioned_data_set.py rename to kedro/io/partitioned_dataset.py index 412e7de76c..3efe414c59 100644 --- a/kedro/io/partitioned_data_set.py +++ b/kedro/io/partitioned_dataset.py @@ -1,40 +1,14 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``PartitionedDataSet`` loads and saves partitioned file-like data using the +"""``PartitionedDataset`` loads and saves partitioned file-like data using the underlying dataset definition. It also uses `fsspec` for filesystem level operations. """ +from __future__ import annotations + import operator +import warnings from copy import deepcopy from pathlib import PurePosixPath -from typing import Any, Callable, Dict, List, Type, Union +from typing import Any, Callable from urllib.parse import urlparse -from warnings import warn from cachetools import Cache, cachedmethod @@ -42,7 +16,7 @@ VERSION_KEY, VERSIONED_FLAG_KEY, AbstractDataSet, - DataSetError, + DatasetError, parse_dataset_definition, ) from kedro.io.data_catalog import CREDENTIALS_KEY @@ -58,6 +32,9 @@ S3_PROTOCOLS = ("s3", "s3a", "s3n") +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +PartitionedDataSet: type[PartitionedDataset] +IncrementalDataSet: type[IncrementalDataset] def _grandparent(path: str) -> str: path_obj = PurePosixPath(path) @@ -71,55 +48,118 @@ def _grandparent(path: str) -> str: return str(grandparent) -class PartitionedDataSet(AbstractDataSet): - # pylint: disable=too-many-instance-attributes,protected-access - """``PartitionedDataSet`` loads and saves partitioned file-like data using the +class PartitionedDataset(AbstractDataSet): + # noqa: too-many-instance-attributes,protected-access + """``PartitionedDataset`` loads and saves partitioned file-like data using the underlying dataset definition. For filesystem level operations it uses `fsspec`: https://github.com/intake/filesystem_spec. - Example: + It also supports advanced features like + `lazy saving `_. + + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + station_data: + type: PartitionedDataset + path: data/03_primary/station_data + dataset: + type: pandas.CSVDataset + load_args: + sep: '\\t' + save_args: + sep: '\\t' + index: true + filename_suffix: '.dat' + + Example usage for the + `Python API `_: :: >>> import pandas as pd - >>> from kedro.io import PartitionedDataSet + >>> from kedro.io import PartitionedDataset + >>> + >>> # Create a fake pandas dataframe with 10 rows of data + >>> df = pd.DataFrame([{"DAY_OF_MONTH": str(i), "VALUE": i} for i in range(1, 11)]) + >>> + >>> # Convert it to a dict of pd.DataFrame with DAY_OF_MONTH as the dict key + >>> dict_df = { + day_of_month: df[df["DAY_OF_MONTH"] == day_of_month] + for day_of_month in df["DAY_OF_MONTH"] + } + >>> + >>> # Save it as small paritions with DAY_OF_MONTH as the partition key + >>> data_set = PartitionedDataset( + path="df_with_partition", + dataset="pandas.CSVDataset", + filename_suffix=".csv" + ) + >>> # This will create a folder `df_with_partition` and save multiple files + >>> # with the dict key + filename_suffix as filename, i.e. 1.csv, 2.csv etc. 
+ >>> data_set.save(dict_df) + >>> + >>> # This will create lazy load functions instead of loading data into memory immediately. + >>> loaded = data_set.load() + >>> + >>> # Load all the partitions + >>> for partition_id, partition_load_func in loaded.items(): + # The actual function that loads the data + partition_data = partition_load_func() + >>> + >>> # Add the processing logic for individual partition HERE + >>> print(partition_data) + + You can also load multiple partitions from a remote storage and combine them + like this: + :: + + >>> import pandas as pd + >>> from kedro.io import PartitionedDataset >>> >>> # these credentials will be passed to both 'fsspec.filesystem()' call >>> # and the dataset initializer >>> credentials = {"key1": "secret1", "key2": "secret2"} >>> - >>> data_set = PartitionedDataSet( - >>> path="s3://bucket-name/path/to/folder", - >>> dataset="pandas.CSVDataSet", - >>> credentials=credentials - >>> ) + >>> data_set = PartitionedDataset( + path="s3://bucket-name/path/to/folder", + dataset="pandas.CSVDataset", + credentials=credentials + ) >>> loaded = data_set.load() >>> # assert isinstance(loaded, dict) >>> >>> combine_all = pd.DataFrame() >>> >>> for partition_id, partition_load_func in loaded.items(): - >>> partition_data = partition_load_func() - >>> combine_all = pd.concat( - >>> [combine_all, partition_data], ignore_index=True, sort=True - >>> ) + partition_data = partition_load_func() + combine_all = pd.concat( + [combine_all, partition_data], ignore_index=True, sort=True + ) >>> >>> new_data = pd.DataFrame({"new": [1, 2]}) >>> # creates "s3://bucket-name/path/to/folder/new/partition.csv" >>> data_set.save({"new/partition.csv": new_data}) - >>> + """ - def __init__( # pylint: disable=too-many-arguments + def __init__( # noqa: too-many-arguments self, path: str, - dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]], + dataset: str | type[AbstractDataSet] | dict[str, Any], filepath_arg: str = "filepath", filename_suffix: str = "", - credentials: Dict[str, Any] = None, - load_args: Dict[str, Any] = None, - fs_args: Dict[str, Any] = None, + credentials: dict[str, Any] = None, + load_args: dict[str, Any] = None, + fs_args: dict[str, Any] = None, + overwrite: bool = False, + metadata: dict[str, Any] = None, ): - """Creates a new instance of ``PartitionedDataSet``. + """Creates a new instance of ``PartitionedDataset``. Args: path: Path to the folder containing partitioned data. @@ -129,7 +169,7 @@ def __init__( # pylint: disable=too-many-arguments ``fsspec.implementations.local.LocalFileSystem`` will be used. **Note:** Some concrete implementations are bundled with ``fsspec``, while others (like ``s3`` or ``gcs``) must be installed separately - prior to usage of the ``PartitionedDataSet``. + prior to usage of the ``PartitionedDataset``. dataset: Underlying dataset definition. This is used to instantiate the dataset for each file located inside the ``path``. Accepted formats are: @@ -150,21 +190,29 @@ def __init__( # pylint: disable=too-many-arguments and the dataset initializer. If the dataset config contains explicit credentials spec, then such spec will take precedence. All possible credentials management scenarios are documented here: - https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html#partitioned-dataset-credentials + https://kedro.readthedocs.io/en/stable/data/kedro_io.html#partitioned-dataset-credentials load_args: Keyword arguments to be passed into ``find()`` method of the filesystem implementation. 
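The ``dataset`` argument above accepts the three forms spelled out by its ``str | type[AbstractDataSet] | dict[str, Any]`` annotation. An illustrative sketch (paths, dataset type and arguments are made up for demonstration):

    from kedro.io import PartitionedDataset

    # 1. dataset given as a type string, as in the docstring examples above:
    by_name = PartitionedDataset(
        path="data/01_raw/events",
        dataset="pandas.CSVDataset",
        filename_suffix=".csv",
    )

    # 2. dataset given as a full config dict, e.g. to pass save arguments
    #    to every partition; load_args are forwarded to fsspec's find():
    by_config = PartitionedDataset(
        path="data/01_raw/events",
        dataset={"type": "pandas.CSVDataset", "save_args": {"index": False}},
        filename_suffix=".csv",
        load_args={"maxdepth": 1},
    )
    # (3. An AbstractDataSet subclass can also be passed directly as the dataset.)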
fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + overwrite: If True, any existing partitions will be removed. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + + Raises: + DatasetError: If versioning is enabled for the underlying dataset. """ - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from fsspec.utils import infer_storage_options # for performance reasons super().__init__() self._path = path self._filename_suffix = filename_suffix + self._overwrite = overwrite self._protocol = infer_storage_options(self._path)["protocol"] - self._partition_cache = Cache(maxsize=1) # type: Cache + self._partition_cache: Cache = Cache(maxsize=1) + self.metadata = metadata dataset = dataset if isinstance(dataset, dict) else {"type": dataset} self._dataset_type, self._dataset_config = parse_dataset_definition(dataset) @@ -192,9 +240,9 @@ def __init__( # pylint: disable=too-many-arguments self._filepath_arg = filepath_arg if self._filepath_arg in self._dataset_config: - warn( - "`{}` key must not be specified in the dataset definition as it " - "will be overwritten by partition path".format(self._filepath_arg) + warnings.warn( + f"'{self._filepath_arg}' key must not be specified in the dataset " + f"definition as it will be overwritten by partition path" ) self._load_args = deepcopy(load_args) or {} @@ -205,7 +253,7 @@ def __init__( # pylint: disable=too-many-arguments @property def _filesystem(self): # for performance reasons - import fsspec # pylint: disable=import-outside-toplevel + import fsspec # noqa: import-outside-toplevel protocol = "s3" if self._protocol in S3_PROTOCOLS else self._protocol return fsspec.filesystem(protocol, **self._credentials, **self._fs_args) @@ -217,7 +265,7 @@ def _normalized_path(self) -> str: return self._path @cachedmethod(cache=operator.attrgetter("_partition_cache")) - def _list_partitions(self) -> List[str]: + def _list_partitions(self) -> list[str]: dataset_is_versioned = VERSION_KEY in self._dataset_config return [ _grandparent(path) if dataset_is_versioned else path @@ -226,10 +274,11 @@ def _list_partitions(self) -> List[str]: ] def _join_protocol(self, path: str) -> str: - if self._path.startswith(self._protocol) and not path.startswith( - self._protocol + protocol_prefix = f"{self._protocol}://" + if self._path.startswith(protocol_prefix) and not path.startswith( + protocol_prefix ): - return f"{self._protocol}://{path}" + return f"{protocol_prefix}{path}" return path def _partition_to_path(self, path: str): @@ -245,7 +294,7 @@ def _path_to_partition(self, path: str) -> str: path = path[: -len(self._filename_suffix)] return path - def _load(self) -> Dict[str, Callable[[], Any]]: + def _load(self) -> dict[str, Callable[[], Any]]: partitions = {} for partition in self._list_partitions(): @@ -257,11 +306,14 @@ def _load(self) -> Dict[str, Callable[[], Any]]: partitions[partition_id] = dataset.load if not partitions: - raise DataSetError(f"No partitions found in `{self._path}`") + raise DatasetError(f"No partitions found in '{self._path}'") return partitions - def _save(self, data: Dict[str, Any]) -> None: + def _save(self, data: dict[str, Any]) -> None: + if self._overwrite and self._filesystem.exists(self._normalized_path): + self._filesystem.rm(self._normalized_path, recursive=True) + for partition_id, partition_data in sorted(data.items()): kwargs = deepcopy(self._dataset_config) 
partition = self._partition_to_path(partition_id) @@ -269,21 +321,21 @@ def _save(self, data: Dict[str, Any]) -> None: kwargs[self._filepath_arg] = self._join_protocol(partition) dataset = self._dataset_type(**kwargs) # type: ignore if callable(partition_data): - partition_data = partition_data() + partition_data = partition_data() # noqa: redefined-loop-name dataset.save(partition_data) self._invalidate_caches() - def _describe(self) -> Dict[str, Any]: + def _describe(self) -> dict[str, Any]: clean_dataset_config = ( {k: v for k, v in self._dataset_config.items() if k != CREDENTIALS_KEY} if isinstance(self._dataset_config, dict) else self._dataset_config ) - return dict( - path=self._path, - dataset_type=self._dataset_type.__name__, - dataset_config=clean_dataset_config, - ) + return { + "path": self._path, + "dataset_type": self._dataset_type.__name__, + "dataset_config": clean_dataset_config, + } def _invalidate_caches(self): self._partition_cache.clear() @@ -297,11 +349,11 @@ def _release(self) -> None: self._invalidate_caches() -class IncrementalDataSet(PartitionedDataSet): - """``IncrementalDataSet`` inherits from ``PartitionedDataSet``, which loads +class IncrementalDataset(PartitionedDataset): + """``IncrementalDataset`` inherits from ``PartitionedDataset``, which loads and saves partitioned file-like data using the underlying dataset definition. For filesystem level operations it uses `fsspec`: - https://github.com/intake/filesystem_spec. ``IncrementalDataSet`` also stores + https://github.com/intake/filesystem_spec. ``IncrementalDataset`` also stores the information about the last processed partition in so-called `checkpoint` that is persisted to the location of the data partitions by default, so that subsequent pipeline run loads only new partitions past the checkpoint. @@ -309,7 +361,7 @@ class IncrementalDataSet(PartitionedDataSet): Example: :: - >>> from kedro.io import IncrementalDataSet + >>> from kedro.io import IncrementalDataset >>> >>> # these credentials will be passed to: >>> # a) 'fsspec.filesystem()' call, @@ -317,9 +369,9 @@ class IncrementalDataSet(PartitionedDataSet): >>> # c) the checkpoint initializer >>> credentials = {"key1": "secret1", "key2": "secret2"} >>> - >>> data_set = IncrementalDataSet( + >>> data_set = IncrementalDataset( >>> path="s3://bucket-name/path/to/folder", - >>> dataset="pandas.CSVDataSet", + >>> dataset="pandas.CSVDataset", >>> credentials=credentials >>> ) >>> loaded = data_set.load() # loads all available partitions @@ -336,20 +388,20 @@ class IncrementalDataSet(PartitionedDataSet): DEFAULT_CHECKPOINT_TYPE = "kedro.extras.datasets.text.TextDataSet" DEFAULT_CHECKPOINT_FILENAME = "CHECKPOINT" - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, path: str, - dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]], - checkpoint: Union[str, Dict[str, Any]] = None, + dataset: str | type[AbstractDataSet] | dict[str, Any], + checkpoint: str | dict[str, Any] | None = None, filepath_arg: str = "filepath", filename_suffix: str = "", - credentials: Dict[str, Any] = None, - load_args: Dict[str, Any] = None, - fs_args: Dict[str, Any] = None, + credentials: dict[str, Any] = None, + load_args: dict[str, Any] = None, + fs_args: dict[str, Any] = None, + metadata: dict[str, Any] = None, ): - """Creates a new instance of ``IncrementalDataSet``. + """Creates a new instance of ``IncrementalDataset``. Args: path: Path to the folder containing partitioned data. 
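The ``checkpoint`` argument of ``IncrementalDataset`` accepts either a plain string, treated as ``force_checkpoint``, or a config dict resolved like any other dataset definition. A hedged sketch using the defaults visible above (paths and the checkpoint value are illustrative; a dotted-path ``comparison_func`` is resolved by the constructor):

    from kedro.io import IncrementalDataset

    # Force the checkpoint value instead of reading it from storage:
    forced = IncrementalDataset(
        path="data/01_raw/events",
        dataset="pandas.CSVDataset",
        checkpoint="2023-01-31",  # parsed as {"force_checkpoint": "2023-01-31"}
    )

    # Or configure the checkpoint dataset explicitly:
    configured = IncrementalDataset(
        path="data/01_raw/events",
        dataset="pandas.CSVDataset",
        checkpoint={
            "type": "kedro.extras.datasets.text.TextDataSet",  # DEFAULT_CHECKPOINT_TYPE
            "filepath": "data/01_raw/events/CHECKPOINT",        # DEFAULT_CHECKPOINT_FILENAME
            "comparison_func": "operator.gt",                   # default: only partitions past the checkpoint
        },
    )
    new_partitions = configured.load()  # loads only the partitions the comparison accepts
    configured.confirm()                # persists the latest processed partition id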
@@ -359,7 +411,7 @@ def __init__( ``fsspec.implementations.local.LocalFileSystem`` will be used. **Note:** Some concrete implementations are bundled with ``fsspec``, while others (like ``s3`` or ``gcs``) must be installed separately - prior to usage of the ``PartitionedDataSet``. + prior to usage of the ``PartitionedDataset``. dataset: Underlying dataset definition. This is used to instantiate the dataset for each file located inside the ``path``. Accepted formats are: @@ -373,7 +425,7 @@ def __init__( with the corresponding dataset definition including ``filepath`` (unlike ``dataset`` argument). Checkpoint configuration is described here: - https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html#checkpoint-configuration + https://kedro.readthedocs.io/en/stable/data/kedro_io.html#checkpoint-configuration Credentials for the checkpoint can be explicitly specified in this configuration. filepath_arg: Underlying dataset initializer argument that will @@ -388,11 +440,13 @@ def __init__( the dataset or the checkpoint configuration contains explicit credentials spec, then such spec will take precedence. All possible credentials management scenarios are documented here: - https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html#partitioned-dataset-credentials + https://kedro.readthedocs.io/en/stable/data/kedro_io.html#partitioned-dataset-credentials load_args: Keyword arguments to be passed into ``find()`` method of the filesystem implementation. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: If versioning is enabled for the checkpoint dataset. @@ -410,6 +464,7 @@ def __init__( self._checkpoint_config = self._parse_checkpoint_config(checkpoint) self._force_checkpoint = self._checkpoint_config.pop("force_checkpoint", None) + self.metadata = metadata comparison_func = self._checkpoint_config.pop("comparison_func", operator.gt) if isinstance(comparison_func, str): @@ -417,19 +472,17 @@ def __init__( self._comparison_func = comparison_func def _parse_checkpoint_config( - self, checkpoint_config: Union[str, Dict[str, Any], None] - ) -> Dict[str, Any]: + self, checkpoint_config: str | dict[str, Any] | None + ) -> dict[str, Any]: checkpoint_config = deepcopy(checkpoint_config) if isinstance(checkpoint_config, str): checkpoint_config = {"force_checkpoint": checkpoint_config} checkpoint_config = checkpoint_config or {} for key in {VERSION_KEY, VERSIONED_FLAG_KEY} & checkpoint_config.keys(): - raise DataSetError( - "`{}` does not support versioning of the checkpoint. " - "Please remove `{}` key from the checkpoint definition.".format( - self.__class__.__name__, key - ) + raise DatasetError( + f"'{self.__class__.__name__}' does not support versioning of the " + f"checkpoint. Please remove '{key}' key from the checkpoint definition." 
) default_checkpoint_path = self._sep.join( @@ -451,12 +504,10 @@ def _parse_checkpoint_config( return {**default_config, **checkpoint_config} @cachedmethod(cache=operator.attrgetter("_partition_cache")) - def _list_partitions(self) -> List[str]: + def _list_partitions(self) -> list[str]: checkpoint = self._read_checkpoint() - checkpoint_path = ( - self._filesystem._strip_protocol( # pylint: disable=protected-access - self._checkpoint_config[self._filepath_arg] - ) + checkpoint_path = self._filesystem._strip_protocol( # noqa: protected-access + self._checkpoint_config[self._filepath_arg] ) dataset_is_versioned = VERSION_KEY in self._dataset_config @@ -482,15 +533,15 @@ def _checkpoint(self) -> AbstractDataSet: type_, kwargs = parse_dataset_definition(self._checkpoint_config) return type_(**kwargs) # type: ignore - def _read_checkpoint(self) -> Union[str, None]: + def _read_checkpoint(self) -> str | None: if self._force_checkpoint is not None: return self._force_checkpoint try: return self._checkpoint.load() - except DataSetError: + except DatasetError: return None - def _load(self) -> Dict[str, Callable[[], Any]]: + def _load(self) -> dict[str, Callable[[], Any]]: partitions = {} for partition in self._list_partitions(): @@ -510,3 +561,22 @@ def confirm(self) -> None: partition_ids = [self._path_to_partition(p) for p in self._list_partitions()] if partition_ids: self._checkpoint.save(partition_ids[-1]) # checkpoint to last partition + + +_DEPRECATED_ERROR_CLASSES = { + "PartitionedDataSet": PartitionedDataset, + "IncrementalDataSet": IncrementalDataset, +} + + +def __getattr__(name): + if name in _DEPRECATED_ERROR_CLASSES: + alias = _DEPRECATED_ERROR_CLASSES[name] + warnings.warn( + f"{repr(name)} has been renamed to {repr(alias.__name__)}, " + f"and the alias will be removed in Kedro 0.19.0", + DeprecationWarning, + stacklevel=2, + ) + return alias + raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git a/kedro/io/transformers.py b/kedro/io/transformers.py deleted file mode 100644 index fe318160f8..0000000000 --- a/kedro/io/transformers.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
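The module-level ``__getattr__`` shims above keep the old ``*DataSet`` spellings importable while warning about the rename. For example:

    import warnings
    from kedro.io import partitioned_dataset

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        legacy = partitioned_dataset.PartitionedDataSet  # resolved via module __getattr__
    assert legacy is partitioned_dataset.PartitionedDataset
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)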
-# -# See the License for the specific language governing permissions and -# limitations under the License. -"""``Transformers`` modify the loading and saving of ``DataSets`` in a -``DataCatalog``. -""" -import abc -from typing import Any, Callable - - -class AbstractTransformer(abc.ABC): - """Transformers will be deprecated in Kedro 0.18.0 in favour of the Dataset Hooks. - - ``AbstractTransformer`` is the base class for all transformer implementations. - All transformer implementations should extend this abstract class - and customise the `load` and `save` methods where appropriate.""" - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - """ - This method will be deprecated in Kedro 0.18.0 in favour of the Dataset Hooks - `before_dataset_loaded` and `after_dataset_loaded`. - - Wrap the loading of a dataset. - Call ``load`` to get the data from the data set / next transformer. - - Args: - data_set_name: The name of the data set being loaded. - load: A callback to retrieve the data being loaded from the - data set / next transformer. - - Returns: - The loaded data. - """ - # pylint: disable=unused-argument, no-self-use - return load() - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - """ - This method will be deprecated in Kedro 0.18.0 in favour of the Dataset Hooks - `before_dataset_saved` and `after_dataset_saved`. - - Wrap the saving of a dataset. - Call ``save`` to pass the data to the data set / next transformer. - - Args: - data_set_name: The name of the data set being saved. - save: A callback to pass the data being saved on to the - data set / next transformer. - data: The data being saved - """ - # pylint: disable=unused-argument, no-self-use - save(data) diff --git a/kedro/ipython/__init__.py b/kedro/ipython/__init__.py new file mode 100644 index 0000000000..8341822255 --- /dev/null +++ b/kedro/ipython/__init__.py @@ -0,0 +1,171 @@ +""" +This script creates an IPython extension to load Kedro-related variables in +local scope. +""" +from __future__ import annotations + +import logging +import sys +from pathlib import Path +from typing import Any + +from IPython import get_ipython +from IPython.core.magic import needs_local_scope, register_line_magic +from IPython.core.magic_arguments import argument, magic_arguments, parse_argstring + +from kedro.framework.cli import load_entry_points +from kedro.framework.cli.project import PARAMS_ARG_HELP +from kedro.framework.cli.utils import ENV_HELP, _split_params +from kedro.framework.project import ( + LOGGING, # noqa + configure_project, + pipelines, +) +from kedro.framework.session import KedroSession +from kedro.framework.startup import _is_project, bootstrap_project + +logger = logging.getLogger(__name__) + + +def load_ipython_extension(ipython): + """ + Main entry point when %load_ext kedro.ipython is executed, either manually or + automatically through `kedro ipython` or `kedro jupyter lab/notebook`. + IPython will look for this function specifically. + See https://ipython.readthedocs.io/en/stable/config/extensions/index.html + """ + ipython.register_magic_function(magic_reload_kedro, magic_name="reload_kedro") + + if _find_kedro_project(Path.cwd()) is None: + logger.warning( + "Kedro extension was registered but couldn't find a Kedro project. " + "Make sure you run '%reload_kedro '." + ) + return + + reload_kedro() + + +@needs_local_scope +@magic_arguments() +@argument( + "path", + type=str, + help=( + "Path to the project root directory. 
If not given, use the previously set" + "project root." + ), + nargs="?", + default=None, +) +@argument("-e", "--env", type=str, default=None, help=ENV_HELP) +@argument( + "--params", + type=lambda value: _split_params(None, None, value), + default=None, + help=PARAMS_ARG_HELP, +) +def magic_reload_kedro(line: str, local_ns: dict[str, Any] = None): + """ + The `%reload_kedro` IPython line magic. + See https://kedro.readthedocs.io/en/stable/notebooks_and_ipython/kedro_and_notebooks.html#reload-kedro-line-magic # noqa: line-too-long + for more. + """ + args = parse_argstring(magic_reload_kedro, line) + reload_kedro(args.path, args.env, args.params, local_ns) + + +def reload_kedro( + path: str = None, + env: str = None, + extra_params: dict[str, Any] = None, + local_namespace: dict[str, Any] | None = None, +) -> None: # pragma: no cover + """Function that underlies the %reload_kedro Line magic. This should not be imported + or run directly but instead invoked through %reload_kedro.""" + + project_path = _resolve_project_path(path, local_namespace) + + metadata = bootstrap_project(project_path) + _remove_cached_modules(metadata.package_name) + configure_project(metadata.package_name) + + session = KedroSession.create( + metadata.package_name, project_path, env=env, extra_params=extra_params + ) + context = session.load_context() + catalog = context.catalog + + get_ipython().push( + variables={ + "context": context, + "catalog": catalog, + "session": session, + "pipelines": pipelines, + } + ) + + logger.info("Kedro project %s", str(metadata.project_name)) + logger.info( + "Defined global variable 'context', 'session', 'catalog' and 'pipelines'" + ) + + for line_magic in load_entry_points("line_magic"): + register_line_magic(needs_local_scope(line_magic)) + logger.info("Registered line magic '%s'", line_magic.__name__) # type: ignore + + +def _resolve_project_path( + path: str | None = None, local_namespace: dict[str, Any] | None = None +) -> Path: + """ + Resolve the project path to use with reload_kedro, updating or adding it + (in-place) to the local ipython Namespace (``local_namespace``) if necessary. + + Arguments: + path: the path to use as a string object + local_namespace: Namespace with local variables of the scope where the line + magic is invoked in a dict. + """ + if path: + project_path = Path(path).expanduser().resolve() + else: + if local_namespace and "context" in local_namespace: + # noqa: protected-access + project_path = local_namespace["context"]._project_path + else: + project_path = _find_kedro_project(Path.cwd()) + if project_path: + logger.info( + "Resolved project path as: %s.\nTo set a different path, run " + "'%%reload_kedro '", + project_path, + ) + + # noqa: protected-access + if ( + project_path + and local_namespace + and "context" in local_namespace + and project_path != local_namespace["context"]._project_path + ): + logger.info("Updating path to Kedro project: %s...", project_path) + + return project_path + + +def _remove_cached_modules(package_name): # pragma: no cover + to_remove = [mod for mod in sys.modules if mod.startswith(package_name)] + # `del` is used instead of `reload()` because: If the new version of a module does not + # define a name that was defined by the old version, the old definition remains. 
+ for module in to_remove: + del sys.modules[module] + + +def _find_kedro_project(current_dir: Path): # pragma: no cover + while current_dir != current_dir.parent: + if _is_project(current_dir): + return current_dir + current_dir = current_dir.parent + + return None diff --git a/kedro/ipython/logo-32x32.png b/kedro/ipython/logo-32x32.png new file mode 100644 index 0000000000..d5d99ce6f1 Binary files /dev/null and b/kedro/ipython/logo-32x32.png differ diff --git a/kedro/ipython/logo-64x64.png b/kedro/ipython/logo-64x64.png new file mode 100644 index 0000000000..826a40720b Binary files /dev/null and b/kedro/ipython/logo-64x64.png differ diff --git a/kedro/ipython/logo-svg.svg b/kedro/ipython/logo-svg.svg new file mode 100644 index 0000000000..58109f4eb1 --- /dev/null +++ b/kedro/ipython/logo-svg.svg @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/kedro/logging.py b/kedro/logging.py new file mode 100644 index 0000000000..534776c566 --- /dev/null +++ b/kedro/logging.py @@ -0,0 +1,58 @@ +""" +This module contains a logging handler class which produces coloured logs and tracebacks. +""" + +import logging +import os +import sys +from pathlib import Path + +import click +import rich.logging +import rich.pretty +import rich.traceback + + +class RichHandler(rich.logging.RichHandler): + """Identical to rich's logging handler but with a few extra behaviours: + * warnings issued by the `warnings` module are redirected to logging + * pretty printing is enabled on the Python REPL (including IPython and Jupyter) + * all tracebacks are handled by rich when rich_tracebacks=True + * constructor's arguments are mapped and passed to `rich.traceback.install` + + The list of available options of ``RichHandler`` can be found here: + https://rich.readthedocs.io/en/stable/reference/logging.html#rich.logging.RichHandler + + The list of available options of `rich.traceback.install` can be found here: + https://rich.readthedocs.io/en/stable/reference/traceback.html#rich.traceback.install + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + logging.captureWarnings(True) + rich.pretty.install() + + # We suppress click here to hide tracebacks related to it conversely, + # kedro is not suppressed to show its tracebacks for easier debugging. + # sys.executable is used to get the kedro executable path to hide the + # top level traceback. + + traceback_install_kwargs = { + "suppress": [click, str(Path(sys.executable).parent)] + } + + # Mapping arguments from RichHandler's Constructor to rich.traceback.install + prefix = "tracebacks_" + for key, value in kwargs.items(): + if key.startswith(prefix): + key_prefix_removed = key[len(prefix) :] + if key_prefix_removed == "suppress": + traceback_install_kwargs[key_prefix_removed].extend(value) + else: + traceback_install_kwargs[key_prefix_removed] = value + + if self.rich_tracebacks and "DATABRICKS_RUNTIME_VERSION" not in os.environ: + # Rich traceback handling does not work on databricks. Hopefully this will be + # fixed on their side at some point, but until then we disable it. 
+ # See https://github.com/Textualize/rich/issues/2455 + rich.traceback.install(**traceback_install_kwargs) diff --git a/kedro/pipeline/__init__.py b/kedro/pipeline/__init__.py index 069d17e7c8..c9136f1077 100644 --- a/kedro/pipeline/__init__.py +++ b/kedro/pipeline/__init__.py @@ -1,35 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.pipeline`` provides functionality to define and execute data-driven pipelines. """ -from .modular_pipeline import pipeline # NOQA -from .node import node # NOQA -from .pipeline import Pipeline # NOQA +from .modular_pipeline import pipeline +from .node import node +from .pipeline import Pipeline + +__all__ = ["pipeline", "node", "Pipeline"] diff --git a/kedro/pipeline/decorators.py b/kedro/pipeline/decorators.py deleted file mode 100644 index 9f994468c6..0000000000 --- a/kedro/pipeline/decorators.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
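A hedged sketch of wiring the ``RichHandler`` above into plain ``logging``. Kedro projects normally configure it via their logging config, but direct use shows how the ``tracebacks_*`` arguments are forwarded:

    import logging
    from kedro.logging import RichHandler

    # Keyword arguments prefixed with "tracebacks_" have the prefix stripped and are
    # passed on to rich.traceback.install(), e.g. tracebacks_show_locals -> show_locals.
    handler = RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)

    logging.basicConfig(level="INFO", handlers=[handler], format="%(message)s")
    logging.getLogger(__name__).info("rich handler configured")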
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -"""A module containing predefined node decorators in Kedro. -""" - -import logging -import time -from functools import wraps -from typing import Callable - - -def _func_full_name(func: Callable): - if not getattr(func, "__module__", None): - return getattr(func, "__qualname__", repr(func)) - return f"{func.__module__}.{func.__qualname__}" - - -def _human_readable_time(elapsed: float): # pragma: no cover - mins, secs = divmod(elapsed, 60) - hours, mins = divmod(mins, 60) - - if hours > 0: - message = "%dh%02dm%02ds" % (hours, mins, secs) - elif mins > 0: - message = "%dm%02ds" % (mins, secs) - elif secs >= 1: - message = f"{secs:.2f}s" - else: - message = f"{secs * 1000.0:.0f}ms" - - return message - - -def log_time(func: Callable) -> Callable: - """A function decorator which logs the time taken for executing a function. - - Args: - func: The function to be logged. - - Returns: - A wrapped function, which will execute the provided function and log - the running time. - - """ - - @wraps(func) - def with_time(*args, **kwargs): - log = logging.getLogger(__name__) - t_start = time.time() - result = func(*args, **kwargs) - t_end = time.time() - elapsed = t_end - t_start - - log.info( - "Running %r took %s [%.3fs]", - _func_full_name(func), - _human_readable_time(elapsed), - elapsed, - ) - return result - - return with_time diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index a529020664..0f429eed56 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -1,33 +1,8 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
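Since the ``log_time`` decorator is removed here, equivalent timing can live in node hooks instead. A minimal sketch, assuming the standard ``before_node_run``/``after_node_run`` hook specs (the class and logger name are made up):

    import logging
    import time

    from kedro.framework.hooks import hook_impl

    class NodeTimerHooks:
        """Logs how long each node takes, standing in for the removed log_time decorator."""

        def __init__(self):
            self._start_times = {}

        @hook_impl
        def before_node_run(self, node):
            self._start_times[node.name] = time.perf_counter()

        @hook_impl
        def after_node_run(self, node):
            started = self._start_times.pop(node.name, time.perf_counter())
            elapsed = time.perf_counter() - started
            logging.getLogger(__name__).info("Running %r took %.3fs", node.name, elapsed)

    # Registered via HOOKS = (NodeTimerHooks(),) in the project's settings.py.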
"""Helper to integrate modular pipelines into a master pipeline.""" +from __future__ import annotations + import copy -from typing import AbstractSet, Dict, List, Set, Union +from typing import AbstractSet, Iterable from kedro.pipeline.node import Node from kedro.pipeline.pipeline import ( @@ -71,7 +46,7 @@ def _validate_inputs_outputs( if any(_is_parameter(i) for i in inputs): raise ModularPipelineError( - "Parameters should be specified in the `parameters` argument" + "Parameters should be specified in the 'parameters' argument" ) free_inputs = {_strip_transcoding(i) for i in pipe.inputs()} @@ -96,23 +71,22 @@ def _validate_datasets_exist( non_existent = (inputs | outputs | parameters) - existing if non_existent: raise ModularPipelineError( - "Failed to map datasets and/or parameters: {}".format( - ", ".join(sorted(non_existent)) - ) + f"Failed to map datasets and/or parameters: " + f"{', '.join(sorted(non_existent))}" ) def _get_dataset_names_mapping( - names: Union[str, Set[str], Dict[str, str]] = None -) -> Dict[str, str]: + names: str | set[str] | dict[str, str] | None = None +) -> dict[str, str]: """Take a name or a collection of dataset names and turn it into a mapping from the old dataset names to the provided ones if necessary. Args: names: A dataset name or collection of dataset names. - When str or Set[str] is provided, the listed names will stay + When str or set[str] is provided, the listed names will stay the same as they are named in the provided pipeline. - When Dict[str, str] is provided, current names will be + When dict[str, str] is provided, current names will be mapped to new names in the resultant pipeline. Returns: A dictionary that maps the old dataset names to the provided ones. @@ -120,9 +94,9 @@ def _get_dataset_names_mapping( >>> _get_dataset_names_mapping("dataset_name") {"dataset_name": "dataset_name"} # a str name will stay the same >>> _get_dataset_names_mapping(set(["ds_1", "ds_2"])) - {"ds_1": "ds_1", "ds_2": "ds_2"} # a Set[str] of names will stay the same + {"ds_1": "ds_1", "ds_2": "ds_2"} # a set[str] of names will stay the same >>> _get_dataset_names_mapping({"ds_1": "new_ds_1_name"}) - {"ds_1": "new_ds_1_name"} # a Dict[str, str] of names will map key to value + {"ds_1": "new_ds_1_name"} # a dict[str, str] of names will map key to value """ if names is None: return {} @@ -140,8 +114,8 @@ def _normalize_param_name(name: str) -> str: def _get_param_names_mapping( - names: Union[str, Set[str], Dict[str, str]] = None -) -> Dict[str, str]: + names: str | set[str] | dict[str, str] | None = None +) -> dict[str, str]: """Take a parameter or a collection of parameter names and turn it into a mapping from existing parameter names to new ones if necessary. It follows the same rule as `_get_dataset_names_mapping` and @@ -149,9 +123,9 @@ def _get_param_names_mapping( Args: names: A parameter name or collection of parameter names. - When str or Set[str] is provided, the listed names will stay + When str or set[str] is provided, the listed names will stay the same as they are named in the provided pipeline. - When Dict[str, str] is provided, current names will be + When dict[str, str] is provided, current names will be mapped to new names in the resultant pipeline. Returns: A dictionary that maps the old parameter names to the provided ones. 
@@ -159,10 +133,10 @@ def _get_param_names_mapping( >>> _get_param_names_mapping("param_name") {"params:param_name": "params:param_name"} # a str name will stay the same >>> _get_param_names_mapping(set(["param_1", "param_2"])) - # a Set[str] of names will stay the same + # a set[str] of names will stay the same {"params:param_1": "params:param_1", "params:param_2": "params:param_2"} >>> _get_param_names_mapping({"param_1": "new_name_for_param_1"}) - # a Dict[str, str] of names will map key to valu + # a dict[str, str] of names will map key to valu {"params:param_1": "params:new_name_for_param_1"} """ params = {} @@ -176,41 +150,46 @@ def _get_param_names_mapping( return params -def pipeline( - pipe: Pipeline, +def pipeline( # noqa: too-many-arguments + pipe: Iterable[Node | Pipeline] | Pipeline, *, - inputs: Union[str, Set[str], Dict[str, str]] = None, - outputs: Union[str, Set[str], Dict[str, str]] = None, - parameters: Union[str, Set[str], Dict[str, str]] = None, + inputs: str | set[str] | dict[str, str] | None = None, + outputs: str | set[str] | dict[str, str] | None = None, + parameters: str | set[str] | dict[str, str] | None = None, + tags: str | Iterable[str] | None = None, namespace: str = None, ) -> Pipeline: - """Create a copy of the pipeline and its nodes, - with some dataset names, parameter names and node names modified. - + r"""Create a ``Pipeline`` from a collection of nodes and/or ``Pipeline``\s. Args: - pipe: Original modular pipeline to integrate + pipe: The nodes the ``Pipeline`` will be made of. If you + provide pipelines among the list of nodes, those pipelines will + be expanded and all their nodes will become part of this + new pipeline. inputs: A name or collection of input names to be exposed as connection points - to other pipelines upstream. - When str or Set[str] is provided, the listed input names will stay + to other pipelines upstream. This is optional; if not provided, the + pipeline inputs are automatically inferred from the pipeline structure. + When str or set[str] is provided, the listed input names will stay the same as they are named in the provided pipeline. - When Dict[str, str] is provided, current input names will be + When dict[str, str] is provided, current input names will be mapped to new names. Must only refer to the pipeline's free inputs. outputs: A name or collection of names to be exposed as connection points - to other pipelines downstream. - When str or Set[str] is provided, the listed output names will stay + to other pipelines downstream. This is optional; if not provided, the + pipeline inputs are automatically inferred from the pipeline structure. + When str or set[str] is provided, the listed output names will stay the same as they are named in the provided pipeline. - When Dict[str, str] is provided, current output names will be + When dict[str, str] is provided, current output names will be mapped to new names. Can refer to both the pipeline's free outputs, as well as intermediate results that need to be exposed. parameters: A name or collection of parameters to namespace. - When str or Set[str] are provided, the listed parameter names will stay + When str or set[str] are provided, the listed parameter names will stay the same as they are named in the provided pipeline. - When Dict[str, str] is provided, current parameter names will be + When dict[str, str] is provided, current parameter names will be mapped to new names. The parameters can be specified without the `params:` prefix. 
+ tags: Optional set of tags to be applied to all the pipeline nodes. namespace: A prefix to give to all dataset names, except those explicitly named with the `inputs`/`outputs` arguments, and parameter references (`params:` and `parameters`). @@ -222,9 +201,18 @@ def pipeline( any of the expected types (str, dict, list, or None). Returns: - A new ``Pipeline`` object with the new nodes, modified as requested. + A new ``Pipeline`` object. """ - # pylint: disable=protected-access + if isinstance(pipe, Pipeline): + # To ensure that we are always dealing with a *copy* of pipe. + pipe = Pipeline([pipe], tags=tags) + else: + pipe = Pipeline(pipe, tags=tags) + + if not any([inputs, outputs, parameters, namespace]): + return pipe + + # noqa: protected-access inputs = _get_dataset_names_mapping(inputs) outputs = _get_dataset_names_mapping(outputs) parameters = _get_param_names_mapping(parameters) @@ -271,8 +259,8 @@ def _rename(name: str): return name def _process_dataset_names( - datasets: Union[None, str, List[str], Dict[str, str]] - ) -> Union[None, str, List[str], Dict[str, str]]: + datasets: None | str | list[str] | dict[str, str] + ) -> None | str | list[str] | dict[str, str]: if datasets is None: return None if isinstance(datasets, str): @@ -301,4 +289,4 @@ def _copy_node(node: Node) -> Node: new_nodes = [_copy_node(n) for n in pipe.nodes] - return Pipeline(new_nodes) + return Pipeline(new_nodes, tags=tags) diff --git a/kedro/pipeline/node.py b/kedro/pipeline/node.py index e69ebded17..0d38f664c2 100644 --- a/kedro/pipeline/node.py +++ b/kedro/pipeline/node.py @@ -1,58 +1,33 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """This module provides user-friendly functions for creating nodes as parts of Kedro pipelines. 
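To show the reworked ``pipeline()`` factory above in action, a short sketch (dataset, node and tag names are illustrative):

    from kedro.pipeline import node, pipeline

    def clean(raw):
        return raw

    base = pipeline([
        node(clean, inputs="raw_data", outputs="model_input", name="clean_data"),
    ])

    # Re-use the same nodes under a namespace: the free input is remapped explicitly,
    # every other dataset and node name gets the "candidate." prefix, and all nodes are tagged.
    candidate = pipeline(
        base,
        inputs={"raw_data": "candidate_raw_data"},
        namespace="candidate",
        tags="experiment",
    )

    assert "candidate_raw_data" in candidate.inputs()
    assert "candidate.model_input" in candidate.outputs()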
""" +from __future__ import annotations + import copy import inspect import logging import re from collections import Counter -from functools import reduce -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Union +from typing import Any, Callable, Iterable from warnings import warn +from more_itertools import spy, unzip + -class Node: # pylint: disable=too-many-instance-attributes +class Node: """``Node`` is an auxiliary class facilitating the operations required to run user-provided functions as part of Kedro pipelines. """ - def __init__( + def __init__( # noqa: too-many-arguments self, func: Callable, - inputs: Union[None, str, List[str], Dict[str, str]], - outputs: Union[None, str, List[str], Dict[str, str]], + inputs: None | str | list[str] | dict[str, str], + outputs: None | str | list[str] | dict[str, str], *, name: str = None, - tags: Union[str, Iterable[str]] = None, - decorators: Iterable[Callable] = None, - confirms: Union[str, List[str]] = None, + tags: str | Iterable[str] | None = None, + confirms: str | list[str] | None = None, namespace: str = None, ): """Create a node in the pipeline by providing a function to be called @@ -64,17 +39,16 @@ def __init__( inputs: The name or the list of the names of variables used as inputs to the function. The number of names should match the number of arguments in the definition of the provided - function. When Dict[str, str] is provided, variable names + function. When dict[str, str] is provided, variable names will be mapped to function argument names. outputs: The name or the list of the names of variables used as outputs to the function. The number of names should match the number of outputs returned by the provided function. - When Dict[str, str] is provided, variable names will be mapped + When dict[str, str] is provided, variable names will be mapped to the named outputs the function returns. name: Optional node name to be used when displaying the node in logs or any other visualisations. tags: Optional set of tags to be applied to the node. - decorators: Optional list of decorators to be applied to the node. confirms: Optional name or the list of the names of the datasets that should be confirmed. This will result in calling ``confirm()`` method of the corresponding data set instance. @@ -97,29 +71,29 @@ def __init__( if not callable(func): raise ValueError( _node_error_message( - f"first argument must be a function, not `{type(func).__name__}`." + f"first argument must be a function, not '{type(func).__name__}'." ) ) if inputs and not isinstance(inputs, (list, dict, str)): raise ValueError( _node_error_message( - f"`inputs` type must be one of [String, List, Dict, None], " - f"not `{type(inputs).__name__}`." + f"'inputs' type must be one of [String, List, Dict, None], " + f"not '{type(inputs).__name__}'." ) ) if outputs and not isinstance(outputs, (list, dict, str)): raise ValueError( _node_error_message( - f"`outputs` type must be one of [String, List, Dict, None], " - f"not `{type(outputs).__name__}`." + f"'outputs' type must be one of [String, List, Dict, None], " + f"not '{type(outputs).__name__}'." 
) ) if not inputs and not outputs: raise ValueError( - _node_error_message("it must have some `inputs` or `outputs`.") + _node_error_message("it must have some 'inputs' or 'outputs'.") ) self._validate_inputs(func, inputs) @@ -135,7 +109,6 @@ def __init__( self._name = name self._namespace = namespace self._tags = set(_to_list(tags)) - self._decorators = list(decorators or []) self._validate_unique_outputs() self._validate_inputs_dif_than_outputs() @@ -152,7 +125,6 @@ def _copy(self, **overwrite_params): "name": self._name, "namespace": self._namespace, "tags": self._tags, - "decorators": self._decorators, "confirms": self._confirms, } params.update(overwrite_params) @@ -166,6 +138,9 @@ def _logger(self): def _unique_key(self): def hashable(value): if isinstance(value, dict): + # we sort it because a node with inputs/outputs + # {"arg1": "a", "arg2": "b"} is equivalent to + # a node with inputs/outputs {"arg2": "b", "arg1": "a"} return tuple(sorted(value.items())) if isinstance(value, list): return tuple(value) @@ -187,21 +162,22 @@ def __hash__(self): return hash(self._unique_key) def __str__(self): - def _sorted_set_to_str(xset): - return f"[{','.join(sorted(xset))}]" + def _set_to_str(xset): + return f"[{';'.join(xset)}]" - out_str = _sorted_set_to_str(self.outputs) if self._outputs else "None" - in_str = _sorted_set_to_str(self.inputs) if self._inputs else "None" + out_str = _set_to_str(self.outputs) if self._outputs else "None" + in_str = _set_to_str(self.inputs) if self._inputs else "None" prefix = self._name + ": " if self._name else "" return prefix + f"{self._func_name}({in_str}) -> {out_str}" def __repr__(self): # pragma: no cover - return "Node({}, {!r}, {!r}, {!r})".format( - self._func_name, self._inputs, self._outputs, self._name + return ( + f"Node({self._func_name}, {repr(self._inputs)}, {repr(self._outputs)}, " + f"{repr(self._name)})" ) - def __call__(self, **kwargs) -> Dict[str, Any]: + def __call__(self, **kwargs) -> dict[str, Any]: return self.run(inputs=kwargs) @property @@ -209,9 +185,9 @@ def _func_name(self) -> str: name = _get_readable_func_name(self._func) if name == "": warn( - f"The node producing outputs `{self.outputs}` is made from a `partial` function. " - f"Partial functions do not have a `__name__` attribute: consider using " - f"`functools.update_wrapper` for better log messages." + f"The node producing outputs '{self.outputs}' is made from a 'partial' function. " + f"Partial functions do not have a '__name__' attribute: consider using " + f"'functools.update_wrapper' for better log messages." ) return name @@ -235,7 +211,7 @@ def func(self, func: Callable): self._func = func @property - def tags(self) -> Set[str]: + def tags(self) -> set[str]: """Return the tags assigned to the node. Returns: @@ -244,7 +220,7 @@ def tags(self) -> Set[str]: """ return set(self._tags) - def tag(self, tags: Union[str, Iterable[str]]) -> "Node": + def tag(self, tags: str | Iterable[str]) -> Node: """Create a new ``Node`` which is an exact copy of the current one, but with more tags added to it. @@ -283,7 +259,7 @@ def short_name(self) -> str: return self._func_name.replace("_", " ").title() @property - def namespace(self) -> Optional[str]: + def namespace(self) -> str | None: """Node's namespace. Returns: @@ -292,7 +268,7 @@ def namespace(self) -> Optional[str]: return self._namespace @property - def inputs(self) -> List[str]: + def inputs(self) -> list[str]: """Return node inputs as a list, in the order required to bind them properly to the node's function. 
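A small sketch of the ``Node`` surface shown above, using a dict to map dataset names onto function arguments and ``tag()`` to derive a copy (all names are illustrative):

    from kedro.pipeline import node

    def train(features, labels):
        return {"n_features": len(features), "n_labels": len(labels)}

    trainer = node(
        train,
        inputs={"features": "model_input_table", "labels": "target"},  # arg name -> dataset name
        outputs="model",
        name="train_model",
        tags="training",
    )

    nightly = trainer.tag(["nightly"])  # returns a copy with the extra tag
    assert trainer.tags == {"training"}
    assert nightly.tags == {"training", "nightly"}
    print(nightly)                      # human-readable summary: inputs -> outputs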
@@ -305,7 +281,7 @@ def inputs(self) -> List[str]: return _to_list(self._inputs) @property - def outputs(self) -> List[str]: + def outputs(self) -> list[str]: """Return node outputs as a list preserving the original order if possible. @@ -316,7 +292,7 @@ def outputs(self) -> List[str]: return _to_list(self._outputs) @property - def confirms(self) -> List[str]: + def confirms(self) -> list[str]: """Return dataset names to confirm as a list. Returns: @@ -324,92 +300,7 @@ def confirms(self) -> List[str]: """ return _to_list(self._confirms) - @property - def _decorated_func(self): - return reduce(lambda g, f: f(g), self._decorators, self._func) - - def decorate(self, *decorators: Callable) -> "Node": - """Create a new ``Node`` by applying the provided decorators to the - underlying function. If no decorators are passed, it will return a - new ``Node`` object, but with no changes to the function. - - Args: - decorators: Decorators to be applied on the node function. - Decorators will be applied from right to left. - - Returns: - A new ``Node`` object with the decorators applied to the function. - - Example: - :: - - >>> - >>> from functools import wraps - >>> - >>> - >>> def apply_f(func: Callable) -> Callable: - >>> @wraps(func) - >>> def with_f(*args, **kwargs): - >>> args = ["f({})".format(a) for a in args] - >>> return func(*args, **kwargs) - >>> return with_f - >>> - >>> - >>> def apply_g(func: Callable) -> Callable: - >>> @wraps(func) - >>> def with_g(*args, **kwargs): - >>> args = ["g({})".format(a) for a in args] - >>> return func(*args, **kwargs) - >>> return with_g - >>> - >>> - >>> def apply_h(func: Callable) -> Callable: - >>> @wraps(func) - >>> def with_h(*args, **kwargs): - >>> args = ["h({})".format(a) for a in args] - >>> return func(*args, **kwargs) - >>> return with_h - >>> - >>> - >>> def apply_fg(func: Callable) -> Callable: - >>> @wraps(func) - >>> def with_fg(*args, **kwargs): - >>> args = ["fg({})".format(a) for a in args] - >>> return func(*args, **kwargs) - >>> return with_fg - >>> - >>> - >>> def identity(value): - >>> return value - >>> - >>> - >>> # using it as a regular python decorator - >>> @apply_f - >>> def decorated_identity(value): - >>> return value - >>> - >>> - >>> # wrapping the node function - >>> old_node = node(apply_g(decorated_identity), 'input', 'output', - >>> name='node') - >>> # using the .decorate() method to apply multiple decorators - >>> new_node = old_node.decorate(apply_h, apply_fg) - >>> result = new_node.run(dict(input=1)) - >>> - >>> assert old_node.name == new_node.name - >>> assert "output" in result - >>> assert result['output'] == "f(g(fg(h(1))))" - """ - warn( - "The node's `decorate` API will be deprecated in Kedro 0.18.0." - "Please use a node's Hooks to extend the node's behaviour in a pipeline." - "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/04_hooks.html", - DeprecationWarning, - ) - return self._copy(decorators=self._decorators + list(reversed(decorators))) - - def run(self, inputs: Dict[str, Any] = None) -> Dict[str, Any]: + def run(self, inputs: dict[str, Any] = None) -> dict[str, Any]: """Run this node using the provided inputs and return its results in a dictionary. 
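# Editor's note -- the deprecation message above points at node Hooks as the
# replacement for the removed `decorate` API. A hedged sketch of such a hook
# (class name, logger and messages are illustrative); hook implementations may
# declare only the spec arguments they actually need, and are registered e.g. via
# HOOKS = (NodeLoggingHooks(),) in the project's settings.py.
import logging
from kedro.framework.hooks import hook_impl

class NodeLoggingHooks:
    @hook_impl
    def before_node_run(self, node, inputs):
        logging.getLogger(__name__).info("Running %s with inputs %s", node.name, list(inputs))

    @hook_impl
    def after_node_run(self, node, outputs):
        logging.getLogger(__name__).info("Finished %s, produced %s", node.name, list(outputs))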
@@ -448,7 +339,7 @@ def run(self, inputs: Dict[str, Any] = None) -> Dict[str, Any]: ) try: - inputs = dict() if inputs is None else inputs + inputs = {} if inputs is None else inputs if not self._inputs: outputs = self._run_with_no_inputs(inputs) elif isinstance(self._inputs, str): @@ -462,101 +353,110 @@ def run(self, inputs: Dict[str, Any] = None) -> Dict[str, Any]: # purposely catch all exceptions except Exception as exc: - self._logger.error("Node `%s` failed with error: \n%s", str(self), str(exc)) + self._logger.error( + "Node %s failed with error: \n%s", + str(self), + str(exc), + extra={"markup": True}, + ) raise exc - def _run_with_no_inputs(self, inputs: Dict[str, Any]): + def _run_with_no_inputs(self, inputs: dict[str, Any]): if inputs: raise ValueError( - "Node {} expected no inputs, " - "but got the following {} input(s) instead: {}".format( - str(self), len(inputs), list(sorted(inputs.keys())) - ) + f"Node {str(self)} expected no inputs, " + f"but got the following {len(inputs)} input(s) instead: " + f"{sorted(inputs.keys())}." ) - return self._decorated_func() + return self._func() - def _run_with_one_input(self, inputs: Dict[str, Any], node_input: str): + def _run_with_one_input(self, inputs: dict[str, Any], node_input: str): if len(inputs) != 1 or node_input not in inputs: raise ValueError( - "Node {} expected one input named '{}', " - "but got the following {} input(s) instead: {}".format( - str(self), node_input, len(inputs), list(sorted(inputs.keys())) - ) + f"Node {str(self)} expected one input named '{node_input}', " + f"but got the following {len(inputs)} input(s) instead: " + f"{sorted(inputs.keys())}." ) - return self._decorated_func(inputs[node_input]) + return self._func(inputs[node_input]) - def _run_with_list(self, inputs: Dict[str, Any], node_inputs: List[str]): + def _run_with_list(self, inputs: dict[str, Any], node_inputs: list[str]): # Node inputs and provided run inputs should completely overlap if set(node_inputs) != set(inputs.keys()): raise ValueError( - "Node {} expected {} input(s) {}, " - "but got the following {} input(s) instead: {}.".format( - str(self), - len(node_inputs), - node_inputs, - len(inputs), - list(sorted(inputs.keys())), - ) + f"Node {str(self)} expected {len(node_inputs)} input(s) {node_inputs}, " + f"but got the following {len(inputs)} input(s) instead: " + f"{sorted(inputs.keys())}." ) # Ensure the function gets the inputs in the correct order - return self._decorated_func(*[inputs[item] for item in node_inputs]) + return self._func(*(inputs[item] for item in node_inputs)) - def _run_with_dict(self, inputs: Dict[str, Any], node_inputs: Dict[str, str]): + def _run_with_dict(self, inputs: dict[str, Any], node_inputs: dict[str, str]): # Node inputs and provided run inputs should completely overlap if set(node_inputs.values()) != set(inputs.keys()): raise ValueError( - "Node {} expected {} input(s) {}, " - "but got the following {} input(s) instead: {}.".format( - str(self), - len(set(node_inputs.values())), - list(sorted(set(node_inputs.values()))), - len(inputs), - list(sorted(inputs.keys())), - ) + f"Node {str(self)} expected {len(set(node_inputs.values()))} input(s) " + f"{sorted(set(node_inputs.values()))}, " + f"but got the following {len(inputs)} input(s) instead: " + f"{sorted(inputs.keys())}." 
) kwargs = {arg: inputs[alias] for arg, alias in node_inputs.items()} - return self._decorated_func(**kwargs) + return self._func(**kwargs) def _outputs_to_dictionary(self, outputs): def _from_dict(): - if set(self._outputs.keys()) != set(outputs.keys()): + result, iterator = outputs, None + # generator functions are lazy and we need a peek into their first output + if inspect.isgenerator(outputs): + (result,), iterator = spy(outputs) + + keys = list(self._outputs.keys()) + names = list(self._outputs.values()) + if not isinstance(result, dict): + raise ValueError( + f"Failed to save outputs of node {self}.\n" + f"The node output is a dictionary, whereas the " + f"function output is {type(result)}." + ) + if set(keys) != set(result.keys()): raise ValueError( - "Failed to save outputs of node {}.\n" - "The node's output keys {} do not " - "match with the returned output's keys {}.".format( - str(self), set(outputs.keys()), set(self._outputs.keys()) - ) + f"Failed to save outputs of node {str(self)}.\n" + f"The node's output keys {set(result.keys())} " + f"do not match with the returned output's keys {set(keys)}." ) - return {name: outputs[key] for key, name in self._outputs.items()} + if iterator: + exploded = map(lambda x: tuple(x[k] for k in keys), iterator) + result = unzip(exploded) + else: + # evaluate this eagerly so we can reuse variable name + result = tuple(result[k] for k in keys) + return dict(zip(names, result)) def _from_list(): - if not isinstance(outputs, (list, tuple)): + result, iterator = outputs, None + # generator functions are lazy and we need a peek into their first output + if inspect.isgenerator(outputs): + (result,), iterator = spy(outputs) + + if not isinstance(result, (list, tuple)): raise ValueError( - "Failed to save outputs of node {}.\n" - "The node definition contains a list of " - "outputs {}, whereas the node function " - "returned a `{}`.".format( - str(self), self._outputs, type(outputs).__name__ - ) + f"Failed to save outputs of node {str(self)}.\n" + f"The node definition contains a list of " + f"outputs {self._outputs}, whereas the node function " + f"returned a '{type(result).__name__}'." ) - if len(outputs) != len(self._outputs): + if len(result) != len(self._outputs): raise ValueError( - "Failed to save outputs of node {}.\n" - "The node function returned {} output(s), " - "whereas the node definition contains {} " - "output(s).".format(str(self), len(outputs), len(self._outputs)) + f"Failed to save outputs of node {str(self)}.\n" + f"The node function returned {len(result)} output(s), " + f"whereas the node definition contains {len(self._outputs)} " + f"output(s)." ) - return dict(zip(self._outputs, outputs)) - - if isinstance(self._outputs, dict) and not isinstance(outputs, dict): - raise ValueError( - f"Failed to save outputs of node {self}.\n" - f"The node output is a dictionary, whereas the " - f"function output is not." 
- ) + if iterator: + result = unzip(iterator) + return dict(zip(self._outputs, result)) if self._outputs is None: return {} @@ -602,11 +502,11 @@ def _validate_inputs_dif_than_outputs(self): ) @staticmethod - def _process_inputs_for_bind(inputs: Union[None, str, List[str], Dict[str, str]]): + def _process_inputs_for_bind(inputs: None | str | list[str] | dict[str, str]): # Safeguard that we do not mutate list inputs inputs = copy.copy(inputs) - args = [] # type: List[str] - kwargs = {} # type: Dict[str, str] + args: list[str] = [] + kwargs: dict[str, str] = {} if isinstance(inputs, str): args = [inputs] elif isinstance(inputs, list): @@ -618,19 +518,19 @@ def _process_inputs_for_bind(inputs: Union[None, str, List[str], Dict[str, str]] def _node_error_message(msg) -> str: return ( - "Invalid Node definition: {}\n" - "Format should be: node(function, inputs, outputs)" - ).format(msg) + f"Invalid Node definition: {msg}\n" + f"Format should be: node(function, inputs, outputs)" + ) -def node( +def node( # noqa: too-many-arguments func: Callable, - inputs: Union[None, str, List[str], Dict[str, str]], - outputs: Union[None, str, List[str], Dict[str, str]], + inputs: None | str | list[str] | dict[str, str], + outputs: None | str | list[str] | dict[str, str], *, name: str = None, - tags: Iterable[str] = None, - confirms: Union[str, List[str]] = None, + tags: str | Iterable[str] | None = None, + confirms: str | list[str] | None = None, namespace: str = None, ) -> Node: """Create a node in the pipeline by providing a function to be called @@ -642,11 +542,11 @@ def node( inputs: The name or the list of the names of variables used as inputs to the function. The number of names should match the number of arguments in the definition of the provided function. When - Dict[str, str] is provided, variable names will be mapped to + dict[str, str] is provided, variable names will be mapped to function argument names. outputs: The name or the list of the names of variables used as outputs to the function. The number of names should match the number of - outputs returned by the provided function. When Dict[str, str] + outputs returned by the provided function. When dict[str, str] is provided, variable names will be mapped to the named outputs the function returns. name: Optional node name to be used when displaying the node in logs or @@ -669,7 +569,7 @@ def node( >>> import numpy as np >>> >>> def clean_data(cars: pd.DataFrame, - >>> boats: pd.DataFrame) -> Dict[str, pd.DataFrame]: + >>> boats: pd.DataFrame) -> dict[str, pd.DataFrame]: >>> return dict(cars_df=cars.dropna(), boats_df=boats.dropna()) >>> >>> def halve_dataframe(data: pd.DataFrame) -> List[pd.DataFrame]: @@ -699,7 +599,7 @@ def node( ) -def _dict_inputs_to_list(func: Callable[[Any], Any], inputs: Dict[str, str]): +def _dict_inputs_to_list(func: Callable[[Any], Any], inputs: dict[str, str]): """Convert a dict representation of the node inputs to a list, ensuring the appropriate order for binding them to the node's function. """ @@ -707,11 +607,11 @@ def _dict_inputs_to_list(func: Callable[[Any], Any], inputs: Dict[str, str]): return [*sig.args, *sig.kwargs.values()] -def _to_list(element: Union[None, str, Iterable[str], Dict[str, str]]) -> List[str]: +def _to_list(element: None | str | Iterable[str] | dict[str, str]) -> list[str]: """Make a list out of node inputs/outputs. Returns: - List[str]: Node input/output names as a list to standardise. + list[str]: Node input/output names as a list to standardise. 
""" if element is None: @@ -719,7 +619,7 @@ def _to_list(element: Union[None, str, Iterable[str], Dict[str, str]]) -> List[s if isinstance(element, str): return [element] if isinstance(element, dict): - return sorted(element.values()) + return list(element.values()) return list(element) diff --git a/kedro/pipeline/pipeline.py b/kedro/pipeline/pipeline.py index eeafe9f020..5b76416182 100644 --- a/kedro/pipeline/pipeline.py +++ b/kedro/pipeline/pipeline.py @@ -1,41 +1,15 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """A ``Pipeline`` is a collection of ``Node`` objects which can be executed as a Directed Acyclic Graph, sequentially or in parallel. The ``Pipeline`` class offers quick access to input dependencies, produced outputs and execution order. """ +from __future__ import annotations + import copy import json from collections import Counter, defaultdict from itertools import chain -from typing import Callable, Dict, Iterable, List, Set, Tuple, Union -from warnings import warn +from typing import Iterable from toposort import CircularDependencyError as ToposortCircleError from toposort import toposort @@ -46,7 +20,7 @@ TRANSCODING_SEPARATOR = "@" -def _transcode_split(element: str) -> Tuple[str, str]: +def _transcode_split(element: str) -> tuple[str, str]: """Split the name by the transcoding separator. If the transcoding part is missing, empty string will be put in. @@ -58,11 +32,10 @@ def _transcode_split(element: str) -> Tuple[str, str]: """ split_name = element.split(TRANSCODING_SEPARATOR) - if len(split_name) > 2: + if len(split_name) > 2: # noqa: PLR2004 raise ValueError( - "Expected maximum 1 transcoding separator, found {} instead: '{}'.".format( - len(split_name) - 1, element - ) + f"Expected maximum 1 transcoding separator, found {len(split_name) - 1} " + f"instead: '{element}'." ) if len(split_name) == 1: split_name.append("") @@ -98,7 +71,7 @@ class ConfirmNotUniqueError(Exception): pass -class Pipeline: # pylint: disable=too-many-public-methods +class Pipeline: # noqa: too-many-public-methods """A ``Pipeline`` defined as a collection of ``Node`` objects. 
This class treats nodes as part of a graph representation and provides inputs, outputs and execution order. @@ -106,9 +79,9 @@ class Pipeline: # pylint: disable=too-many-public-methods def __init__( self, - nodes: Iterable[Union[Node, "Pipeline"]], + nodes: Iterable[Node | Pipeline], *, - tags: Union[str, Iterable[str]] = None, + tags: str | Iterable[str] | None = None, ): """Initialise ``Pipeline`` with a list of ``Node`` instances. @@ -158,7 +131,7 @@ def __init__( """ if nodes is None: raise ValueError( - "`nodes` argument of `Pipeline` is None. It must be an " + "'nodes' argument of 'Pipeline' is None. It must be an " "iterable of nodes and/or pipelines instead." ) nodes = list(nodes) # in case it's a generator @@ -179,13 +152,13 @@ def __init__( _validate_unique_confirms(nodes) # input -> nodes with input - self._nodes_by_input = defaultdict(set) # type: Dict[str, Set[Node]] + self._nodes_by_input: dict[str, set[Node]] = defaultdict(set) for node in nodes: for input_ in node.inputs: self._nodes_by_input[_strip_transcoding(input_)].add(node) # output -> node with output - self._nodes_by_output = {} # type: Dict[str, Node] + self._nodes_by_output: dict[str, Node] = {} for node in nodes: for output in node.outputs: self._nodes_by_output[_strip_transcoding(output)] = node @@ -200,9 +173,8 @@ def __repr__(self): # pragma: no cover nodes_reprs = [repr(node) for node in self.nodes[:max_nodes_to_display]] if len(self.nodes) > max_nodes_to_display: nodes_reprs.append("...") - nodes_reprs_str = ( - "[\n{}\n]".format(",\n".join(nodes_reprs)) if nodes_reprs else "[]" - ) + sep = ",\n" + nodes_reprs_str = f"[\n{sep.join(nodes_reprs)}\n]" if nodes_reprs else "[]" constructor_repr = f"({nodes_reprs_str})" return f"{self.__class__.__name__}{constructor_repr}" @@ -211,6 +183,11 @@ def __add__(self, other): return NotImplemented return Pipeline(set(self.nodes + other.nodes)) + def __radd__(self, other): + if isinstance(other, int) and other == 0: + return self + return self.__add__(other) + def __sub__(self, other): if not isinstance(other, Pipeline): return NotImplemented @@ -226,31 +203,31 @@ def __or__(self, other): return NotImplemented return Pipeline(set(self.nodes + other.nodes)) - def all_inputs(self) -> Set[str]: + def all_inputs(self) -> set[str]: """All inputs for all nodes in the pipeline. Returns: All node input names as a Set. """ - return set.union(set(), *[node.inputs for node in self.nodes]) + return set.union(set(), *(node.inputs for node in self.nodes)) - def all_outputs(self) -> Set[str]: + def all_outputs(self) -> set[str]: """All outputs of all nodes in the pipeline. Returns: All node outputs. """ - return set.union(set(), *[node.outputs for node in self.nodes]) + return set.union(set(), *(node.outputs for node in self.nodes)) - def _remove_intermediates(self, datasets: Set[str]) -> Set[str]: + def _remove_intermediates(self, datasets: set[str]) -> set[str]: intermediate = {_strip_transcoding(i) for i in self.all_inputs()} & { _strip_transcoding(o) for o in self.all_outputs() } return {d for d in datasets if _strip_transcoding(d) not in intermediate} - def inputs(self) -> Set[str]: + def inputs(self) -> set[str]: """The names of free inputs that must be provided at runtime so that the pipeline is runnable. Does not include intermediate inputs which are produced and consumed by the inner pipeline nodes. 
Resolves @@ -262,7 +239,7 @@ def inputs(self) -> Set[str]: """ return self._remove_intermediates(self.all_inputs()) - def outputs(self) -> Set[str]: + def outputs(self) -> set[str]: """The names of outputs produced when the whole pipeline is run. Does not include intermediate outputs that are consumed by other pipeline nodes. Resolves transcoded names where necessary. @@ -273,7 +250,7 @@ def outputs(self) -> Set[str]: """ return self._remove_intermediates(self.all_outputs()) - def data_sets(self) -> Set[str]: + def data_sets(self) -> set[str]: """The names of all data sets used by the ``Pipeline``, including inputs and outputs. @@ -346,7 +323,7 @@ def set_to_string(set_of_strings): ) @property - def node_dependencies(self) -> Dict[Node, Set[Node]]: + def node_dependencies(self) -> dict[Node, set[Node]]: """All dependencies of nodes where the first Node has a direct dependency on the second Node. @@ -354,9 +331,7 @@ def node_dependencies(self) -> Dict[Node, Set[Node]]: Dictionary where keys are nodes and values are sets made up of their parent nodes. Independent nodes have this as empty sets. """ - dependencies = { - node: set() for node in self._nodes - } # type: Dict[Node, Set[Node]] + dependencies: dict[Node, set[Node]] = {node: set() for node in self._nodes} for parent in self._nodes: for output in parent.outputs: for child in self._nodes_by_input[_strip_transcoding(output)]: @@ -365,7 +340,7 @@ def node_dependencies(self) -> Dict[Node, Set[Node]]: return dependencies @property - def nodes(self) -> List[Node]: + def nodes(self) -> list[Node]: """Return a list of the pipeline nodes in topological order, i.e. if node A needs to be run before node B, it will appear earlier in the list. @@ -377,7 +352,7 @@ def nodes(self) -> List[Node]: return list(chain.from_iterable(self._topo_sorted_nodes)) @property - def grouped_nodes(self) -> List[Set[Node]]: + def grouped_nodes(self) -> list[list[Node]]: """Return a list of the pipeline nodes in topologically ordered groups, i.e. if node A needs to be run before node B, it will appear in an earlier group. @@ -388,12 +363,12 @@ def grouped_nodes(self) -> List[Set[Node]]: """ return copy.copy(self._topo_sorted_nodes) - def only_nodes(self, *node_names: str) -> "Pipeline": + def only_nodes(self, *node_names: str) -> Pipeline: """Create a new ``Pipeline`` which will contain only the specified nodes by name. Args: - node_names: One or more node names. The returned ``Pipeline`` + *node_names: One or more node names. The returned ``Pipeline`` will only contain these nodes. Raises: @@ -405,6 +380,21 @@ def only_nodes(self, *node_names: str) -> "Pipeline": """ unregistered_nodes = set(node_names) - set(self._nodes_by_name.keys()) if unregistered_nodes: + # check if unregistered nodes are available under namespace + namespaces = [] + for unregistered_node in unregistered_nodes: + namespaces.extend( + [ + node_name + for node_name in self._nodes_by_name.keys() + if node_name.endswith(f".{unregistered_node}") + ] + ) + if namespaces: + raise ValueError( + f"Pipeline does not contain nodes named {list(unregistered_nodes)}. " + f"Did you mean: {namespaces}?" + ) raise ValueError( f"Pipeline does not contain nodes named {list(unregistered_nodes)}." 
) @@ -412,7 +402,7 @@ def only_nodes(self, *node_names: str) -> "Pipeline": nodes = [self._nodes_by_name[name] for name in node_names] return Pipeline(nodes) - def only_nodes_with_namespace(self, node_namespace: str) -> "Pipeline": + def only_nodes_with_namespace(self, node_namespace: str) -> Pipeline: """Creates a new ``Pipeline`` containing only nodes with the specified namespace. @@ -432,13 +422,13 @@ def only_nodes_with_namespace(self, node_namespace: str) -> "Pipeline": ] if not nodes: raise ValueError( - f"Pipeline does not contain nodes with namespace `{node_namespace}`" + f"Pipeline does not contain nodes with namespace '{node_namespace}'" ) return Pipeline(nodes) def _get_nodes_with_inputs_transcode_compatible( - self, datasets: Set[str] - ) -> Set[Node]: + self, datasets: set[str] + ) -> set[Node]: """Retrieves nodes that use the given `datasets` as inputs. If provided a name, but no format, for a transcoded dataset, it includes all nodes that use inputs with that name, otherwise it @@ -468,8 +458,8 @@ def _get_nodes_with_inputs_transcode_compatible( return relevant_nodes def _get_nodes_with_outputs_transcode_compatible( - self, datasets: Set[str] - ) -> Set[Node]: + self, datasets: set[str] + ) -> set[Node]: """Retrieves nodes that output to the given `datasets`. If provided a name, but no format, for a transcoded dataset, it includes the node that outputs to that name, otherwise it matches @@ -500,7 +490,7 @@ def _get_nodes_with_outputs_transcode_compatible( return relevant_nodes - def only_nodes_with_inputs(self, *inputs: str) -> "Pipeline": + def only_nodes_with_inputs(self, *inputs: str) -> Pipeline: """Create a new ``Pipeline`` object with the nodes which depend directly on the provided inputs. If provided a name, but no format, for a transcoded input, it @@ -508,7 +498,7 @@ def only_nodes_with_inputs(self, *inputs: str) -> "Pipeline": matches to the fully-qualified name only (i.e. name@format). Args: - inputs: A list of inputs which should be used as a starting + *inputs: A list of inputs which should be used as a starting point of the new ``Pipeline``. Raises: @@ -526,7 +516,7 @@ def only_nodes_with_inputs(self, *inputs: str) -> "Pipeline": return Pipeline(nodes) - def from_inputs(self, *inputs: str) -> "Pipeline": + def from_inputs(self, *inputs: str) -> Pipeline: """Create a new ``Pipeline`` object with the nodes which depend directly or transitively on the provided inputs. If provided a name, but no format, for a transcoded input, it @@ -534,7 +524,7 @@ def from_inputs(self, *inputs: str) -> "Pipeline": matches to the fully-qualified name only (i.e. name@format). Args: - inputs: A list of inputs which should be used as a starting point + *inputs: A list of inputs which should be used as a starting point of the new ``Pipeline`` Raises: @@ -549,7 +539,7 @@ def from_inputs(self, *inputs: str) -> "Pipeline": """ starting = set(inputs) - result = set() # type: Set[Node] + result: set[Node] = set() next_nodes = self._get_nodes_with_inputs_transcode_compatible(starting) while next_nodes: @@ -566,7 +556,7 @@ def from_inputs(self, *inputs: str) -> "Pipeline": return Pipeline(result) - def only_nodes_with_outputs(self, *outputs: str) -> "Pipeline": + def only_nodes_with_outputs(self, *outputs: str) -> Pipeline: """Create a new ``Pipeline`` object with the nodes which are directly required to produce the provided outputs. 
If provided a name, but no format, for a transcoded dataset, it @@ -574,7 +564,7 @@ def only_nodes_with_outputs(self, *outputs: str) -> "Pipeline": to the fully-qualified name only (i.e. name@format). Args: - outputs: A list of outputs which should be the final outputs + *outputs: A list of outputs which should be the final outputs of the new ``Pipeline``. Raises: @@ -591,7 +581,7 @@ def only_nodes_with_outputs(self, *outputs: str) -> "Pipeline": return Pipeline(nodes) - def to_outputs(self, *outputs: str) -> "Pipeline": + def to_outputs(self, *outputs: str) -> Pipeline: """Create a new ``Pipeline`` object with the nodes which are directly or transitively required to produce the provided outputs. If provided a name, but no format, for a transcoded dataset, it @@ -599,7 +589,7 @@ def to_outputs(self, *outputs: str) -> "Pipeline": to the fully-qualified name only (i.e. name@format). Args: - outputs: A list of outputs which should be the final outputs of + *outputs: A list of outputs which should be the final outputs of the new ``Pipeline``. Raises: @@ -614,7 +604,7 @@ def to_outputs(self, *outputs: str) -> "Pipeline": """ starting = set(outputs) - result = set() # type: Set[Node] + result: set[Node] = set() next_nodes = self._get_nodes_with_outputs_transcode_compatible(starting) while next_nodes: @@ -630,12 +620,12 @@ def to_outputs(self, *outputs: str) -> "Pipeline": return Pipeline(result) - def from_nodes(self, *node_names: str) -> "Pipeline": + def from_nodes(self, *node_names: str) -> Pipeline: """Create a new ``Pipeline`` object with the nodes which depend directly or transitively on the provided nodes. Args: - node_names: A list of node_names which should be used as a + *node_names: A list of node_names which should be used as a starting point of the new ``Pipeline``. Raises: ValueError: Raised when any of the given names do not exist in the @@ -651,12 +641,12 @@ def from_nodes(self, *node_names: str) -> "Pipeline": res += self.from_inputs(*map(_strip_transcoding, res.all_outputs())) return res - def to_nodes(self, *node_names: str) -> "Pipeline": + def to_nodes(self, *node_names: str) -> Pipeline: """Create a new ``Pipeline`` object with the nodes required directly or transitively by the provided nodes. Args: - node_names: A list of node_names which should be used as an + *node_names: A list of node_names which should be used as an end point of the new ``Pipeline``. Raises: ValueError: Raised when any of the given names do not exist in the @@ -672,13 +662,13 @@ def to_nodes(self, *node_names: str) -> "Pipeline": res += self.to_outputs(*map(_strip_transcoding, res.all_inputs())) return res - def only_nodes_with_tags(self, *tags: str) -> "Pipeline": + def only_nodes_with_tags(self, *tags: str) -> Pipeline: """Creates a new ``Pipeline`` object with the nodes which contain *any* of the provided tags. The resulting ``Pipeline`` is empty if no tags are provided. Args: - tags: A list of node tags which should be used to lookup + *tags: A list of node tags which should be used to lookup the nodes of the new ``Pipeline``. 
Returns: Pipeline: A new ``Pipeline`` object, containing a subset of the @@ -689,8 +679,7 @@ def only_nodes_with_tags(self, *tags: str) -> "Pipeline": nodes = [node for node in self.nodes if tags & node.tags] return Pipeline(nodes) - # pylint: disable=too-many-arguments - def filter( + def filter( # noqa: too-many-arguments self, tags: Iterable[str] = None, from_nodes: Iterable[str] = None, @@ -699,7 +688,7 @@ def filter( from_inputs: Iterable[str] = None, to_outputs: Iterable[str] = None, node_namespace: str = None, - ) -> "Pipeline": + ) -> Pipeline: """Creates a new ``Pipeline`` object with the nodes that meet all of the specified filtering conditions. @@ -781,38 +770,14 @@ def filter( ) return filtered_pipeline - def decorate(self, *decorators: Callable) -> "Pipeline": - """Create a new ``Pipeline`` by applying the provided decorators to - all the nodes in the pipeline. If no decorators are passed, it will - return a copy of the current ``Pipeline`` object. - - Args: - decorators: Decorators to be applied on all node functions in - the pipeline, always applied from right to left. - - Returns: - A new ``Pipeline`` object with all nodes decorated with the - provided decorators. - - """ - warn( - "The pipeline's `decorate` API will be deprecated in Kedro 0.18.0." - "Please use a node's Hooks to extend the node's behaviour in a pipeline." - "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/04_hooks.html", - DeprecationWarning, - ) - nodes = [node.decorate(*decorators) for node in self.nodes] - return Pipeline(nodes) - - def tag(self, tags: Union[str, Iterable[str]]) -> "Pipeline": - """Returns a copy of the pipeline, with each node tagged accordingly. + def tag(self, tags: str | Iterable[str]) -> Pipeline: + """Tags all the nodes in the pipeline. Args: tags: The tags to be added to the nodes. Returns: - New `Pipeline` object. + New ``Pipeline`` object with nodes tagged. """ nodes = [n.tag(tags) for n in self.nodes] return Pipeline(nodes) @@ -836,9 +801,9 @@ def to_json(self): return json.dumps(pipeline_versioned) -def _validate_duplicate_nodes(nodes_or_pipes: Iterable[Union[Node, Pipeline]]): - seen_nodes = set() # type: Set[str] - duplicates = defaultdict(set) # type: Dict[Union[Pipeline, None], Set[str]] +def _validate_duplicate_nodes(nodes_or_pipes: Iterable[Node | Pipeline]): + seen_nodes: set[str] = set() + duplicates: dict[Pipeline | None, set[str]] = defaultdict(set) def _check_node(node_: Node, pipeline_: Pipeline = None): name = node_.name @@ -867,33 +832,33 @@ def _check_node(node_: Node, pipeline_: Pipeline = None): raise ValueError( f"Pipeline nodes must have unique names. The following node names " f"appear more than once:\n\n{duplicates_info}\nYou can name your " - f"nodes using the last argument of `node()`." + f"nodes using the last argument of 'node()'." ) -def _validate_unique_outputs(nodes: List[Node]) -> None: +def _validate_unique_outputs(nodes: list[Node]) -> None: outputs = chain.from_iterable(node.outputs for node in nodes) outputs = map(_strip_transcoding, outputs) duplicates = [key for key, value in Counter(outputs).items() if value > 1] if duplicates: raise OutputNotUniqueError( - "Output(s) {} are returned by more than one nodes. Node " - "outputs must be unique.".format(sorted(duplicates)) + f"Output(s) {sorted(duplicates)} are returned by more than one nodes. Node " + f"outputs must be unique." 
) -def _validate_unique_confirms(nodes: List[Node]) -> None: +def _validate_unique_confirms(nodes: list[Node]) -> None: confirms = chain.from_iterable(node.confirms for node in nodes) confirms = map(_strip_transcoding, confirms) duplicates = [key for key, value in Counter(confirms).items() if value > 1] if duplicates: raise ConfirmNotUniqueError( - "{} datasets are confirmed by more than one node. Node " - "confirms must be unique.".format(sorted(duplicates)) + f"{sorted(duplicates)} datasets are confirmed by more than one node. Node " + f"confirms must be unique." ) -def _validate_transcoded_inputs_outputs(nodes: List[Node]) -> None: +def _validate_transcoded_inputs_outputs(nodes: list[Node]) -> None: """Users should not be allowed to refer to a transcoded dataset both with and without the separator. """ @@ -912,14 +877,14 @@ def _validate_transcoded_inputs_outputs(nodes: List[Node]) -> None: if invalid: raise ValueError( - "The following datasets are used with transcoding, but " - "were referenced without the separator: {}.\n" - "Please specify a transcoding option or " - "rename the datasets.".format(", ".join(invalid)) + f"The following datasets are used with transcoding, but " + f"were referenced without the separator: {', '.join(invalid)}.\n" + f"Please specify a transcoding option or " + f"rename the datasets." ) -def _topologically_sorted(node_dependencies) -> List[Set[Node]]: +def _topologically_sorted(node_dependencies) -> list[list[Node]]: """Topologically group and sort (order) nodes such that no node depends on a node that appears in the same or a later group. @@ -933,7 +898,7 @@ def _topologically_sorted(node_dependencies) -> List[Set[Node]]: executed on the second step, etc. """ - def _circle_error_message(error_data: Dict[str, str]) -> str: + def _circle_error_message(error_data: dict[str, str]) -> str: """Error messages provided by the toposort library will refer to indices that are used as an intermediate step. This method can be used to replace that message with @@ -943,7 +908,9 @@ def _circle_error_message(error_data: Dict[str, str]) -> str: return f"Circular dependencies exist among these items: {circular}" try: - return list(toposort(node_dependencies)) + # Sort it so it has consistent order when run with SequentialRunner + result = [sorted(dependencies) for dependencies in toposort(node_dependencies)] + return result except ToposortCircleError as exc: message = _circle_error_message(exc.data) raise CircularDependencyError(message) from exc diff --git a/kedro/runner/__init__.py b/kedro/runner/__init__.py index 5d06a7ddeb..0725d56a8a 100644 --- a/kedro/runner/__init__.py +++ b/kedro/runner/__init__.py @@ -1,36 +1,16 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
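# Editor's note -- a hedged illustration of the transcoding convention enforced by
# `_validate_transcoded_inputs_outputs` above: the same logical dataset is written
# and read as "<name>@<format>", and referring to it elsewhere without the separator
# raises the ValueError shown there. Dataset and node names are illustrative.
from kedro.pipeline import Pipeline, node

pipeline = Pipeline(
    [
        # one node saves the Spark flavour of "my_data"...
        node(lambda raw: raw, "raw_input", "my_data@spark", name="build_my_data"),
        # ...and a downstream node loads the pandas flavour of the same logical dataset.
        node(lambda df: df.head(), "my_data@pandas", "preview", name="preview_my_data"),
    ]
)
# Using plain "my_data" as another node's input or output in this pipeline would
# fail validation with the "referenced without the separator" error above.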
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.runner`` provides runners that are able to execute ``Pipeline`` instances. """ -from .parallel_runner import ParallelRunner # NOQA -from .runner import AbstractRunner, run_node # NOQA -from .sequential_runner import SequentialRunner # NOQA -from .thread_runner import ThreadRunner # NOQA +from .parallel_runner import ParallelRunner +from .runner import AbstractRunner, run_node +from .sequential_runner import SequentialRunner +from .thread_runner import ThreadRunner + +__all__ = [ + "AbstractRunner", + "ParallelRunner", + "SequentialRunner", + "ThreadRunner", + "run_node", +] diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 945b3fd0d9..b9a45792da 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -1,47 +1,30 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """``ParallelRunner`` is an ``AbstractRunner`` implementation. It can be used to run the ``Pipeline`` in parallel groups formed by toposort. 
""" -import logging.config +from __future__ import annotations + import multiprocessing import os import pickle import sys +import warnings from collections import Counter from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait from itertools import chain from multiprocessing.managers import BaseProxy, SyncManager # type: ignore from multiprocessing.reduction import ForkingPickler from pickle import PicklingError -from typing import Any, Dict, Iterable, Set +from typing import Any, Iterable + +from pluggy import PluginManager -from kedro.io import DataCatalog, DataSetError, MemoryDataSet +from kedro.framework.hooks.manager import ( + _create_hook_manager, + _register_hooks, + _register_hooks_setuptools, +) +from kedro.framework.project import settings +from kedro.io import DataCatalog, DatasetError, MemoryDataset from kedro.pipeline import Pipeline from kedro.pipeline.node import Node from kedro.runner.runner import AbstractRunner, run_node @@ -49,97 +32,111 @@ # see https://github.com/python/cpython/blob/master/Lib/concurrent/futures/process.py#L114 _MAX_WINDOWS_WORKERS = 61 +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +_SharedMemoryDataSet: type[_SharedMemoryDataset] -class _SharedMemoryDataSet: - """``_SharedMemoryDataSet`` a wrapper class for a shared MemoryDataSet in SyncManager. + +class _SharedMemoryDataset: + """``_SharedMemoryDataset`` is a wrapper class for a shared MemoryDataset in SyncManager. It is not inherited from AbstractDataSet class. """ def __init__(self, manager: SyncManager): - """Creates a new instance of ``_SharedMemoryDataSet``, + """Creates a new instance of ``_SharedMemoryDataset``, and creates shared memorydataset attribute. Args: manager: An instance of multiprocessing manager for shared objects. """ - self.shared_memory_dataset = manager.MemoryDataSet() # type: ignore + self.shared_memory_dataset = manager.MemoryDataset() # type: ignore def __getattr__(self, name): - # This if condition prevents recursive call when deserializing + # This if condition prevents recursive call when deserialising if name == "__setstate__": raise AttributeError() return getattr(self.shared_memory_dataset, name) def save(self, data: Any): - """Calls save method of a shared MemoryDataSet in SyncManager.""" + """Calls save method of a shared MemoryDataset in SyncManager.""" try: self.shared_memory_dataset.save(data) - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: # Checks if the error is due to serialisation or not try: pickle.dumps(data) except Exception as serialisation_exc: # SKIP_IF_NO_SPARK - raise DataSetError( - f"{str(data.__class__)} cannot be serialized. ParallelRunner " - "implicit memory datasets can only be used with serializable data" + raise DatasetError( + f"{str(data.__class__)} cannot be serialised. 
ParallelRunner " + "implicit memory datasets can only be used with serialisable data" ) from serialisation_exc - else: - raise exc + raise exc + + +def __getattr__(name): + if name == "_SharedMemoryDataSet": + alias = _SharedMemoryDataset + warnings.warn( + f"{repr(name)} has been renamed to {repr(alias.__name__)}, " + f"and the alias will be removed in Kedro 0.19.0", + DeprecationWarning, + stacklevel=2, + ) + return alias + raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") class ParallelRunnerManager(SyncManager): - """``ParallelRunnerManager`` is used to create shared ``MemoryDataSet`` + """``ParallelRunnerManager`` is used to create shared ``MemoryDataset`` objects as default data sets in a pipeline. """ -ParallelRunnerManager.register( # pylint: disable=no-member - "MemoryDataSet", MemoryDataSet -) +ParallelRunnerManager.register("MemoryDataset", MemoryDataset) # noqa: no-member -def _bootstrap_subprocess(package_name: str, conf_logging: Dict[str, Any]): - # pylint: disable=import-outside-toplevel,cyclic-import - from kedro.framework.project import configure_project +def _bootstrap_subprocess(package_name: str, logging_config: dict[str, Any]): + # noqa: import-outside-toplevel,cyclic-import + from kedro.framework.project import configure_logging, configure_project configure_project(package_name) - logging.config.dictConfig(conf_logging) + configure_logging(logging_config) -def _run_node_synchronization( # pylint: disable=too-many-arguments +def _run_node_synchronization( # noqa: too-many-arguments node: Node, catalog: DataCatalog, is_async: bool = False, - run_id: str = None, + session_id: str = None, package_name: str = None, - conf_logging: Dict[str, Any] = None, + logging_config: dict[str, Any] = None, ) -> Node: """Run a single `Node` with inputs from and outputs to the `catalog`. - `KedroSession` instance is activated in every subprocess because of Windows - (and latest OSX with Python 3.8) limitation. - Windows has no "fork", so every subprocess is a brand new process - created via "spawn", hence the need to a) setup the logging, b) register - the hooks, and c) activate `KedroSession` in every subprocess. + + A ``PluginManager`` instance is created in each subprocess because the + ``PluginManager`` can't be serialised. Args: node: The ``Node`` to run. catalog: A ``DataCatalog`` containing the node's inputs and outputs. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - run_id: The id of the pipeline run. + session_id: The session id of the pipeline run. package_name: The name of the project Python package. - conf_logging: A dictionary containing logging configuration. + logging_config: A dictionary containing logging configuration. Returns: The node argument. 
""" - if multiprocessing.get_start_method() == "spawn" and package_name: # type: ignore - conf_logging = conf_logging or dict() - _bootstrap_subprocess(package_name, conf_logging) + if multiprocessing.get_start_method() == "spawn" and package_name: + _bootstrap_subprocess(package_name, logging_config) # type: ignore - return run_node(node, catalog, is_async, run_id) + hook_manager = _create_hook_manager() + _register_hooks(hook_manager, settings.HOOKS) + _register_hooks_setuptools(hook_manager, settings.DISABLE_HOOKS_FOR_PLUGINS) + + return run_node(node, catalog, hook_manager, is_async, session_id) class ParallelRunner(AbstractRunner): @@ -167,7 +164,7 @@ def __init__(self, max_workers: int = None, is_async: bool = False): """ super().__init__(is_async=is_async) self._manager = ParallelRunnerManager() - self._manager.start() # pylint: disable=consider-using-with + self._manager.start() # noqa: consider-using-with # This code comes from the concurrent.futures library # https://github.com/python/cpython/blob/master/Lib/concurrent/futures/process.py#L588 @@ -185,82 +182,83 @@ def __del__(self): def create_default_data_set( # type: ignore self, ds_name: str - ) -> _SharedMemoryDataSet: - """Factory method for creating the default data set for the runner. + ) -> _SharedMemoryDataset: + """Factory method for creating the default dataset for the runner. Args: - ds_name: Name of the missing data set + ds_name: Name of the missing dataset. Returns: - An instance of an implementation of _SharedMemoryDataSet to be used - for all unregistered data sets. + An instance of ``_SharedMemoryDataset`` to be used for all + unregistered datasets. """ - return _SharedMemoryDataSet(self._manager) + return _SharedMemoryDataset(self._manager) @classmethod def _validate_nodes(cls, nodes: Iterable[Node]): - """Ensure all tasks are serializable.""" - unserializable = [] + """Ensure all tasks are serialisable.""" + unserialisable = [] for node in nodes: try: ForkingPickler.dumps(node) except (AttributeError, PicklingError): - unserializable.append(node) + unserialisable.append(node) - if unserializable: + if unserialisable: raise AttributeError( - "The following nodes cannot be serialized: {}\nIn order to " - "utilize multiprocessing you need to make sure all nodes are " - "serializable, i.e. nodes should not include lambda " - "functions, nested functions, closures, etc.\nIf you " - "are using custom decorators ensure they are correctly using " - "functools.wraps().".format(sorted(unserializable)) + f"The following nodes cannot be serialised: {sorted(unserialisable)}\n" + f"In order to utilize multiprocessing you need to make sure all nodes " + f"are serialisable, i.e. nodes should not include lambda " + f"functions, nested functions, closures, etc.\nIf you " + f"are using custom decorators ensure they are correctly decorated using " + f"functools.wraps()." ) @classmethod def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline): - """Ensure that all data sets are serializable and that we do not have + """Ensure that all data sets are serialisable and that we do not have any non proxied memory data sets being used as outputs as their content will not be synchronized across threads. 
""" - data_sets = catalog._data_sets # pylint: disable=protected-access + data_sets = catalog._data_sets # noqa: protected-access - unserializable = [] + unserialisable = [] for name, data_set in data_sets.items(): if getattr(data_set, "_SINGLE_PROCESS", False): # SKIP_IF_NO_SPARK - unserializable.append(name) + unserialisable.append(name) continue try: ForkingPickler.dumps(data_set) except (AttributeError, PicklingError): - unserializable.append(name) + unserialisable.append(name) - if unserializable: + if unserialisable: raise AttributeError( - "The following data sets cannot be used with multiprocessing: " - "{}\nIn order to utilize multiprocessing you need to make sure " - "all data sets are serializable, i.e. data sets should not make " - "use of lambda functions, nested functions, closures etc.\nIf you " - "are using custom decorators ensure they are correctly using " - "functools.wraps().".format(sorted(unserializable)) + f"The following data sets cannot be used with multiprocessing: " + f"{sorted(unserialisable)}\nIn order to utilize multiprocessing you " + f"need to make sure all data sets are serialisable, i.e. data sets " + f"should not make use of lambda functions, nested functions, closures " + f"etc.\nIf you are using custom decorators ensure they are correctly " + f"decorated using functools.wraps()." ) - memory_data_sets = [] + memory_datasets = [] for name, data_set in data_sets.items(): if ( name in pipeline.all_outputs() - and isinstance(data_set, MemoryDataSet) + and isinstance(data_set, MemoryDataset) and not isinstance(data_set, BaseProxy) ): - memory_data_sets.append(name) + memory_datasets.append(name) - if memory_data_sets: + if memory_datasets: raise AttributeError( - "The following data sets are memory data sets: {}\n" - "ParallelRunner does not support output to externally created " - "MemoryDataSets".format(sorted(memory_data_sets)) + f"The following data sets are memory data sets: " + f"{sorted(memory_datasets)}\n" + f"ParallelRunner does not support output to externally created " + f"MemoryDatasets" ) def _get_required_workers_count(self, pipeline: Pipeline): @@ -277,15 +275,20 @@ def _get_required_workers_count(self, pipeline: Pipeline): return min(required_processes, self._max_workers) - def _run( # pylint: disable=too-many-locals,useless-suppression - self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None + def _run( # noqa: too-many-locals,useless-suppression + self, + pipeline: Pipeline, + catalog: DataCatalog, + hook_manager: PluginManager, + session_id: str = None, ) -> None: """The abstract interface for running pipelines. Args: pipeline: The ``Pipeline`` to run. catalog: The ``DataCatalog`` from which to fetch data. - run_id: The id of the run. + hook_manager: The ``PluginManager`` to activate hooks. + session_id: The id of the session. Raises: AttributeError: When the provided pipeline is not suitable for @@ -295,8 +298,7 @@ def _run( # pylint: disable=too-many-locals,useless-suppression Exception: In case of any downstream node failure. 
""" - # pylint: disable=import-outside-toplevel,cyclic-import - from kedro.framework.session.session import get_current_session + # noqa: import-outside-toplevel,cyclic-import nodes = pipeline.nodes self._validate_catalog(catalog, pipeline) @@ -305,16 +307,12 @@ def _run( # pylint: disable=too-many-locals,useless-suppression load_counts = Counter(chain.from_iterable(n.inputs for n in nodes)) node_dependencies = pipeline.node_dependencies todo_nodes = set(node_dependencies.keys()) - done_nodes = set() # type: Set[Node] + done_nodes: set[Node] = set() futures = set() done = None max_workers = self._get_required_workers_count(pipeline) - from kedro.framework.project import PACKAGE_NAME - - session = get_current_session(silent=True) - # pylint: disable=protected-access - conf_logging = session._get_logging_config() if session else None + from kedro.framework.project import LOGGING, PACKAGE_NAME with ProcessPoolExecutor(max_workers=max_workers) as pool: while True: @@ -327,9 +325,9 @@ def _run( # pylint: disable=too-many-locals,useless-suppression node, catalog, self._is_async, - run_id, + session_id, package_name=PACKAGE_NAME, - conf_logging=conf_logging, + logging_config=LOGGING, # type: ignore ) ) if not futures: @@ -350,15 +348,12 @@ def _run( # pylint: disable=too-many-locals,useless-suppression break # pragma: no cover done, futures = wait(futures, return_when=FIRST_COMPLETED) for future in done: - try: - node = future.result() - except Exception: - self._suggest_resume_scenario(pipeline, done_nodes) - raise + node = future.result() done_nodes.add(node) - # decrement load counts and release any data sets we've finished with - # this is particularly important for the shared datasets we create above + # Decrement load counts, and release any datasets we + # have finished with. This is particularly important + # for the shared, default datasets we created above. for data_set in node.inputs: load_counts[data_set] -= 1 if ( diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 90aa682d8c..be379ace71 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -1,36 +1,13 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
-# -# See the License for the specific language governing permissions and -# limitations under the License. """``AbstractRunner`` is the base class for all ``Pipeline`` runner implementations. """ +from __future__ import annotations +import inspect +import itertools as it import logging from abc import ABC, abstractmethod +from collections import deque from concurrent.futures import ( ALL_COMPLETED, Future, @@ -38,10 +15,13 @@ as_completed, wait, ) -from typing import Any, Dict, Iterable +from typing import Any, Iterable, Iterator -from kedro.framework.hooks import get_hook_manager -from kedro.io import AbstractDataSet, DataCatalog +from more_itertools import interleave +from pluggy import PluginManager + +from kedro.framework.hooks.manager import _NullPluginManager +from kedro.io import AbstractDataSet, DataCatalog, MemoryDataset from kedro.pipeline import Pipeline from kedro.pipeline.node import Node @@ -52,7 +32,7 @@ class AbstractRunner(ABC): """ def __init__(self, is_async: bool = False): - """Instantiates the runner classs. + """Instantiates the runner class. Args: is_async: If True, the node inputs and outputs are loaded and saved @@ -66,15 +46,20 @@ def _logger(self): return logging.getLogger(self.__module__) def run( - self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None - ) -> Dict[str, Any]: - """Run the ``Pipeline`` using the ``DataSet``s provided by ``catalog`` + self, + pipeline: Pipeline, + catalog: DataCatalog, + hook_manager: PluginManager = None, + session_id: str = None, + ) -> dict[str, Any]: + """Run the ``Pipeline`` using the datasets provided by ``catalog`` and save results back to the same objects. Args: pipeline: The ``Pipeline`` to run. catalog: The ``DataCatalog`` from which to fetch data. - run_id: The id of the run. + hook_manager: The ``PluginManager`` to activate hooks. + session_id: The id of the session. Raises: ValueError: Raised when ``Pipeline`` inputs cannot be satisfied. @@ -86,16 +71,28 @@ def run( """ + hook_manager = hook_manager or _NullPluginManager() catalog = catalog.shallow_copy() - unsatisfied = pipeline.inputs() - set(catalog.list()) + # Check which datasets used in the pipeline are in the catalog or match + # a pattern in the catalog + registered_ds = [ds for ds in pipeline.data_sets() if ds in catalog] + + # Check if there are any input datasets that aren't in the catalog and + # don't match a pattern in the catalog. + unsatisfied = pipeline.inputs() - set(registered_ds) + if unsatisfied: raise ValueError( f"Pipeline input(s) {unsatisfied} not found in the DataCatalog" ) - free_outputs = pipeline.outputs() - set(catalog.list()) - unregistered_ds = pipeline.data_sets() - set(catalog.list()) + # Check if there's any output datasets that aren't in the catalog and don't match a pattern + # in the catalog. 
+ free_outputs = pipeline.outputs() - set(registered_ds) + unregistered_ds = pipeline.data_sets() - set(registered_ds) + + # Create a default dataset for unregistered datasets for ds_name in unregistered_ds: catalog.add(ds_name, self.create_default_data_set(ds_name)) @@ -103,29 +100,31 @@ def run( self._logger.info( "Asynchronous mode is enabled for loading and saving data" ) - self._run(pipeline, catalog, run_id) + self._run(pipeline, catalog, hook_manager, session_id) self._logger.info("Pipeline execution completed successfully.") return {ds_name: catalog.load(ds_name) for ds_name in free_outputs} def run_only_missing( - self, pipeline: Pipeline, catalog: DataCatalog - ) -> Dict[str, Any]: + self, pipeline: Pipeline, catalog: DataCatalog, hook_manager: PluginManager + ) -> dict[str, Any]: """Run only the missing outputs from the ``Pipeline`` using the - ``DataSet``s provided by ``catalog`` and save results back to the same - objects. + datasets provided by ``catalog``, and save results back to the + same objects. Args: pipeline: The ``Pipeline`` to run. catalog: The ``DataCatalog`` from which to fetch data. + hook_manager: The ``PluginManager`` to activate hooks. Raises: - ValueError: Raised when ``Pipeline`` inputs cannot be satisfied. + ValueError: Raised when ``Pipeline`` inputs cannot be + satisfied. Returns: - Any node outputs that cannot be processed by the ``DataCatalog``. - These are returned in a dictionary, where the keys are defined - by the node outputs. + Any node outputs that cannot be processed by the + ``DataCatalog``. These are returned in a dictionary, where + the keys are defined by the node outputs. """ free_outputs = pipeline.outputs() - set(catalog.list()) @@ -135,18 +134,22 @@ def run_only_missing( *to_build ) - # we also need any memory data sets that feed into that - # including chains of memory data sets - memory_sets = pipeline.data_sets() - set(catalog.list()) - output_to_memory = pipeline.only_nodes_with_outputs(*memory_sets) - input_from_memory = to_rerun.inputs() & memory_sets - to_rerun += output_to_memory.to_outputs(*input_from_memory) + # We also need any missing datasets that are required to run the + # `to_rerun` pipeline, including any chains of missing datasets. + unregistered_ds = pipeline.data_sets() - set(catalog.list()) + output_to_unregistered = pipeline.only_nodes_with_outputs(*unregistered_ds) + input_from_unregistered = to_rerun.inputs() & unregistered_ds + to_rerun += output_to_unregistered.to_outputs(*input_from_unregistered) - return self.run(to_rerun, catalog) + return self.run(to_rerun, catalog, hook_manager) @abstractmethod # pragma: no cover def _run( - self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None + self, + pipeline: Pipeline, + catalog: DataCatalog, + hook_manager: PluginManager, + session_id: str = None, ) -> None: """The abstract interface for running pipelines, assuming that the inputs have already been checked and normalized by run(). @@ -154,185 +157,315 @@ def _run( Args: pipeline: The ``Pipeline`` to run. catalog: The ``DataCatalog`` from which to fetch data. - run_id: The id of the run. + hook_manager: The ``PluginManager`` to activate hooks. + session_id: The id of the session. """ pass @abstractmethod # pragma: no cover def create_default_data_set(self, ds_name: str) -> AbstractDataSet: - """Factory method for creating the default data set for the runner. + """Factory method for creating the default dataset for the runner. 
Args: - ds_name: Name of the missing data set + ds_name: Name of the missing dataset. Returns: - An instance of an implementation of AbstractDataSet to be - used for all unregistered data sets. - + An instance of an implementation of ``AbstractDataSet`` to be + used for all unregistered datasets. """ pass def _suggest_resume_scenario( - self, pipeline: Pipeline, done_nodes: Iterable[Node] + self, + pipeline: Pipeline, + done_nodes: Iterable[Node], + catalog: DataCatalog, ) -> None: + """ + Suggest a command to the user to resume a run after it fails. + The run should be started from the point closest to the failure + for which persisted input exists. + + Args: + pipeline: the ``Pipeline`` of the run. + done_nodes: the ``Node``s that executed successfully. + catalog: the ``DataCatalog`` of the run. + + """ remaining_nodes = set(pipeline.nodes) - set(done_nodes) postfix = "" if done_nodes: node_names = (n.name for n in remaining_nodes) resume_p = pipeline.only_nodes(*node_names) - start_p = resume_p.only_nodes_with_inputs(*resume_p.inputs()) - start_node_names = (n.name for n in start_p.nodes) + + # find the nearest persistent ancestors of the nodes in start_p + start_p_persistent_ancestors = _find_persistent_ancestors( + pipeline, start_p.nodes, catalog + ) + + start_node_names = (n.name for n in start_p_persistent_ancestors) postfix += f" --from-nodes \"{','.join(start_node_names)}\"" - self._logger.warning( - "There are %d nodes that have not run.\n" - "You can resume the pipeline run by adding the following " - "argument to your previous command:\n%s", - len(remaining_nodes), - postfix, - ) + if not postfix: + self._logger.warning( + "No nodes ran. Repeat the previous command to attempt a new run." + ) + else: + self._logger.warning( + "There are %d nodes that have not run.\n" + "You can resume the pipeline run from the nearest nodes with " + "persisted inputs by adding the following " + "argument to your previous command:\n%s", + len(remaining_nodes), + postfix, + ) + + +def _find_persistent_ancestors( + pipeline: Pipeline, children: Iterable[Node], catalog: DataCatalog +) -> set[Node]: + """Breadth-first search approach to finding the complete set of + persistent ancestors of an iterable of ``Node``s. Persistent + ancestors exclusively have persisted ``Dataset``s as inputs. + + Args: + pipeline: the ``Pipeline`` to find ancestors in. + children: the iterable containing ``Node``s to find ancestors of. + catalog: the ``DataCatalog`` of the run. + + Returns: + A set containing first persistent ancestors of the given + ``Node``s. + + """ + ancestor_nodes_to_run = set() + queue, visited = deque(children), set(children) + while queue: + current_node = queue.popleft() + if _has_persistent_inputs(current_node, catalog): + ancestor_nodes_to_run.add(current_node) + continue + for parent in _enumerate_parents(pipeline, current_node): + if parent in visited: + continue + visited.add(parent) + queue.append(parent) + return ancestor_nodes_to_run + + +def _enumerate_parents(pipeline: Pipeline, child: Node) -> list[Node]: + """For a given ``Node``, returns a list containing the direct parents + of that ``Node`` in the given ``Pipeline``. + + Args: + pipeline: the ``Pipeline`` to search for direct parents in. + child: the ``Node`` to find parents of. + + Returns: + A list of all ``Node``s that are direct parents of ``child``. 
+ + """ + parent_pipeline = pipeline.only_nodes_with_outputs(*child.inputs) + return parent_pipeline.nodes + + +def _has_persistent_inputs(node: Node, catalog: DataCatalog) -> bool: + """Check if a ``Node`` exclusively has persisted Datasets as inputs. + If at least one input is a ``MemoryDataset``, return False. + + Args: + node: the ``Node`` to check the inputs of. + catalog: the ``DataCatalog`` of the run. + + Returns: + True if the ``Node`` being checked exclusively has inputs that + are not ``MemoryDataset``, else False. + + """ + for node_input in node.inputs: + # noqa: protected-access + if isinstance(catalog._data_sets[node_input], MemoryDataset): + return False + return True def run_node( - node: Node, catalog: DataCatalog, is_async: bool = False, run_id: str = None + node: Node, + catalog: DataCatalog, + hook_manager: PluginManager, + is_async: bool = False, + session_id: str = None, ) -> Node: """Run a single `Node` with inputs from and outputs to the `catalog`. Args: node: The ``Node`` to run. catalog: A ``DataCatalog`` containing the node's inputs and outputs. + hook_manager: The ``PluginManager`` to activate hooks. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - run_id: The id of the pipeline run + session_id: The session id of the pipeline run. + + Raises: + ValueError: Raised if is_async is set to True for nodes wrapping + generator functions. Returns: The node argument. """ + if is_async and inspect.isgeneratorfunction(node.func): + raise ValueError( + f"Async data loading and saving does not work with " + f"nodes wrapping generator functions. Please make " + f"sure you don't use `yield` anywhere " + f"in node {str(node)}." + ) + if is_async: - node = _run_node_async(node, catalog, run_id) + node = _run_node_async(node, catalog, hook_manager, session_id) else: - node = _run_node_sequential(node, catalog, run_id) + node = _run_node_sequential(node, catalog, hook_manager, session_id) for name in node.confirms: catalog.confirm(name) return node -def _collect_inputs_from_hook( +def _collect_inputs_from_hook( # noqa: too-many-arguments node: Node, catalog: DataCatalog, - inputs: Dict[str, Any], + inputs: dict[str, Any], is_async: bool, - run_id: str = None, -) -> Dict[str, Any]: + hook_manager: PluginManager, + session_id: str = None, +) -> dict[str, Any]: + inputs = inputs.copy() # shallow copy to prevent in-place modification by the hook - hook_manager = get_hook_manager() - hook_response = hook_manager.hook.before_node_run( # pylint: disable=no-member + hook_response = hook_manager.hook.before_node_run( node=node, catalog=catalog, inputs=inputs, is_async=is_async, - run_id=run_id, + session_id=session_id, ) additional_inputs = {} - for response in hook_response: - if response is not None and not isinstance(response, dict): - response_type = type(response).__name__ - raise TypeError( - f"`before_node_run` must return either None or a dictionary mapping " - f"dataset names to updated values, got `{response_type}` instead." - ) - response = response or {} - additional_inputs.update(response) + if ( + hook_response is not None + ): # all hooks on a _NullPluginManager will return None instead of a list + for response in hook_response: + if response is not None and not isinstance(response, dict): + response_type = type(response).__name__ + raise TypeError( + f"'before_node_run' must return either None or a dictionary mapping " + f"dataset names to updated values, got '{response_type}' instead." 
+ ) + additional_inputs.update(response or {}) return additional_inputs -def _call_node_run( +def _call_node_run( # noqa: too-many-arguments node: Node, catalog: DataCatalog, - inputs: Dict[str, Any], + inputs: dict[str, Any], is_async: bool, - run_id: str = None, -) -> Dict[str, Any]: - hook_manager = get_hook_manager() + hook_manager: PluginManager, + session_id: str = None, +) -> dict[str, Any]: + try: outputs = node.run(inputs) except Exception as exc: - hook_manager.hook.on_node_error( # pylint: disable=no-member + hook_manager.hook.on_node_error( error=exc, node=node, catalog=catalog, inputs=inputs, is_async=is_async, - run_id=run_id, + session_id=session_id, ) raise exc - hook_manager.hook.after_node_run( # pylint: disable=no-member + hook_manager.hook.after_node_run( node=node, catalog=catalog, inputs=inputs, outputs=outputs, is_async=is_async, - run_id=run_id, + session_id=session_id, ) return outputs -def _run_node_sequential(node: Node, catalog: DataCatalog, run_id: str = None) -> Node: +def _run_node_sequential( + node: Node, + catalog: DataCatalog, + hook_manager: PluginManager, + session_id: str = None, +) -> Node: inputs = {} - hook_manager = get_hook_manager() for name in node.inputs: - hook_manager.hook.before_dataset_loaded( # pylint: disable=no-member - dataset_name=name - ) + hook_manager.hook.before_dataset_loaded(dataset_name=name, node=node) inputs[name] = catalog.load(name) - hook_manager.hook.after_dataset_loaded( # pylint: disable=no-member - dataset_name=name, data=inputs[name] + hook_manager.hook.after_dataset_loaded( + dataset_name=name, data=inputs[name], node=node ) is_async = False additional_inputs = _collect_inputs_from_hook( - node, catalog, inputs, is_async, run_id=run_id + node, catalog, inputs, is_async, hook_manager, session_id=session_id ) inputs.update(additional_inputs) - outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id) + outputs = _call_node_run( + node, catalog, inputs, is_async, hook_manager, session_id=session_id + ) - for name, data in outputs.items(): - hook_manager.hook.before_dataset_saved( # pylint: disable=no-member - dataset_name=name, data=data - ) + items: Iterable = outputs.items() + # if all outputs are iterators, then the node is a generator node + if all(isinstance(d, Iterator) for d in outputs.values()): + # Python dictionaries are ordered, so we are sure + # the keys and the chunk streams are in the same order + # [a, b, c] + keys = list(outputs.keys()) + # [Iterator[chunk_a], Iterator[chunk_b], Iterator[chunk_c]] + streams = list(outputs.values()) + # zip an endless cycle of the keys + # with an interleaved iterator of the streams + # [(a, chunk_a), (b, chunk_b), ...] 
until all outputs complete + items = zip(it.cycle(keys), interleave(*streams)) + + for name, data in items: + hook_manager.hook.before_dataset_saved(dataset_name=name, data=data, node=node) catalog.save(name, data) - hook_manager.hook.after_dataset_saved( # pylint: disable=no-member - dataset_name=name, data=data - ) + hook_manager.hook.after_dataset_saved(dataset_name=name, data=data, node=node) return node -def _run_node_async(node: Node, catalog: DataCatalog, run_id: str = None) -> Node: +def _run_node_async( + node: Node, + catalog: DataCatalog, + hook_manager: PluginManager, + session_id: str = None, +) -> Node: def _synchronous_dataset_load(dataset_name: str): """Minimal wrapper to ensure Hooks are run synchronously within an asynchronous dataset load.""" - hook_manager.hook.before_dataset_loaded( # pylint: disable=no-member - dataset_name=dataset_name - ) + hook_manager.hook.before_dataset_loaded(dataset_name=dataset_name, node=node) return_ds = catalog.load(dataset_name) - hook_manager.hook.after_dataset_loaded( # pylint: disable=no-member - dataset_name=dataset_name, data=return_ds + hook_manager.hook.after_dataset_loaded( + dataset_name=dataset_name, data=return_ds, node=node ) return return_ds with ThreadPoolExecutor() as pool: - inputs: Dict[str, Future] = {} - hook_manager = get_hook_manager() + inputs: dict[str, Future] = {} for name in node.inputs: inputs[name] = pool.submit(_synchronous_dataset_load, name) @@ -341,25 +474,28 @@ def _synchronous_dataset_load(dataset_name: str): inputs = {key: value.result() for key, value in inputs.items()} is_async = True additional_inputs = _collect_inputs_from_hook( - node, catalog, inputs, is_async, run_id=run_id + node, catalog, inputs, is_async, hook_manager, session_id=session_id ) inputs.update(additional_inputs) - outputs = _call_node_run(node, catalog, inputs, is_async, run_id=run_id) - - save_futures = set() + outputs = _call_node_run( + node, catalog, inputs, is_async, hook_manager, session_id=session_id + ) + future_dataset_mapping = {} for name, data in outputs.items(): - hook_manager.hook.before_dataset_saved( # pylint: disable=no-member - dataset_name=name, data=data + hook_manager.hook.before_dataset_saved( + dataset_name=name, data=data, node=node ) - save_futures.add(pool.submit(catalog.save, name, data)) + future = pool.submit(catalog.save, name, data) + future_dataset_mapping[future] = (name, data) - for future in as_completed(save_futures): + for future in as_completed(future_dataset_mapping): exception = future.exception() if exception: raise exception - hook_manager.hook.after_dataset_saved( # pylint: disable=no-member - dataset_name=name, data=data # pylint: disable=undefined-loop-variable + name, data = future_dataset_mapping[future] + hook_manager.hook.after_dataset_saved( + dataset_name=name, data=data, node=node ) return node diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index f28c00a9f8..59f53e7b7a 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
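When every output of a node is an iterator, the `_run_node_sequential` hunk above treats it as a generator node and pairs each yielded chunk with its dataset name by cycling the keys against an interleaved view of the streams. The short self-contained illustration below uses the same `itertools.cycle` / `more_itertools.interleave` helpers the hunk imports; the node and dataset names are invented.

```python
# Toy illustration of the generator-node save order; names are invented.
import itertools as it

from more_itertools import interleave


def toy_node():
    # Every output is an iterator of chunks, so this counts as a generator node.
    return {
        "ints": (i for i in range(3)),
        "squares": (i * i for i in range(3)),
    }


outputs = toy_node()
keys = list(outputs.keys())       # ["ints", "squares"]
streams = list(outputs.values())  # [iterator of ints, iterator of squares]

# Pair each yielded chunk with the dataset it should be saved to.
for name, chunk in zip(it.cycle(keys), interleave(*streams)):
    print(f"save {chunk!r} to {name!r}")
# save 0 to 'ints', save 0 to 'squares', save 1 to 'ints', ...
```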
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """``SequentialRunner`` is an ``AbstractRunner`` implementation. It can be used to run the ``Pipeline`` in a sequential manner using a topological sort of provided nodes. @@ -33,7 +6,9 @@ from collections import Counter from itertools import chain -from kedro.io import AbstractDataSet, DataCatalog, MemoryDataSet +from pluggy import PluginManager + +from kedro.io import AbstractDataSet, DataCatalog, MemoryDataset from kedro.pipeline import Pipeline from kedro.runner.runner import AbstractRunner, run_node @@ -65,17 +40,22 @@ def create_default_data_set(self, ds_name: str) -> AbstractDataSet: for all unregistered data sets. """ - return MemoryDataSet() + return MemoryDataset() def _run( - self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None + self, + pipeline: Pipeline, + catalog: DataCatalog, + hook_manager: PluginManager, + session_id: str = None, ) -> None: """The method implementing sequential pipeline running. Args: pipeline: The ``Pipeline`` to run. catalog: The ``DataCatalog`` from which to fetch data. - run_id: The id of the run. + hook_manager: The ``PluginManager`` to activate hooks. + session_id: The id of the session. Raises: Exception: in case of any downstream node failure. @@ -87,10 +67,10 @@ def _run( for exec_index, node in enumerate(nodes): try: - run_node(node, catalog, self._is_async, run_id) + run_node(node, catalog, hook_manager, self._is_async, session_id) done_nodes.add(node) except Exception: - self._suggest_resume_scenario(pipeline, done_nodes) + self._suggest_resume_scenario(pipeline, done_nodes, catalog) raise # decrement load counts and release any data sets we've finished with diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index 3fd42c177e..6f3d6818d1 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -1,41 +1,17 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
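The `_suggest_resume_scenario(pipeline, done_nodes, catalog)` call in the SequentialRunner hunk above depends on the `_find_persistent_ancestors` breadth-first search added earlier in this diff. A dependency-free restatement of that walk on a toy graph shows why a failure in a downstream node can point the user back to an earlier node: the search only stops at nodes whose inputs are all persisted. The graph and node names here are invented.

```python
# Toy restatement of the persistent-ancestor search; plain dicts stand in
# for Pipeline and DataCatalog, and all names are invented.
from collections import deque

parents = {                      # node -> direct parents
    "split": [],
    "train": ["split"],
    "evaluate": ["train"],
}
persistent_inputs = {            # node -> True if all of its inputs are persisted
    "split": True,
    "train": False,              # reads a MemoryDataset produced by "split"
    "evaluate": False,           # reads a MemoryDataset produced by "train"
}


def find_persistent_ancestors(children):
    ancestors, queue, visited = set(), deque(children), set(children)
    while queue:
        node = queue.popleft()
        if persistent_inputs[node]:
            ancestors.add(node)  # safe point to restart from
            continue
        for parent in parents[node]:
            if parent not in visited:
                visited.add(parent)
                queue.append(parent)
    return ancestors


# If "evaluate" fails, the suggested --from-nodes list is {"split"}: the
# nearest ancestor whose inputs are all persisted to disk.
print(find_persistent_ancestors({"evaluate"}))  # {'split'}
```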
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """``ThreadRunner`` is an ``AbstractRunner`` implementation. It can be used to run the ``Pipeline`` in parallel groups formed by toposort using threads. """ +from __future__ import annotations + import warnings from collections import Counter from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait from itertools import chain -from typing import Set -from kedro.io import AbstractDataSet, DataCatalog, MemoryDataSet +from pluggy import PluginManager + +from kedro.io import DataCatalog, MemoryDataset from kedro.pipeline import Pipeline from kedro.pipeline.node import Node from kedro.runner.runner import AbstractRunner, run_node @@ -64,9 +40,9 @@ def __init__(self, max_workers: int = None, is_async: bool = False): """ if is_async: warnings.warn( - "`ThreadRunner` doesn't support loading and saving the " + "'ThreadRunner' doesn't support loading and saving the " "node inputs and outputs asynchronously with threads. " - "Setting `is_async` to False." + "Setting 'is_async' to False." ) super().__init__(is_async=False) @@ -75,18 +51,18 @@ def __init__(self, max_workers: int = None, is_async: bool = False): self._max_workers = max_workers - def create_default_data_set(self, ds_name: str) -> AbstractDataSet: - """Factory method for creating the default data set for the runner. + def create_default_data_set(self, ds_name: str) -> MemoryDataset: # type: ignore + """Factory method for creating the default dataset for the runner. Args: - ds_name: Name of the missing data set + ds_name: Name of the missing dataset. Returns: - An instance of an implementation of AbstractDataSet to be used - for all unregistered data sets. + An instance of ``MemoryDataset`` to be used for all + unregistered datasets. 
""" - return MemoryDataSet() + return MemoryDataset() def _get_required_workers_count(self, pipeline: Pipeline): """ @@ -105,15 +81,20 @@ def _get_required_workers_count(self, pipeline: Pipeline): else required_threads ) - def _run( # pylint: disable=too-many-locals,useless-suppression - self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None + def _run( # noqa: too-many-locals,useless-suppression + self, + pipeline: Pipeline, + catalog: DataCatalog, + hook_manager: PluginManager, + session_id: str = None, ) -> None: """The abstract interface for running pipelines. Args: pipeline: The ``Pipeline`` to run. catalog: The ``DataCatalog`` from which to fetch data. - run_id: The id of the run. + hook_manager: The ``PluginManager`` to activate hooks. + session_id: The id of the session. Raises: Exception: in case of any downstream node failure. @@ -123,7 +104,7 @@ def _run( # pylint: disable=too-many-locals,useless-suppression load_counts = Counter(chain.from_iterable(n.inputs for n in nodes)) node_dependencies = pipeline.node_dependencies todo_nodes = set(node_dependencies.keys()) - done_nodes = set() # type: Set[Node] + done_nodes: set[Node] = set() futures = set() done = None max_workers = self._get_required_workers_count(pipeline) @@ -134,7 +115,14 @@ def _run( # pylint: disable=too-many-locals,useless-suppression todo_nodes -= ready for node in ready: futures.add( - pool.submit(run_node, node, catalog, self._is_async, run_id) + pool.submit( + run_node, + node, + catalog, + hook_manager, + self._is_async, + session_id, + ) ) if not futures: assert not todo_nodes, (todo_nodes, done_nodes, ready, done) @@ -144,7 +132,7 @@ def _run( # pylint: disable=too-many-locals,useless-suppression try: node = future.result() except Exception: - self._suggest_resume_scenario(pipeline, done_nodes) + self._suggest_resume_scenario(pipeline, done_nodes, catalog) raise done_nodes.add(node) self._logger.info("Completed node: %s", node.name) @@ -152,9 +140,8 @@ def _run( # pylint: disable=too-many-locals,useless-suppression "Completed %d out of %d tasks", len(done_nodes), len(nodes) ) - # decrement load counts and release any data sets we've finished - # with this is particularly important for the shared datasets we - # create above + # Decrement load counts, and release any datasets we + # have finished with. for data_set in node.inputs: load_counts[data_set] -= 1 if ( diff --git a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/__init__.py b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/__init__.py index 4a2ff49991..8e84ca9c0e 100644 --- a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/__init__.py +++ b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/__init__.py @@ -1,35 +1,10 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """ This is a boilerplate pipeline '{{ cookiecutter.pipeline_name }}' generated using Kedro {{ cookiecutter.kedro_version }} """ -from .pipeline import create_pipeline # NOQA +from .pipeline import create_pipeline + +__all__ = ["create_pipeline"] __version__ = "0.1" diff --git a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml index e94e6a6efd..c4242b73e6 100644 --- a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml +++ b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml @@ -2,4 +2,4 @@ # using Kedro {{ cookiecutter.kedro_version }}. # # Documentation for this file format can be found in "Parameters" -# Link: https://kedro.readthedocs.io/en/{{ cookiecutter.kedro_version }}/04_kedro_project_setup/02_configuration.html#parameters +# Link: https://docs.kedro.org/en/{{ cookiecutter.kedro_version }}/kedro_project_setup/configuration.html#parameters diff --git a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/nodes.py b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/nodes.py index c3689a5c66..87161a28ee 100644 --- a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/nodes.py +++ b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/nodes.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. 
-# -# See the License for the specific language governing permissions and -# limitations under the License. """ This is a boilerplate pipeline '{{ cookiecutter.pipeline_name }}' generated using Kedro {{ cookiecutter.kedro_version }} diff --git a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/pipeline.py b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/pipeline.py index 9b33123c7d..587123c64c 100644 --- a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/pipeline.py +++ b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/pipeline.py @@ -1,38 +1,10 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """ This is a boilerplate pipeline '{{ cookiecutter.pipeline_name }}' generated using Kedro {{ cookiecutter.kedro_version }} """ -from kedro.pipeline import Pipeline, node +from kedro.pipeline import Pipeline, pipeline -def create_pipeline(**kwargs): - return Pipeline([]) +def create_pipeline(**kwargs) -> Pipeline: + return pipeline([]) diff --git a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/tests/test_pipeline.py b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/tests/test_pipeline.py index 3c1354a60c..5296883ca9 100644 --- a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/tests/test_pipeline.py +++ b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/tests/test_pipeline.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """ This is a boilerplate test file for pipeline '{{ cookiecutter.pipeline_name }}' generated using Kedro {{ cookiecutter.kedro_version }}. diff --git a/kedro/templates/project/cookiecutter.json b/kedro/templates/project/cookiecutter.json index 71f06bbf59..6c697e9c6b 100644 --- a/kedro/templates/project/cookiecutter.json +++ b/kedro/templates/project/cookiecutter.json @@ -1,6 +1,6 @@ { "project_name": "New Kedro Project", - "repo_name": "{{ cookiecutter.project_name.replace(' ', '-').lower().strip('-') }}", - "python_package": "{{ cookiecutter.project_name.replace(' ', '_').lower().strip('-') }}", + "repo_name": "{{ cookiecutter.project_name.strip().replace(' ', '-').replace('_', '-').lower() }}", + "python_package": "{{ cookiecutter.project_name.strip().replace(' ', '_').replace('-', '_').lower() }}", "kedro_version": "{{ cookiecutter.kedro_version }}" } diff --git a/kedro/templates/project/prompts.yml b/kedro/templates/project/prompts.yml index 01a7da5ac0..7e4bf62f66 100644 --- a/kedro/templates/project/prompts.yml +++ b/kedro/templates/project/prompts.yml @@ -1,28 +1,9 @@ project_name: - title: "Project Name:" + title: "Project Name" text: | Please enter a human readable name for your new project. - Spaces and punctuation are allowed. - -repo_name: - title: "Repository Name:" - text: | - Please enter a directory name for your new project repository. - Alphanumeric characters, hyphens and underscores are allowed. - Lowercase is recommended. - regex_validator: "^\\w+(-*\\w+)*$" - error_message: | - It must contain only word symbols and/or hyphens, must also - start and end with alphanumeric symbol." - -python_package: - title: "Python Package Name:" - text: | - Please enter a valid Python package name for your project package. - Alphanumeric characters and underscores are allowed. - Lowercase is recommended. Package name must start with a letter - or underscore. - regex_validator: "^[a-zA-Z_]\\w{1,}$" + Spaces, hyphens, and underscores are allowed. + regex_validator: "^[\\w -]{2,}$" error_message: | - It must start with a letter or underscore, be at least 2 characters long - and contain only letters, digits, and/or underscores. + It must contain only alphanumeric symbols, spaces, underscores and hyphens and + be at least 2 characters long. 
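The cookiecutter.json and prompts.yml hunks above replace the three prompts with a single project name prompt, derive `repo_name` and `python_package` from it, and validate it with a new regex. The Jinja filters map directly onto Python string methods, so the behaviour can be checked with a short sketch; the sample names are made up.

```python
import re


# Mirrors the Jinja expressions in cookiecutter.json.
def derive_names(project_name):
    stripped = project_name.strip()
    repo_name = stripped.replace(" ", "-").replace("_", "-").lower()
    python_package = stripped.replace(" ", "_").replace("-", "_").lower()
    return repo_name, python_package


# Mirrors the regex_validator in prompts.yml ("^[\\w -]{2,}$").
validator = re.compile(r"^[\w -]{2,}$")

for name in ["My Data Project", "pipelines_2023", "x"]:
    if validator.match(name):
        print(name, "->", derive_names(name))
    else:
        print(name, "-> rejected (see error_message in prompts.yml)")
# My Data Project -> ('my-data-project', 'my_data_project')
# pipelines_2023  -> ('pipelines-2023', 'pipelines_2023')
# x               -> rejected (shorter than two characters)
```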
diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/.coveragerc b/kedro/templates/project/{{ cookiecutter.repo_name }}/.coveragerc deleted file mode 100644 index 003c131200..0000000000 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/.coveragerc +++ /dev/null @@ -1,6 +0,0 @@ -[report] -fail_under=0 -show_missing=True -exclude_lines = - pragma: no cover - raise NotImplementedError diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/.gitignore b/kedro/templates/project/{{ cookiecutter.repo_name }}/.gitignore index 1f0b415c17..8cd10b4acc 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/.gitignore +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/.gitignore @@ -144,6 +144,7 @@ celerybeat-schedule # Environments .env +.envrc .venv env/ venv/ diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/.ipython/profile_default/ipython_config.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/.ipython/profile_default/ipython_config.py deleted file mode 100644 index a76e260386..0000000000 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/.ipython/profile_default/ipython_config.py +++ /dev/null @@ -1 +0,0 @@ -c.InteractiveShellApp.extensions.append("kedro.extras.extensions.ipython") diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/README.md b/kedro/templates/project/{{ cookiecutter.repo_name }}/README.md index 44a529d9b9..07ae44d46c 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/README.md +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/README.md @@ -2,16 +2,16 @@ ## Overview -This is your new Kedro project, which was generated using `Kedro {{ cookiecutter.kedro_version }}`. +This is your new Kedro project, which was generated using `kedro {{ cookiecutter.kedro_version }}`. -Take a look at the [Kedro documentation](https://kedro.readthedocs.io) to get started. +Take a look at the [Kedro documentation](https://docs.kedro.org) to get started. ## Rules and guidelines In order to get the best out of the template: * Don't remove any lines from the `.gitignore` file we provide -* Make sure your results can be reproduced by following a [data engineering convention](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#what-is-data-engineering-convention) +* Make sure your results can be reproduced by following a data engineering convention * Don't commit data to your repository * Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/` @@ -22,7 +22,7 @@ Declare any dependencies in `src/requirements.txt` for `pip` installation and `s To install them, run: ``` -kedro install +pip install -r src/requirements.txt ``` ## How to run your Kedro pipeline @@ -51,17 +51,17 @@ To generate or update the dependency requirements for your project: kedro build-reqs ``` -This will copy the contents of `src/requirements.txt` into a new file `src/requirements.in` which will be used as the source for `pip-compile`. You can see the output of the resolution by opening `src/requirements.txt`. +This will `pip-compile` the contents of `src/requirements.txt` into a new file `src/requirements.lock`. You can see the output of the resolution by opening `src/requirements.lock`. -After this, if you'd like to update your project requirements, please update `src/requirements.in` and re-run `kedro build-reqs`. 
+After this, if you'd like to update your project requirements, please update `src/requirements.txt` and re-run `kedro build-reqs`. -[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/04_kedro_project_setup/01_dependencies.html#project-specific-dependencies) +[Further information about project dependencies](https://docs.kedro.org/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) ## How to work with Kedro and notebooks > Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `context`, `catalog`, and `startup_error`. > -> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `kedro install` you will not need to take any extra steps before you use them. +> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `pip install -r src/requirements.txt` you will not need to take any extra steps before you use them. ### Jupyter To use Jupyter notebooks in your Kedro project, you need to install Jupyter: @@ -119,4 +119,4 @@ To automatically strip out all output cell contents before committing to `git`, ## Package your Kedro project -[Further information about building project documentation and packaging your project](https://kedro.readthedocs.io/en/stable/03_tutorial/05_package_a_project.html) +[Further information about building project documentation and packaging your project](https://docs.kedro.org/en/stable/tutorial/package_a_project.html) diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/README.md b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/README.md index 8089c8fb48..4379b1e59a 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/README.md +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/README.md @@ -21,6 +21,6 @@ WARNING: Please do not put access credentials in the base configuration folder. +## Need help? -## Find out more -You can find out more about configuration from the [user guide documentation](https://kedro.readthedocs.io/en/stable/04_user_guide/03_configuration.html). +[Find out more about configuration from the Kedro documentation](https://docs.kedro.org/en/stable/kedro_project_setup/configuration.html). diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index dfeffb42bb..be73adae2a 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -1,4 +1,4 @@ # Here you can define all your data sets by using simple YAML syntax. 
# # Documentation for this file format can be found in "The Data Catalog" -# Link: https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html +# Link: https://docs.kedro.org/en/stable/data/data_catalog.html diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/logging.yml b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/logging.yml deleted file mode 100644 index 3689418056..0000000000 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/logging.yml +++ /dev/null @@ -1,66 +0,0 @@ -version: 1 -disable_existing_loggers: False -formatters: - simple: - format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - json_formatter: - format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - class: pythonjsonlogger.jsonlogger.JsonFormatter - -handlers: - console: - class: logging.StreamHandler - level: INFO - formatter: simple - stream: ext://sys.stdout - - info_file_handler: - class: logging.handlers.RotatingFileHandler - level: INFO - formatter: simple - filename: logs/info.log - maxBytes: 10485760 # 10MB - backupCount: 20 - encoding: utf8 - delay: True - - error_file_handler: - class: logging.handlers.RotatingFileHandler - level: ERROR - formatter: simple - filename: logs/errors.log - maxBytes: 10485760 # 10MB - backupCount: 20 - encoding: utf8 - delay: True - - journal_file_handler: - class: kedro.versioning.journal.JournalFileHandler - level: INFO - base_dir: logs/journals - formatter: json_formatter - -loggers: - anyconfig: - level: WARNING - handlers: [console, info_file_handler, error_file_handler] - propagate: no - - kedro.io: - level: INFO - handlers: [console, info_file_handler, error_file_handler] - propagate: no - - kedro.pipeline: - level: INFO - handlers: [console, info_file_handler, error_file_handler] - propagate: no - - kedro.journal: - level: INFO - handlers: [journal_file_handler] - propagate: no - -root: - level: INFO - handlers: [console, info_file_handler, error_file_handler] diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/.gitkeep b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/docs/source/conf.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/docs/source/conf.py index e9be509eea..b0382b6735 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/docs/source/conf.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/docs/source/conf.py @@ -1,33 +1,5 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. # {{ cookiecutter.python_package }} documentation build # configuration file, created by sphinx-quickstart. @@ -48,15 +20,13 @@ import re from kedro.framework.cli.utils import find_stylesheets -from recommonmark.transform import AutoStructify from {{ cookiecutter.python_package }} import __version__ as release # -- Project information ----------------------------------------------------- project = "{{ cookiecutter.python_package }}" -copyright = "2021, QuantumBlack Visual Analytics Limited" -author = "QuantumBlack" +author = "Kedro" # The short X.Y version. version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) @@ -82,7 +52,7 @@ "sphinx.ext.viewcode", "sphinx.ext.mathjax", "nbsphinx", - "recommonmark", + "myst_parser", "sphinx_copybutton", ] @@ -179,7 +149,7 @@ master_doc, "{{ cookiecutter.python_package }}.tex", "{{ cookiecutter.python_package }} Documentation", - "QuantumBlack", + "Kedro", "manual", ) ] @@ -251,7 +221,4 @@ def setup(app): app.connect("autodoc-skip-member", skip) # add Kedro stylesheets for stylesheet in find_stylesheets(): - app.add_stylesheet(stylesheet) - # enable rendering RST tables in Markdown - app.add_config_value("recommonmark_config", {"enable_eval_rst": True}, True) - app.add_transform(AutoStructify) + app.add_css_file(stylesheet) diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/logs/.gitkeep b/kedro/templates/project/{{ cookiecutter.repo_name }}/logs/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/logs/journals/.gitkeep b/kedro/templates/project/{{ cookiecutter.repo_name }}/logs/journals/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/pyproject.toml b/kedro/templates/project/{{ cookiecutter.repo_name }}/pyproject.toml index bf58e6fac0..7ae06368bd 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/pyproject.toml +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/pyproject.toml @@ -1,17 +1,17 @@ [tool.kedro] package_name = "{{ cookiecutter.python_package }}" project_name = "{{ cookiecutter.project_name }}" -project_version = "{{ cookiecutter.kedro_version }}" +kedro_init_version = "{{ cookiecutter.kedro_version }}" [tool.isort] -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -use_parentheses = true -line_length = 88 -known_third_party = "kedro" +profile = "black" [tool.pytest.ini_options] addopts = """ --cov-report term-missing \ --cov src/{{ cookiecutter.python_package }} -ra""" + +[tool.coverage.report] +fail_under = 0 +show_missing = true +exclude_lines = ["pragma: no cover", "raise NotImplementedError"] diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt 
index 9dc3d2c5af..aa7ee32014 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt @@ -1,14 +1,14 @@ -black==21.5b1 -flake8>=3.7.9, <4.0 -ipython~=7.10 +black~=22.0 +flake8>=3.7.9, <5.0 +ipython>=7.31.1, <8.0; python_version < '3.8' +ipython~=8.10; python_version >= '3.8' isort~=5.0 jupyter~=1.0 -jupyter_client>=5.1, <7.0 -jupyterlab~=3.0 -kedro=={{ cookiecutter.kedro_version }} -kedro-telemetry~=0.1.0; python_version < '3.9' +jupyterlab_server>=2.11.1, <2.16.0 +jupyterlab~=3.0, <3.6.0 +kedro~={{ cookiecutter.kedro_version }} +kedro-telemetry~=0.2.0 nbstripout~=0.4 -pytest-cov~=2.5 +pytest-cov~=3.0 pytest-mock>=1.7.1, <2.0 -pytest~=6.2 -wheel>=0.35, <0.37 +pytest~=7.2 diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/setup.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/setup.py index 940262245b..8e62d661f8 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/setup.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/setup.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
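The requirements hunk above splits the `ipython` pin with PEP 508 environment markers so that Python 3.7 installs stay on the 7.x line while 3.8+ moves to 8.10. The `packaging` library (not something this diff adds; used here purely for illustration) can show which pin a given interpreter would select.

```python
# Illustration only: `packaging` is not introduced by this diff.
from packaging.requirements import Requirement

pins = [
    "ipython>=7.31.1, <8.0; python_version < '3.8'",
    "ipython~=8.10; python_version >= '3.8'",
]

for line in pins:
    req = Requirement(line)
    print(
        req.name,
        str(req.specifier),
        "py3.7:", req.marker.evaluate({"python_version": "3.7"}),
        "py3.10:", req.marker.evaluate({"python_version": "3.10"}),
    )
# Exactly one of the two pins applies for any given interpreter version.
```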
- from setuptools import find_packages, setup entry_point = ( @@ -34,7 +6,7 @@ # get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: +with open("requirements.txt", encoding="utf-8") as f: # Make sure we strip all comments and options (e.g "--extra-index-url") # that arise from a modified pip.conf file that configure global options # when running kedro build-reqs @@ -52,14 +24,16 @@ install_requires=requires, extras_require={ "docs": [ + "docutils<0.18.0", "sphinx~=3.4.3", "sphinx_rtd_theme==0.5.1", "nbsphinx==0.8.1", "nbstripout~=0.4", - "recommonmark==0.7.1", "sphinx-autodoc-typehints==1.11.1", "sphinx_copybutton==0.3.1", "ipykernel>=5.3, <7.0", + "Jinja2<3.1.0", + "myst-parser~=0.17.2", ] }, ) diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py index 851e969127..785c5a40b9 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """ This module contains an example test. 
@@ -39,13 +11,25 @@ from pathlib import Path import pytest + +from kedro.config import ConfigLoader from kedro.framework.context import KedroContext +from kedro.framework.hooks import _create_hook_manager +from kedro.framework.project import settings + + +@pytest.fixture +def config_loader(): + return ConfigLoader(conf_source=str(Path.cwd() / settings.CONF_SOURCE)) @pytest.fixture -def project_context(): +def project_context(config_loader): return KedroContext( - package_name="{{ cookiecutter.python_package }}", project_path=Path.cwd() + package_name="{{ cookiecutter.python_package }}", + project_path=Path.cwd(), + config_loader=config_loader, + hook_manager=_create_hook_manager(), ) @@ -53,5 +37,5 @@ def project_context(): # and should be replaced with the ones testing the project # functionality class TestProjectContext: - def test_package_name(self, project_context): - assert project_context.package_name == "{{ cookiecutter.python_package }}" + def test_project_path(self, project_context): + assert project_context.project_path == Path.cwd() diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py index 6a65ee3d29..177bba98c1 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__init__.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
"""{{ cookiecutter.project_name }} """ diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py index c018e6878e..9e6750922a 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. """{{ cookiecutter.project_name }} file for ensuring the package is executable as `{{ cookiecutter.repo_name }}` and `python -m {{ cookiecutter.python_package }}` """ @@ -47,7 +20,7 @@ def _find_run_command(package_name): if run: # use run command from installed plugin if it exists return run - # use run command from the framework project + # use run command from `kedro.framework.cli.project` from kedro.framework.cli.project import run return run @@ -63,11 +36,11 @@ def _find_run_command_in_plugins(plugins): return group.commands["run"] -def main(): +def main(*args, **kwargs): package_name = Path(__file__).parent.name configure_project(package_name) run = _find_run_command(package_name) - run() + run(*args, **kwargs) if __name__ == "__main__": diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py deleted file mode 100644 index 135fc6b04b..0000000000 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Project hooks.""" -from typing import Any, Dict, Optional - -from kedro.framework.hooks import hook_impl -from kedro.io import DataCatalog -from kedro.versioning import Journal - - -class ProjectHooks: - @hook_impl - def register_catalog( - self, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: - return DataCatalog.from_config( - catalog, credentials, load_versions, save_version, journal - ) diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py index cdc67202e0..2109a75017 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipeline_registry.py @@ -1,41 +1,16 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
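The `main(*args, **kwargs)` change in the `__main__.py` template above forwards any arguments to the click-based `run` command, so a packaged project can be invoked programmatically (for example from a notebook) as well as via the CLI. A hedged sketch, with `my_package` and the pipeline name as placeholders:

from my_package.__main__ import main  # placeholder package name

# Equivalent to `kedro run --pipeline data_processing` on the command line;
# click parses the argument list forwarded through *args. Under click's default
# standalone mode the call may end with SystemExit once the run completes.
main(["--pipeline", "data_processing"])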
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """Project pipelines.""" -from typing import Dict +from __future__ import annotations +from kedro.framework.project import find_pipelines from kedro.pipeline import Pipeline -def register_pipelines() -> Dict[str, Pipeline]: +def register_pipelines() -> dict[str, Pipeline]: """Register the project's pipelines. Returns: - A mapping from a pipeline name to a ``Pipeline`` object. + A mapping from pipeline names to ``Pipeline`` objects. """ - return {"__default__": Pipeline([])} + pipelines = find_pipelines() + pipelines["__default__"] = sum(pipelines.values()) + return pipelines diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py index b895b36ac8..86a92b1c80 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py @@ -1,65 +1,41 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. +"""Project settings. There is no need to edit this file unless you want to change values +from the Kedro defaults. For further information, including these default values, see +https://kedro.readthedocs.io/en/stable/kedro_project_setup/settings.html.""" -"""Project settings.""" -from {{cookiecutter.python_package}}.hooks import ProjectHooks +# Instantiated project hooks. 
+# For example, after creating a hooks.py and defining a ProjectHooks class there, do +# from {{cookiecutter.python_package}}.hooks import ProjectHooks +# HOOKS = (ProjectHooks(),) -# Instantiate and list your project hooks here -HOOKS = (ProjectHooks(),) - -# List the installed plugins for which to disable auto-registry +# Installed plugins for which to disable hook auto-registration. # DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) -# Define where to store data from a KedroSession. Defaults to BaseSessionStore. -# from kedro.framework.session.store import ShelveStore -# SESSION_STORE_CLASS = ShelveStore - -# Define keyword arguments to be passed to `SESSION_STORE_CLASS` constructor +# Class that manages storing KedroSession data. +# from kedro.framework.session.store import BaseSessionStore +# SESSION_STORE_CLASS = BaseSessionStore +# Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor. # SESSION_STORE_ARGS = { # "path": "./sessions" # } -# Define custom context class. Defaults to `KedroContext` -# CONTEXT_CLASS = KedroContext - -# Define the configuration folder. Defaults to `conf` +# Directory that holds configuration. # CONF_SOURCE = "conf" -# Select the project ConfigLoader class here. -# Defaults to kedro.config.ConfigLoader -# Define the config loader. Defaults to ConfigLoader. -# from kedro.config import TemplatedConfigLoader -# CONFIG_LOADER_CLASS = TemplatedConfigLoader - -# Define keyword arguments to be passed to `CONFIG_LOADER_CLASS` constructor. -# These kwargs depend on the `ConfigLoader` class implementation. +# Class that manages how configuration is loaded. +# from kedro.config import OmegaConfigLoader +# CONFIG_LOADER_CLASS = OmegaConfigLoader +# Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor. # CONFIG_LOADER_ARGS = { -# "globals_pattern": "*globals.yml", -# "base_env": "base", -# "default_run_env": "local", +# "config_patterns": { +# "spark" : ["spark*/"], +# "parameters": ["parameters*", "parameters*/**", "**/parameters*"], +# } # } + +# Class that manages Kedro's library components. +# from kedro.framework.context import KedroContext +# CONTEXT_CLASS = KedroContext + +# Class that manages the Data Catalog. +# from kedro.io import DataCatalog +# DATA_CATALOG_CLASS = DataCatalog diff --git a/kedro/utils.py b/kedro/utils.py index 2411de2c4e..6067d96b6c 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
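The rewritten `register_pipelines()` above relies on `find_pipelines()`, which imports each `<package_name>.pipelines.<name>` module, calls its `create_pipeline()` and registers the result under `<name>`; `sum(pipelines.values())` then combines them because `Pipeline` objects support addition. An illustrative module it would discover (path and names below are placeholders, not part of the diff):

# Assumed location: src/<package_name>/pipelines/data_processing/pipeline.py
from kedro.pipeline import Pipeline, node, pipeline


def preprocess_companies(companies):
    # Placeholder node function.
    return companies


def create_pipeline(**kwargs) -> Pipeline:
    # find_pipelines() calls this and registers the result as "data_processing".
    return pipeline(
        [
            node(
                func=preprocess_companies,
                inputs="companies",
                outputs="preprocessed_companies",
                name="preprocess_companies_node",
            )
        ]
    )

Similarly, the new `settings.py` template above no longer ships a `hooks.py`; it only documents how to register one. A minimal sketch of such an optional `hooks.py` (the hook body is illustrative; pluggy lets an implementation declare just the spec arguments it needs):

import logging

from kedro.framework.hooks import hook_impl


class ProjectHooks:
    @hook_impl
    def after_catalog_created(self, catalog) -> None:
        # Log the datasets available once the DataCatalog has been assembled.
        logging.getLogger(__name__).info("Catalog contains: %s", catalog.list())


# Then, as the template comments suggest, enable it in settings.py:
# from <package_name>.hooks import ProjectHooks
# HOOKS = (ProjectHooks(),)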
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """This module provides a set of helper functions being used across different components of kedro package. """ @@ -52,5 +24,5 @@ def load_obj(obj_path: str, default_obj_path: str = "") -> Any: obj_name = obj_path_list[0] module_obj = importlib.import_module(obj_path) if not hasattr(module_obj, obj_name): - raise AttributeError(f"Object `{obj_name}` cannot be loaded from `{obj_path}`.") + raise AttributeError(f"Object '{obj_name}' cannot be loaded from '{obj_path}'.") return getattr(module_obj, obj_name) diff --git a/kedro/versioning/__init__.py b/kedro/versioning/__init__.py deleted file mode 100644 index e8fd72089c..0000000000 --- a/kedro/versioning/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``kedro.versioning`` provides functionality to setup the Journal for -capturing information required to reproduce a Kedro run. -""" - -from .journal import Journal # NOQA diff --git a/kedro/versioning/journal.py b/kedro/versioning/journal.py deleted file mode 100644 index 6a77919ba0..0000000000 --- a/kedro/versioning/journal.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -"""This module provides journal logging to enable versioning support for -Kedro project.""" -import json -import logging -import subprocess -import warnings -from pathlib import Path -from typing import Any, Dict, Mapping, Optional, Union - -_JOURNAL_KEY = "kedro.journal" - - -class Journal: - """``Journal`` class provides journal logging to enable versioning support for - Kedro project. - """ - - def __init__(self, record_data: Dict[str, Any]): - """Initialise ``Journal`` as a session of the journal versioning, - and log the project context with an unique identifier. - - Args: - record_data: JSON serializable dictionary specific to project context. - Raises: - DeprecationWarning - """ - warnings.warn( - "`Journal` is now deprecated and will be removed in Kedro 0.18.0." - "For more information, please visit " - "https://github.com/quantumblacklabs/kedro/blob/master/RELEASE.md", - DeprecationWarning, - ) - - self.run_id = record_data["run_id"] - record_data["git_sha"] = _git_sha(record_data["project_path"]) - self._log_journal("ContextJournalRecord", record_data) - - def _log_journal(self, record_type: str, record_data: Mapping) -> None: - """Log a record to journal. - - Args: - record_type: A unique type identifier. - record_data: JSON serializable dictionary, specific to ``record_type``. - - """ - # pylint: disable=no-self-use - try: - logging.getLogger(_JOURNAL_KEY).info( - json.dumps({"type": record_type, **record_data}) - ) - except TypeError: - logging.getLogger(__name__).error( - "Unable to record %s to journal, make sure it's a " - "serializable dictionary", - repr(record_data), - ) - - def log_catalog( - self, dataset_name: str, operation: str, version: str = None - ) -> None: - """Log journal record for ``DataCatalog``. - - Args: - dataset_name: Name of dataset being logged. - operation: Operation on dataset, one of {'save', 'load'}. - version: Dataset version corresponding to operation (i.e if operation - is "save" then this is "save_version"). - - """ - record_data = { - "run_id": self.run_id, - "name": dataset_name, - "operation": operation, - "version": version, - } - self._log_journal("DatasetJournalRecord", record_data) - - -def _git_sha(proj_dir: Union[str, Path] = None) -> Optional[str]: - """Git description of working tree. - - Returns: Git description or None. 
- - """ - proj_dir = str(proj_dir or Path.cwd()) - try: - res = subprocess.check_output( - ["git", "rev-parse", "--short", "HEAD"], cwd=proj_dir - ) - return res.decode().strip() - # `subprocess.check_output()` raises `NotADirectoryError` on Windows - except (subprocess.CalledProcessError, FileNotFoundError, NotADirectoryError): - logging.getLogger(__name__).warning("Unable to git describe %s", proj_dir) - return None - - -class JournalFileHandler(logging.Handler): - """Handler for logging journal record to a file based on journal ID.""" - - def __init__(self, base_dir: Union[str, Path]): - """Initialise ``JournalFileHandler`` which will handle logging journal record. - - Args: - base_dir: Base directory for saving journals. - - """ - super().__init__() - self.base_dir = Path(base_dir).expanduser() - self._file_handlers = {} # type:Dict[str, logging.FileHandler] - - def _generate_handler(self, run_id: str) -> logging.FileHandler: - """Generate unique filename for journal record path. - - Returns: - Logging FileHandler object. - - """ - self.base_dir.mkdir(parents=True, exist_ok=True) - handler_path = self.base_dir.resolve() / f"journal_{run_id}.log" - return logging.FileHandler(str(handler_path), mode="a") - - def emit(self, record: logging.LogRecord) -> None: - """Overriding emit function in logging.Handler, which will output the record to - the filelog based on run id. - - Args: - record: logging record. - - """ - message = json.loads(record.getMessage()) - - handler = self._file_handlers.setdefault( - message["run_id"], self._generate_handler(message["run_id"]) - ) - - handler.emit(record) diff --git a/kedro_technical_charter.pdf b/kedro_technical_charter.pdf new file mode 100644 index 0000000000..62ab06b7c5 Binary files /dev/null and b/kedro_technical_charter.pdf differ diff --git a/pyproject.toml b/pyproject.toml index 789df8dafd..0a46e7e0dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,219 @@ +# PEP-518 https://peps.python.org/pep-0518/ +[build-system] +# Minimum requirements for the build system to execute. +requires = ["setuptools>=65.5.1"] # PEP 518 specifications +build-backend = "setuptools.build_meta" + +[project] +name = "kedro" +authors = [ + {name = "Kedro"} +] +description = "Kedro helps you build production-ready data and analytics pipelines" +requires-python = ">=3.7" +dependencies = [ + "anyconfig~=0.10.0", + "attrs>=21.3", + "build", + "cachetools~=5.3", + "click<9.0", + "cookiecutter>=2.1.1, <3.0", + "dynaconf>=3.1.2, <4.0", + "fsspec>=2021.4, <2024.1", # Upper bound set arbitrarily, to be reassessed in early 2024 + "gitpython~=3.0", + "importlib-metadata>=3.6; python_version >= '3.8'", + "importlib_metadata>=3.6, <5.0; python_version < '3.8'", # The "selectable" entry points were introduced in `importlib_metadata` 3.6 and Python 3.10. Bandit on Python 3.7 relies on a library with `importlib_metadata` < 5.0 + "importlib_resources>=1.3", # The `files()` API was introduced in `importlib_resources` 1.3 and Python 3.9. 
+ "jmespath>=0.9.5, <2.0", + "more_itertools~=9.0", + "omegaconf~=2.3", + "parse~=1.19.0", + "pip-tools>=6.5,<8", + "pluggy~=1.0", + "PyYAML>=4.2, <7.0", + "rich>=12.0, <14.0", + "rope>=0.21, <2.0", # subject to LGPLv3 license + "setuptools>=65.5.1", + "toml~=0.10", + "toposort~=1.5", # Needs to be at least 1.5 to be able to raise CircularDependencyError +] +keywords = [ + "pipelines", + "machine learning", + "data pipelines", + "data science", + "data engineering", +] +license = {text = "Apache Software License (Apache 2.0)"} +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +dynamic = ["readme", "version", "optional-dependencies"] + +[project.urls] +Homepage = "https://kedro.org" +Source = "https://github.com/kedro-org/kedro" +Documentation = "https://docs.kedro.org" +Tracker = "https://github.com/kedro-org/kedro/issues" + +[project.scripts] +kedro = "kedro.framework.cli:main" + +[tool.setuptools] +zip-safe = false + +[tool.setuptools.packages.find] +include = ["kedro*"] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro.__version__"} + [tool.black] exclude = "/templates/|^features/steps/test_starter" [tool.isort] -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -use_parentheses = true -line_length = 88 -known_first_party = ["features", "kedro", "tests", "tools"] -default_section = "THIRDPARTY" +profile = "black" + + +[tool.pylint] +[tool.pylint.master] +ignore = "CVS" +ignore-patterns = "kedro/templates/*" +load-plugins = [ + "pylint.extensions.docparams", + "pylint.extensions.no_self_use" +] +extension-pkg-whitelist = "cv2" +unsafe-load-any-extension = false +[tool.pylint.messages_control] +disable = [ + "ungrouped-imports", + "duplicate-code", + "wrong-import-order", # taken care of by isort +] +enable = ["useless-suppression"] +[tool.pylint.refactoring] +max-nested-blocks = 5 +[tool.pylint.format] +indent-after-paren=4 +indent-string=" " +[tool.pylint.miscellaneous] +notes = [ + "FIXME", + "XXX" +] +[tool.pylint.design] +min-public-methods = 1 [tool.coverage.report] fail_under = 100 show_missing = true -omit = ["kedro/templates/*", "kedro/framework/cli/hooks/specs.py", "kedro/framework/hooks/specs.py", "kedro/extras/datasets/tensorflow/*"] +omit = [ + "kedro/templates/*", + "kedro/extras/logging/color_logger.py", + "kedro/extras/extensions/ipython.py", + "kedro/framework/cli/hooks/specs.py", + "kedro/framework/hooks/specs.py", + "kedro/extras/datasets/tensorflow/*", + "kedro/extras/datasets/holoviews/*", + "tests/*" +] exclude_lines = ["pragma: no cover", "raise NotImplementedError"] + +[tool.pytest.ini_options] +addopts=""" +--cov-config pyproject.toml \ +--cov-report xml:coverage.xml \ +--cov-report term-missing \ +--cov kedro \ +--cov tests \ +--ignore tests/template/fake_repo \ +--ignore features \ +--ignore kedro/templates \ +--no-cov-on-fail \ +-ra \ +-W ignore""" +testpaths = [ + "tests" +] + +[tool.importlinter] +root_package = "kedro" + +[[tool.importlinter.contracts]] +name = "CLI > Context > Library, Runner > Extras > IO & Pipeline" +type = "layers" +containers = "kedro" +layers = [ + "framework.cli", + "framework.session", + "framework.context", + "framework.project", + "runner", + "extras.datasets", + "io", + "pipeline", + "config" +] +ignore_imports = [ + "kedro.runner.parallel_runner -> 
kedro.framework.project", + "kedro.framework.hooks.specs -> kedro.framework.context" +] + +[[tool.importlinter.contracts]] +name = "Pipeline and IO are independent" +type = "independence" +modules = [ + "kedro.pipeline", + "kedro.io" +] + +[[tool.importlinter.contracts]] +name = "Config cannot import Runner et al" +type = "forbidden" +source_modules = [ + "kedro.config" +] +forbidden_modules = [ + "kedro.runner", + "kedro.io", + "kedro.pipeline", + "kedro.extras.datasets" +] + +[[tool.importlinter.contracts]] +name = "Runner et al cannot import Config" +type = "forbidden" +source_modules = [ + "kedro.runner", + "kedro.io", + "kedro.pipeline", + "kedro.extras.datasets" +] +forbidden_modules = [ + "kedro.config" +] +ignore_imports = [ + "kedro.framework.context.context -> kedro.config", + "kedro.framework.session.session -> kedro.config" +] + +[tool.ruff] +line-length = 88 +show-fixes = true +# select = ["A", "B", "C", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] +select = [ + "F", # Pyflakes + "E", # Pycodestyle + "W", # Pycodestyle + "UP", # pyupgrade + "I", # isort + "PL", # Pylint +] +ignore = ["E501"] # Black take care off line-too-long +unfixable = [] diff --git a/pyproject_no_spark.toml b/pyproject_no_spark.toml deleted file mode 100644 index 2e81a8c786..0000000000 --- a/pyproject_no_spark.toml +++ /dev/null @@ -1,12 +0,0 @@ -[tool.coverage.report] -fail_under = 100 -show_missing = true -omit = [ - "kedro/templates/*", - "kedro/extras/datasets/spark/*", - "tests/extras/datasets/spark/*", - "kedro/framework/cli/hooks/specs.py", - "kedro/framework/hooks/specs.py", - "kedro/extras/datasets/tensorflow/*" -] -exclude_lines = ["pragma: no cover", "raise NotImplementedError", "SKIP_IF_NO_SPARK"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 17b1457c67..0000000000 --- a/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -anyconfig~=0.10.0 -cachetools~=4.1 -click<8.0 -cookiecutter~=1.7.0 -dynaconf~=3.1.2 -fsspec>=2021.04, <2022.01 # Upper bound set arbitrarily, to be reassessed in early 2022 -gitpython~=3.0 -jmespath>=0.9.5, <1.0 -jupyter_client>=5.1, <7.0 -pip-tools~=5.0 -pluggy~=0.13.0 -python-json-logger~=2.0 -PyYAML>=4.2, <6.0 -setuptools>=38.0 -toml~=0.10 -toposort~=1.5 # Needs to be at least 1.5 to be able to raise CircularDependencyError diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 189a76c495..0000000000 --- a/setup.cfg +++ /dev/null @@ -1,71 +0,0 @@ -[metadata] -description-file=README.md - -[tool:pytest] -addopts=--cov-report xml:coverage.xml - --cov-report term-missing - --cov kedro - --cov tests - --ignore tests/template/fake_repo - --no-cov-on-fail - -ra - -W ignore - -[flake8] -ignore = E203, E266, E501, W503 -max-line-length = 80 -max-complexity = 18 -select = B,C,E,F,W,T4,B9 - -[importlinter] -root_package = kedro - -[importlinter:contract:1] -name = CLI > Context > Library, Runner > Extras > IO & Pipeline -type = layers -containers = kedro -layers = - framework.cli - framework.session - framework.context - framework.project - runner - extras.datasets - io - pipeline - config - versioning -ignore_imports = - kedro.runner.parallel_runner -> kedro.framework.project - kedro.runner.parallel_runner -> kedro.framework.session.session - -[importlinter:contract:2] -name = Pipeline and IO are independent -type = independence -modules = 
- kedro.pipeline - kedro.io - -[importlinter:contract:3] -name = Config cannot import Runner et al -type = forbidden -source_modules = - kedro.config -forbidden_modules = - kedro.runner - kedro.io - kedro.pipeline - kedro.extras.datasets - -[importlinter:contract:4] -name = Runner et al cannot import Config -type = forbidden -source_modules = - kedro.runner - kedro.io - kedro.pipeline - kedro.extras.datasets -forbidden_modules = - kedro.config -ignore_imports= - kedro.framework.context.context -> kedro.config diff --git a/setup.py b/setup.py index 4f27edfd06..b7ba8b0988 100644 --- a/setup.py +++ b/setup.py @@ -1,77 +1,14 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
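With the packaging metadata moved into `pyproject.toml` above (including the dynamic `version = {attr = "kedro.__version__"}` and README fields), the version-parsing and README-reading code disappears from `setup.py` below. A small sketch of how the same information is read back after installation; it assumes kedro is installed from this source tree and Python >= 3.8 (on 3.7 the `importlib_metadata` backport listed in the dependencies plays the same role):

import importlib.metadata

import kedro

# The wheel metadata is generated from the [project] table, so both values
# agree with __version__ in kedro/__init__.py and the pyproject description.
assert importlib.metadata.version("kedro") == kedro.__version__
print(importlib.metadata.metadata("kedro")["Summary"])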
- -import re -from codecs import open from glob import glob from itertools import chain -from os import path - -from setuptools import find_packages, setup - -name = "kedro" -here = path.abspath(path.dirname(__file__)) +from setuptools import setup -PANDAS = ( - "pandas~=1.2" # in 1.1 pandas started using fsspec, in 1.2 they fixed a lot of bugs -) +# at least 1.3 to be able to use XMLDataSet and pandas integration with fsspec +PANDAS = "pandas~=1.3" SPARK = "pyspark>=2.2, <4.0" HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" -# get package version -with open(path.join(here, name, "__init__.py"), encoding="utf-8") as f: - result = re.search(r'__version__ = ["\']([^"\']+)', f.read()) - - if not result: - raise ValueError("Can't find the version in kedro/__init__.py") - - version = result.group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# get test dependencies and installs -with open("test_requirements.txt", "r", encoding="utf-8") as f: - test_requires = [x.strip() for x in f if x.strip() and not x.startswith("-r")] - - -# Get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -doc_html_files = [ - name.replace("kedro/", "", 1) - for name in glob("kedro/framework/html/**/*", recursive=True) -] - template_files = [] for pattern in ["**/*", "**/.*", "**/.*/**", "**/.*/.**"]: template_files.extend( @@ -88,7 +25,7 @@ def _collect_requirements(requires): api_require = {"api.APIDataSet": ["requests~=2.20"]} biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]} -dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.1"]} +dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.10", "triad>=0.6.7, <1.0"]} geopandas_require = { "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] } @@ -99,54 +36,81 @@ def _collect_requirements(requires): "pandas.CSVDataSet": [PANDAS], "pandas.ExcelDataSet": [PANDAS, "openpyxl>=3.0.6, <4.0"], "pandas.FeatherDataSet": [PANDAS], - "pandas.GBQTableDataSet": [PANDAS, "pandas-gbq>=0.12.0, <1.0"], - "pandas.HDFDataSet": [PANDAS, "tables~=3.6"], + "pandas.GBQTableDataSet": [PANDAS, "pandas-gbq>=0.12.0, <0.18.0"], + "pandas.GBQQueryDataSet": [PANDAS, "pandas-gbq>=0.12.0, <0.18.0"], + "pandas.HDFDataSet": [ + PANDAS, + "tables~=3.6.0; platform_system == 'Windows'", + "tables~=3.6; platform_system != 'Windows'", + ], "pandas.JSONDataSet": [PANDAS], - "pandas.ParquetDataSet": [PANDAS, "pyarrow>=0.12.0, <4.0.0"], + "pandas.ParquetDataSet": [PANDAS, "pyarrow>=1.0, <7.0"], "pandas.SQLTableDataSet": [PANDAS, "SQLAlchemy~=1.2"], + "pandas.SQLQueryDataSet": [PANDAS, "SQLAlchemy~=1.2"], + "pandas.XMLDataSet": [PANDAS, "lxml~=4.6"], + "pandas.GenericDataSet": [PANDAS], } -pillow_require = {"pillow.ImageDataSet": ["Pillow~=8.0"]} -plotly_require = {"plotly.PlotlyDataSet": [PANDAS, "plotly~=4.14"]} +pickle_require = {"pickle.PickleDataSet": ["compress-pickle[lz4]~=2.1.0"]} +pillow_require = {"pillow.ImageDataSet": ["Pillow~=9.0"]} +video_require = { + "video.VideoDataSet": ["opencv-python~=4.5.5.64"] +} +plotly_require = { + "plotly.PlotlyDataSet": [PANDAS, "plotly>=4.8.0, <6.0"], + "plotly.JSONDataSet": ["plotly>=4.8.0, <6.0"], +} +redis_require = {"redis.PickleDataSet": ["redis~=4.1"]} spark_require = { "spark.SparkDataSet": [SPARK, HDFS, S3FS], "spark.SparkHiveDataSet": [SPARK, HDFS, S3FS], "spark.SparkJDBCDataSet": [SPARK, HDFS, S3FS], + 
"spark.DeltaTableDataSet": [SPARK, HDFS, S3FS, "delta-spark>=1.0, <3.0"], } +svmlight_require = {"svmlight.SVMLightDataSet": ["scikit-learn~=1.0.2", "scipy~=1.7.3"]} tensorflow_required = { "tensorflow.TensorflowModelDataset": [ # currently only TensorFlow V2 supported for saving and loading. # V1 requires HDF5 and serialises differently - "tensorflow~=2.0", + "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", + # https://developer.apple.com/metal/tensorflow-plugin/ + "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", ] } -yaml_require = {"yaml.YAMLDataSet": [PANDAS, "PyYAML>=4.2, <6.0"]} +yaml_require = {"yaml.YAMLDataSet": [PANDAS, "PyYAML>=4.2, <7.0"]} extras_require = { "api": _collect_requirements(api_require), "biosequence": _collect_requirements(biosequence_require), "dask": _collect_requirements(dask_require), "docs": [ + # docutils>=0.17 changed the HTML + # see https://github.com/readthedocs/sphinx_rtd_theme/issues/1115 "docutils==0.16", - "sphinx~=3.4.3", - "sphinx_rtd_theme==0.4.1", - "nbsphinx==0.8.1", - "nbstripout~=0.4", - "recommonmark==0.7.1", - "sphinx-autodoc-typehints==1.11.1", + "sphinx~=5.3.0", + "sphinx_rtd_theme==1.2.0", + # Regression on sphinx-autodoc-typehints 1.21 + # that creates some problematic docstrings + "sphinx-autodoc-typehints==1.20.2", "sphinx_copybutton==0.3.1", + "sphinx-notfound-page", "ipykernel>=5.3, <7.0", + "sphinxcontrib-mermaid~=0.7.1", + "myst-parser~=1.0.0", + "Jinja2<3.1.0", + "kedro-datasets[all]~=1.4.2", ], "geopandas": _collect_requirements(geopandas_require), - "ipython": ["ipython~=7.10"], "matplotlib": _collect_requirements(matplotlib_require), "holoviews": _collect_requirements(holoviews_require), "networkx": _collect_requirements(networkx_require), - "notebook_templates": ["nbconvert>=5.3.1, <6.0", "nbformat~=4.4"], "pandas": _collect_requirements(pandas_require), + "pickle": _collect_requirements(pickle_require), "pillow": _collect_requirements(pillow_require), + "video": _collect_requirements(video_require), "plotly": _collect_requirements(plotly_require), - "profilers": ["memory_profiler>=0.50.0, <1.0"], + "redis": _collect_requirements(redis_require), "spark": _collect_requirements(spark_require), + "svmlight": _collect_requirements(svmlight_require), "tensorflow": _collect_requirements(tensorflow_required), "yaml": _collect_requirements(yaml_require), **api_require, @@ -157,40 +121,88 @@ def _collect_requirements(requires): **holoviews_require, **networkx_require, **pandas_require, + **pickle_require, **pillow_require, + **video_require, **plotly_require, **spark_require, + **svmlight_require, **tensorflow_required, **yaml_require, } extras_require["all"] = _collect_requirements(extras_require) +extras_require["test"] = [ + "adlfs>=2021.7.1, <=2022.2; python_version == '3.7'", + "adlfs~=2023.1; python_version >= '3.8'", + "bandit>=1.6.2, <2.0", + "behave==1.2.6", + "biopython~=1.73", + "blacken-docs==1.9.2", + "black~=22.0", + "compress-pickle[lz4]~=2.1.0", + "coverage[toml]", + "dask[complete]~=2021.10", # pinned by Snyk to avoid a vulnerability + "delta-spark~=1.2.1", # 1.2.0 has a bug that breaks some of our tests: https://github.com/delta-io/delta/issues/1070 + "dill~=0.3.1", + "filelock>=3.4.0, <4.0", + "gcsfs>=2021.4, <=2023.1; python_version == '3.7'", + "gcsfs>=2023.1, <2023.3; python_version >= '3.8'", + "geopandas>=0.6.0, <1.0", + "hdfs>=2.5.8, <3.0", + "holoviews~=1.13.0", + "import-linter[toml]==1.8.0", + "ipython>=7.31.1, <8.0; python_version < '3.8'", 
+ "ipython~=8.10; python_version >= '3.8'", + "isort~=5.0", + "Jinja2<3.1.0", + "joblib>=0.14", + "jupyterlab_server>=2.11.1, <2.16.0", # 2.16.0 requires importlib_metedata >= 4.8.3 which conflicts with flake8 requirement + "jupyterlab~=3.0, <3.6.0", # 3.6.0 requires jupyterlab_server~=2.19 + "jupyter~=1.0", + "lxml~=4.6", + "matplotlib>=3.0.3, <3.4; python_version < '3.10'", # 3.4.0 breaks holoviews + "matplotlib>=3.5, <3.6; python_version == '3.10'", + "memory_profiler>=0.50.0, <1.0", + "moto==1.3.7; python_version < '3.10'", + "moto==3.0.4; python_version == '3.10'", + "networkx~=2.4", + "opencv-python~=4.5.5.64", + "openpyxl>=3.0.3, <4.0", + "pandas-gbq>=0.12.0, <0.18.0", + "pandas~=1.3 # 1.3 for read_xml/to_xml", + "Pillow~=9.0", + "plotly>=4.8.0, <6.0", + "pre-commit>=2.9.2, <3.0", # The hook `mypy` requires pre-commit version 2.9.2. + "psutil~=5.8", + "pyarrow>=6.0", + "pylint>=2.17.0, <3.0", + "pyproj~=3.0", + "pyspark>=2.2, <4.0", + "pytest-cov~=3.0", + "pytest-mock>=1.7.1, <2.0", + "pytest-xdist[psutil]~=2.2.1", + "pytest~=7.2", + "redis~=4.1", + "requests-mock~=1.6", + "requests~=2.20", + "s3fs>=0.3.0, <0.5", # Needs to be at least 0.3.0 to make use of `cachable` attribute on S3FileSystem. + "scikit-learn~=1.0.2", + "scipy~=1.7.3", + "SQLAlchemy~=1.2", + "tables~=3.6.0; platform_system == 'Windows' and python_version<'3.9'", + "tables~=3.6; platform_system != 'Windows'", + "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", + # https://developer.apple.com/metal/tensorflow-plugin/ + "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", + "triad>=0.6.7, <1.0", + "trufflehog~=2.1", + "xlsxwriter~=1.0", +] setup( - name=name, - version=version, - description="Kedro helps you build production-ready data and analytics pipelines", - license="Apache Software License (Apache 2.0)", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/quantumblacklabs/kedro", - python_requires=">=3.7, <3.10", - packages=find_packages(exclude=["docs*", "tests*", "tools*", "features*"]), - include_package_data=True, - tests_require=test_requires, - install_requires=requires, - author="QuantumBlack Labs", - entry_points={"console_scripts": ["kedro = kedro.framework.cli:main"]}, package_data={ - name: ["py.typed", "test_requirements.txt"] + template_files + doc_html_files + "kedro": ["py.typed"] + template_files }, - zip_safe=False, - keywords="pipelines, machine learning, data pipelines, data science, data engineering", - classifiers=[ - "Development Status :: 4 - Beta", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - ], extras_require=extras_require, ) diff --git a/static/img/kedro_banner.png b/static/img/kedro_banner.png index 57f6eeb58f..bdda6c9b03 100644 Binary files a/static/img/kedro_banner.png and b/static/img/kedro_banner.png differ diff --git a/static/img/pipeline_visualisation.png b/static/img/pipeline_visualisation.png deleted file mode 100644 index 9eb2aa3f2c..0000000000 Binary files a/static/img/pipeline_visualisation.png and /dev/null differ diff --git a/static/jsonschema/kedro-catalog-0.15.9.json b/static/jsonschema/kedro-catalog-0.15.9.json index beaecd5a3f..a5e755569d 100644 --- a/static/jsonschema/kedro-catalog-0.15.9.json +++ b/static/jsonschema/kedro-catalog-0.15.9.json @@ -79,7 +79,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering 
convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -113,7 +113,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -149,7 +149,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" } } } @@ -179,7 +179,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -207,7 +207,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -241,7 +241,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -273,7 +273,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -307,7 +307,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -337,7 +337,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering 
convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -369,7 +369,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -415,7 +415,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -447,7 +447,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -477,7 +477,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -503,7 +503,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -537,7 +537,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -567,7 +567,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -609,7 +609,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering 
convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -645,7 +645,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -677,7 +677,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -715,7 +715,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -753,7 +753,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -849,7 +849,7 @@ }, "credentials": { "type": "object", - "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.15.9/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" }, "load_args": { "type": "object", diff --git a/static/jsonschema/kedro-catalog-0.16.json b/static/jsonschema/kedro-catalog-0.16.json index b8cb47db3e..32b23591bd 100644 --- a/static/jsonschema/kedro-catalog-0.16.json +++ b/static/jsonschema/kedro-catalog-0.16.json @@ -60,7 +60,7 @@ }, "credentials": { "type": "object", - "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. 
If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.16.0/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" }, "load_args": { "type": "object", diff --git a/static/jsonschema/kedro-catalog-0.17.json b/static/jsonschema/kedro-catalog-0.17.json index d9c76ad87e..197e59dcb2 100644 --- a/static/jsonschema/kedro-catalog-0.17.json +++ b/static/jsonschema/kedro-catalog-0.17.json @@ -37,7 +37,9 @@ "pandas.HDFDataSet", "pandas.CSVDataSet", "pandas.ExcelDataSet", - "pandas.GBQTableDataSet" + "pandas.GBQTableDataSet", + "pandas.GBQQueryDataSet", + "pandas.GenericDataSet" ] } }, @@ -65,7 +67,7 @@ }, "credentials": { "type": "object", - "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.17.0/05_data/02_kedro_io.html#partitioned-dataset-credentials" }, "load_args": { "type": "object", @@ -374,6 +376,10 @@ "timeout": { "type": "integer", "description": "The wait time in seconds for a response, defaults to 1 minute.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#timeouts" + }, + "credentials": { + "pattern": ".*", + "description": "Same as ``auth``. Allows specifying ``auth`` secrets in \ncredentials.yml." } } } @@ -838,6 +844,38 @@ } } }, + { + "if": { "properties": { "type": { "const": "pandas.GenericDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "file_format" : { + "type": "string", + "description": "The read/write methods to retrieve from pandas (`pandas.read_{file_format}` or `pd.DataFrame.to_{file_format}`) on a best effort basis." 
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, { "if": { "properties": { "type": { "const": "pandas.ExcelDataSet" } } diff --git a/static/jsonschema/kedro-catalog-0.18.json b/static/jsonschema/kedro-catalog-0.18.json new file mode 100644 index 0000000000..13c010e5ce --- /dev/null +++ b/static/jsonschema/kedro-catalog-0.18.json @@ -0,0 +1,1423 @@ +{ + "type": "object", + "patternProperties": { + "^[a-z0-9-_]+$": { + "required": [ + "type" + ], + "properties": { + "type": { + "type": "string", + "enum": [ + "CachedDataSet", + "IncrementalDataSet", + "MemoryDataSet", + "LambdaDataSet", + "PartitionedDataSet", + "api.APIDataSet", + "biosequence.BioSequenceDataSet", + "dask.ParquetDataSet", + "email.EmailMessageDataSet", + "geopandas.GeoJSONDataSet", + "holoviews.HoloviewsWriter", + "json.JSONDataSet", + "matplotlib.MatplotlibWriter", + "networkx.NetworkXDataSet", + "pandas.CSVDataSet", + "pandas.ExcelDataSet", + "pandas.FeatherDataSet", + "pandas.GBQTableDataSet", + "pandas.HDFDataSet", + "pandas.JSONDataSet", + "pandas.ParquetDataSet", + "pandas.SQLTableDataSet", + "pandas.SQLQueryDataSet", + "pandas.XMLDataSet", + "pillow.ImageDataSet", + "pickle.PickleDataSet", + "plotly.PlotlyDataSet", + "redis.PickleDataSet", + "spark.SparkDataSet", + "spark.SparkHiveDataSet", + "spark.SparkJDBCDataSet", + "tensorflow.TensorFlowModelDataset", + "text.TextDataSet", + "tracking.JSONDataSet", + "tracking.MetricsDataSet", + "yaml.YAMLDataSet" + ] + } + }, + "allOf": [ + { + "if": { + "properties": { + "type": { + "const": "CachedDataSet" + } + } + }, + "then": { + "required": [ + "dataset" + ], + "properties": { + "dataset": { + "pattern": ".*", + "description": "A Kedro DataSet object or a dictionary to cache." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." 
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "IncrementalDataSet" + } + } + }, + "then": { + "required": [ + "path", + "dataset" + ], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "checkpoint": { + "pattern": "object", + "description": "Optional checkpoint configuration. Accepts a dictionary\nwith the corresponding dataset definition including ``filepath``\n(unlike ``dataset`` argument). Checkpoint configuration is\ndescribed here:\nhttps://kedro.readthedocs.io/en/0.18.0/data/kedro_io.html#checkpoint-configuration\nCredentials for the checkpoint can be explicitly specified\nin this configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." + }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.18.0/data/kedro_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "MemoryDataSet" + } + } + }, + "then": { + "required": [], + "properties": { + "data": { + "pattern": ".*", + "description": "Python object containing the data." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." 
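
For illustration, the `IncrementalDataSet` definition above would validate a catalog entry along these lines (name, bucket and credentials key are hypothetical):

# Hypothetical entry -- bucket and credentials key are illustrative only.
daily_extracts:
  type: IncrementalDataSet
  path: s3://my-bucket/daily-extracts/
  dataset:
    type: pandas.CSVDataSet
  filename_suffix: ".csv"
  credentials: dev_s3
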
+ } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "LambdaDataSet" + } + } + }, + "then": { + "required": [ + "load", + "save" + ], + "properties": { + "load": { + "pattern": ".*", + "description": "Method to load data from a data set." + }, + "save": { + "pattern": ".*", + "description": "Method to save data to a data set." + }, + "exists": { + "pattern": ".*", + "description": "Method to check whether output data already exists." + }, + "release": { + "pattern": ".*", + "description": "Method to release any cached information." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "PartitionedDataSet" + } + } + }, + "then": { + "required": [ + "path", + "dataset" + ], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." + }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.18.0/data/kedro_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "api.APIDataSet" + } + } + }, + "then": { + "required": [ + "url" + ], + "properties": { + "url": { + "type": "string", + "description": "The API URL endpoint." + }, + "method": { + "type": "string", + "description": "The Method of the request, GET, POST, PUT, DELETE, HEAD, etc..." 
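
For illustration, a catalog entry matching the `PartitionedDataSet` definition above could look like this sketch (name and path are hypothetical):

# Hypothetical entry -- name and path are illustrative only.
monthly_reports:
  type: PartitionedDataSet
  path: data/02_intermediate/monthly/
  dataset:
    type: pandas.ParquetDataSet
  filename_suffix: ".parquet"
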
+ }, + "data": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "params": { + "type": "object", + "description": "The url parameters of the API.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#passing-parameters-in-urls" + }, + "headers": { + "type": "object", + "description": "The HTTP headers.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#custom-headers" + }, + "auth": { + "pattern": ".*", + "description": "Anything ``requests`` accepts. Normally it's either ``('login', 'password')``,\nor ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases." + }, + "json": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests, passed in\nto the json kwarg in the requests object.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "timeout": { + "type": "integer", + "description": "The wait time in seconds for a response, defaults to 1 minute.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#timeouts" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "biosequence.BioSequenceDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to sequence file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." + }, + "load_args": { + "type": "object", + "description": "Options for parsing sequence files by Biopython ``SeqIO.parse()``." + }, + "save_args": { + "type": "object", + "description": "file format supported by Biopython ``SeqIO.write()``.\nE.g. `{\"format\": \"fasta\"}`." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\n to pass to the filesystem's `open` method through nested keys\n `open_args_load` and `open_args_save`.\n Here you can find all available arguments for `open`:\n https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\n All defaults are preserved, except `mode`, which is set to `r` when loading\n and to `w` when saving.\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "dask.ParquetDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a parquet file\nparquet collection or the directory of a multipart parquet." 
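
For illustration, an `api.APIDataSet` entry satisfying the block above might be sketched as follows (URL and query parameters are hypothetical):

# Hypothetical entry -- URL and query parameters are illustrative only.
weather_observations:
  type: api.APIDataSet
  url: https://api.example.com/v1/observations
  method: GET
  params:
    station: "001"
  timeout: 60
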
+ }, + "load_args": { + "type": "object", + "description": "Additional loading options `dask.dataframe.read_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet" + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `dask.dataframe.to_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Optional parameters to the backend file system driver:\nhttps://docs.dask.org/en/latest/remote-data-services.html#optional-parameters" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "email.EmailMessageDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "``email`` options for parsing email messages (arguments passed\ninto ``email.parser.Parser.parse``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser.parse\nIf you would like to specify options for the `Parser`,\nyou can include them under the \"parser\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser\nAll defaults are preserved, but \"policy\", which is set to ``email.policy.default``." + }, + "save_args": { + "type": "object", + "description": "``email`` options for generating MIME documents (arguments passed into\n``email.generator.Generator.flatten``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator.flatten\nIf you would like to specify options for the `Generator`,\nyou can include them under the \"generator\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "geopandas.GeoJSONDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a GeoJSON file prefixed with a protocol like\n`s3://`. 
If prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "GeoPandas options for loading GeoJSON files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference/geopandas.read_file.html" + }, + "save_args": { + "type": "object", + "description": "GeoPandas options for saving geojson files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference.html#geopandas.GeoDataFrame.to_file\nThe default_save_arg driver is 'GeoJSON', all others preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "credentials required to access the underlying filesystem.\nEg. for ``GCFileSystem`` it would look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "holoviews.HoloviewsWriter" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}}`" + }, + "save_args": { + "type": "object", + "description": "Extra save args passed to `holoviews.save()`. See\nhttps://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "json.JSONDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). 
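
For illustration, a `geopandas.GeoJSONDataSet` entry matching the definition above might look like this sketch (name and path are hypothetical):

# Hypothetical entry -- name and path are illustrative only.
region_boundaries:
  type: geopandas.GeoJSONDataSet
  filepath: data/01_raw/regions.geojson
  save_args:
    driver: GeoJSON
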
Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "matplotlib.MatplotlibWriter" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a matplot object file(s) prefixed with a protocol\nlike `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be\nused. The prefix should be any protocol supported by ``fsspec``." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}}`" + }, + "save_args": { + "type": "object", + "description": "Save args passed to `plt.savefig`. See\nhttps://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "networkx.NetworkXDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to the NetworkX graph JSON file." + }, + "load_args": { + "type": "object", + "description": "Arguments passed on to ```networkx.node_link_graph``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html" + }, + "save_args": { + "type": "object", + "description": "Arguments passed on to ```networkx.node_link_data``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.CSVDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a CSV file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.ExcelDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Excel file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "engine": { + "type": "string", + "description": "The engine used to write to excel files. The default\nengine is 'xlsxwriter'." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"xlrd\"." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under the \"writer\" key. 
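
For illustration, the `pandas.CSVDataSet` block above would accept an entry like the following sketch (name, bucket and credentials key are hypothetical):

# Hypothetical entry -- bucket and credentials key are illustrative only.
cars:
  type: pandas.CSVDataSet
  filepath: s3://my-bucket/01_raw/cars.csv
  load_args:
    sep: ","
  save_args:
    index: false
  credentials: dev_s3
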
Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.FeatherDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a feather file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading feather files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.GBQTableDataSet" + } + } + }, + "then": { + "required": [ + "dataset", + "table_name" + ], + "properties": { + "dataset": { + "type": "string", + "description": "Google BigQuery dataset." + }, + "table_name": { + "type": "string", + "description": "Google BigQuery table name." 
+ }, + "project": { + "type": "string", + "description": "Google BigQuery Account project ID.\nOptional when available from the environment.\nhttps://cloud.google.com/resource-manager/docs/creating-managing-projects" + }, + "credentials": { + "pattern": ".*", + "description": "Credentials for accessing Google APIs.\nEither ``google.auth.credentials.Credentials`` object or dictionary with\nparameters required to instantiate ``google.oauth2.credentials.Credentials``.\nHere you can find all the arguments:\nhttps://google-auth.readthedocs.io/en/latest/reference/google.oauth2.credentials.html" + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading BigQuery table into DataFrame.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving DataFrame to BigQuery table.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html\nAll defaults are preserved, but \"progress_bar\", which is set to False." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.HDFDataSet" + } + } + }, + "then": { + "required": [ + "filepath", + "key" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a hdf file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "key": { + "type": "string", + "description": "Identifier to the group in the HDF store." + }, + "load_args": { + "type": "object", + "description": "PyTables options for loading hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "PyTables options for saving hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.JSONDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
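
For illustration, a `pandas.GBQTableDataSet` entry matching the schema above might be sketched as follows (project, dataset and table names are hypothetical):

# Hypothetical entry -- project, dataset and table names are illustrative only.
customer_features:
  type: pandas.GBQTableDataSet
  dataset: analytics
  table_name: customer_features
  project: my-gcp-project
  save_args:
    if_exists: replace
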
+ }, + "load_args": { + "type": "object", + "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.ParquetDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Parquet file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nIt can also be a path to a directory. If the directory is\nprovided then it can be used for reading partitioned parquet files.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Additional options for loading Parquet file(s).\nHere you can find all available arguments when reading single file:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html\nHere you can find all available arguments when reading partitioned datasets:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `pyarrow.parquet.write_table` and\n`pyarrow.Table.from_pandas`.\nHere you can find all available arguments for `write_table()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table\nThe arguments for `from_pandas()` should be passed through a nested\nkey: `from_pandas`. E.g.: `save_args = {\"from_pandas\": {\"preserve_index\": False}}`\nHere you can find all available arguments for `from_pandas()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.SQLTableDataSet" + } + } + }, + "then": { + "required": [ + "table_name", + "credentials" + ], + "properties": { + "table_name": { + "type": "string", + "description": "The table name to load or save data to. It\noverwrites name in ``save_args`` and ``table_name``\nparameters in ``load_args``." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_table``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying pandas ``to_sql`` function along\nwith the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls\nIt has ``index=False`` in the default parameters." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.SQLQueryDataSet" + } + } + }, + "then": { + "required": [ + "sql", + "credentials" + ], + "properties": { + "sql": { + "type": "string", + "description": "The sql query statement." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
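
For illustration, the `pandas.SQLTableDataSet` definition above, which requires both `table_name` and `credentials`, could validate an entry like this sketch; the table name and credentials key are hypothetical, with the SQLAlchemy `con` string expected to live in `credentials.yml`:

# Hypothetical entry -- table name and credentials key are illustrative only.
shuttles_table:
  type: pandas.SQLTableDataSet
  table_name: shuttles
  credentials: warehouse_db    # resolves to a dict containing the SQLAlchemy 'con' string
  save_args:
    if_exists: replace
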
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_query``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "execution_options": { + "type": "object", + "description": "A dictionary with non-SQL options for the connection\nto be applied to the underlying engine.\nTo find all supported execution options, see here:\nhttps://docs.sqlalchemy.org/en/12/core/connections.html#sqlalchemy.engine.Connection.execution_options \nNote that this is not a standard argument supported by pandas API, but could be useful for handling large datasets." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pandas.XMLDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a XML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading XML files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_xml.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving XML files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_xml.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pickle.PickleDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Pickle file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "backend": { + "type": "string", + "description": "Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'." 
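
For illustration, a `pandas.SQLQueryDataSet` entry exercising the newly documented `execution_options` key might look like the sketch below (query and credentials key are hypothetical; `stream_results` is one example of a SQLAlchemy execution option):

# Hypothetical entry -- query and credentials key are illustrative only.
recent_orders:
  type: pandas.SQLQueryDataSet
  sql: "SELECT * FROM orders WHERE order_date >= '2022-01-01'"
  credentials: warehouse_db
  execution_options:
    stream_results: true
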
+ }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments for different backends:\npickle.load: https://docs.python.org/3/library/pickle.html#pickle.load\njoblib.load: https://joblib.readthedocs.io/en/latest/generated/joblib.load.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments for different backends:\npickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump\njoblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "pillow.ImageDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to an image file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "Pillow options for saving image files.\nHere you can find all available arguments:\nhttps://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save\nAll defaults are preserved." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "plotly.PlotlyDataSet" + } + } + }, + "then": { + "required": [ + "filepath", + "plotly_args" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
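
For illustration, the `pickle.PickleDataSet` block above, with its optional `backend` switch, would accept an entry such as this sketch (name and path are hypothetical):

# Hypothetical entry -- name and path are illustrative only.
trained_model:
  type: pickle.PickleDataSet
  filepath: data/06_models/trained_model.pkl
  backend: joblib
  save_args:
    compress: 3
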
+ }, + "plotly_args": { + "type": "object", + "description": "Plotly configuration for generating a plotly graph object Figure\nrepresenting the plotted data." + }, + "load_args": { + "type": "object", + "description": "Plotly options for loading JSON files.\nHere you can find all available arguments:\nhttps://plotly.com/python-api-reference/generated/plotly.io.from_json.html#plotly.io.from_json\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Plotly options for saving JSON files.\nHere you can find all available arguments:\nhttps://plotly.com/python-api-reference/generated/plotly.io.write_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "redis.PickleDataSet" + } + } + }, + "then": { + "required": [ + "key" + ], + "properties": { + "key": { + "type": "string", + "description": "The key to use for saving/loading object to Redis." + }, + "backend": { + "type": "string", + "description": "Backend to use, must be an import path to a module which satisfies the ``pickle`` interface.\nThat is, contains a `loads` and `dumps` function. Defaults to 'pickle'." + }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments:\nhttps://docs.python.org/3/library/pickle.html#pickle.loads\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments:\nhttps://docs.python.org/3/library/pickle.html#pickle.dumps\nAll defaults are preserved." + }, + "credentials": { + "type": "object", + "description": "Credentials required to get access to the redis server." + }, + "redis_args": { + "type": "object", + "description": "Extra arguments to pass into the redis client constructor ``redis.StrictRedis.from_url``, as well as to pass to the ``redis.StrictRedis.set``" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "spark.SparkDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Spark dataframe. When using Databricks\nand working with data written to mount path points,\nspecify ``filepath``s for (versioned) ``SparkDataSet``s\nstarting with ``/dbfs/mnt``." + }, + "file_format": { + "type": "string", + "description": "File format used during load and save\noperations. These are formats supported by the running\nSparkContext include parquet, csv. 
For a list of supported\nformats please refer to Apache Spark documentation at\nhttps://spark.apache.org/docs/latest/sql-programming-guide.html" + }, + "load_args": { + "type": "object", + "description": "Load args passed to Spark DataFrameReader load method.\nIt is dependent on the selected file format. You can find\na list of read options for each supported format\nin Spark DataFrame read documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "save_args": { + "type": "object", + "description": "Save args passed to Spark DataFrame write options.\nSimilar to load_args this is dependent on the selected file\nformat. You can pass ``mode`` and ``partitionBy`` to specify\nyour overwrite mode and partitioning respectively. You can find\na list of options for each format in Spark DataFrame\nwrite documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials to access the S3 bucket, such as\n``key``, ``secret``, if ``filepath`` prefix is ``s3a://`` or ``s3n://``.\nOptional keyword arguments passed to ``hdfs.client.InsecureClient``\nif ``filepath`` prefix is ``hdfs://``. Ignored otherwise." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "spark.SparkHiveDataSet" + } + } + }, + "then": { + "required": [ + "database", + "table", + "write_mode" + ], + "properties": { + "database": { + "type": "string", + "description": "The name of the hive database." + }, + "table": { + "type": "string", + "description": "The name of the table within the database." + }, + "write_mode": { + "type": "string", + "description": "``insert``, ``upsert`` or ``overwrite`` are supported." + }, + "table_pk": { + "type": "array", + "description": "If performing an upsert, this identifies the primary key columns used to\nresolve preexisting data. Is required for ``write_mode=\"upsert\"``." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "spark.SparkJDBCDataSet" + } + } + }, + "then": { + "required": [ + "url", + "table" + ], + "properties": { + "url": { + "type": "string", + "description": "A JDBC URL of the form ``jdbc:subprotocol:subname``." + }, + "table": { + "type": "string", + "description": "The name of the table to load or save data to." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "A dictionary of JDBC database connection arguments.\nNormally at least properties ``user`` and ``password`` with\ntheir corresponding values. It updates ``properties``\nparameter in ``load_args`` and ``save_args`` in case it is\nprovided." + }, + "load_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameReader.jdbc.html" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. 
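
For illustration, a `spark.SparkDataSet` entry using the `mode` and `partitionBy` save options called out in the block above might be sketched as follows (name, path and partition column are hypothetical):

# Hypothetical entry -- name, path and partition column are illustrative only.
events_spark:
  type: spark.SparkDataSet
  filepath: data/01_raw/events.parquet
  file_format: parquet
  save_args:
    mode: overwrite
    partitionBy: ["event_date"]
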
To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html" + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "tensorflow.TensorFlowModelDataset" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a TensorFlow model directory prefixed with a\nprotocol like `s3://`. If prefix is not provided `file` protocol (local filesystem)\nwill be used. The prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "TensorFlow options for loading models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/load_model\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "TensorFlow options for saving models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/save_model\nAll defaults are preserved, except for \"save_format\", which is set to \"tf\"." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "text.TextDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "tracking.JSONDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). 
Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "tracking.MetricsDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { + "type": { + "const": "yaml.YAMLDataSet" + } + } + }, + "then": { + "required": [ + "filepath" + ], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a YAML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "PyYAML options for saving YAML files (arguments passed\ninto ```yaml.dump``). Here you can find all available arguments:\nhttps://pyyaml.org/wiki/PyYAMLDocumentation\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": [ + "object", + "string" + ], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + } + ] + } + } +} diff --git a/test_requirements.txt b/test_requirements.txt deleted file mode 100644 index 48189ddafd..0000000000 --- a/test_requirements.txt +++ /dev/null @@ -1,48 +0,0 @@ --r requirements.txt -adlfs~=0.7 -bandit>=1.6.2, <2.0 -behave==1.2.6 -biopython~=1.73 -black==21.5b1 -blacken-docs==1.9.2 -compress-pickle[lz4]~=1.2.0 -dask[complete]~=2021.1 -flake8~=3.5 -gcsfs>=2021.04, <2022.01 # Upper bound set arbitrarily, to be reassessed in early 2022 -geopandas>=0.6.0, <1.0 -hdfs>=2.5.8, <3.0 -holoviews~=1.13.0 -import-linter==1.0 -ipython~=7.10 -joblib>=0.14 -matplotlib>=3.0.3, <3.4 # 3.4.0 breaks holoviews -memory_profiler>=0.50.0, <1.0 -moto==1.3.7 -mypy==0.812 -nbconvert>=5.3.1, <6.0 -nbformat~=4.4 -networkx~=2.4 -openpyxl>=3.0.3, <4.0 -pandas-gbq>=0.12.0, <1.0 -pandas~=1.2 -Pillow~=8.0 -plotly~=4.14 -pre-commit~=1.17 -psutil==5.8.0 -pyarrow>=0.12.0, <4.0.0 -pylint>=2.5.2, <3.0 -pyproj~=3.0 -pyspark>=2.2, <4.0 -pytest-cov~=2.5 -pytest-mock>=1.7.1, <2.0 -pytest-xdist[psutil]~=2.2.1 -pytest~=6.2 -requests-mock~=1.6 -requests~=2.20 -s3fs>=0.3.0, <0.5 # Needs to be at least 0.3.0 to make use of `cachable` attribute on S3FileSystem. -SQLAlchemy~=1.2 -tables~=3.6 -tensorflow~=2.0 -trufflehog~=2.1 -wheel~=0.35 -xlsxwriter~=1.0 diff --git a/tests/__init__.py b/tests/__init__.py index 92cb4eff58..e69de29bb2 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,29 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import kedro.config.default_logger # noqa diff --git a/tests/config/test_config.py b/tests/config/test_config.py index f1d5e06ad2..110d3de692 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -1,39 +1,13 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. +from __future__ import annotations import configparser import json import re from pathlib import Path -from typing import Dict import pytest import yaml +from yaml.parser import ParserError from kedro.config import BadConfigException, ConfigLoader, MissingConfigException @@ -41,34 +15,13 @@ _BASE_ENV = "base" -def _get_local_logging_config(): - return { - "version": 1, - "formatters": { - "simple": {"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"} - }, - "root": {"level": "INFO", "handlers": ["console"]}, - "loggers": { - "kedro": {"level": "INFO", "handlers": ["console"], "propagate": False} - }, - "handlers": { - "console": { - "class": "logging.StreamHandler", - "level": "INFO", - "formatter": "simple", - "stream": "ext://sys.stdout", - } - }, - } - - -def _write_yaml(filepath: Path, config: Dict): +def _write_yaml(filepath: Path, config: dict): filepath.parent.mkdir(parents=True, exist_ok=True) yaml_str = yaml.dump(config) filepath.write_text(yaml_str) -def _write_json(filepath: Path, config: Dict): +def _write_json(filepath: Path, config: dict): filepath.parent.mkdir(parents=True, exist_ok=True) json_str = json.dumps(config) filepath.write_text(json_str) @@ -113,14 +66,12 @@ def local_config(tmp_path): def create_config_dir(tmp_path, base_config, local_config): proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" local_catalog = tmp_path / _DEFAULT_RUN_ENV / "catalog.yml" - local_logging = tmp_path / _DEFAULT_RUN_ENV / "logging.yml" parameters = tmp_path / _BASE_ENV / "parameters.json" db_config_path = tmp_path / _BASE_ENV / "db.ini" - project_parameters = dict(param1=1, param2=2) + project_parameters = {"param1": 1, "param2": 2} _write_yaml(proj_catalog, base_config) _write_yaml(local_catalog, local_config) - _write_yaml(local_logging, _get_local_logging_config()) _write_json(parameters, project_parameters) 
_write_dummy_ini(db_config_path) @@ -142,6 +93,19 @@ def proj_catalog_nested(tmp_path): class TestConfigLoader: + @use_config_dir + def test_load_core_config_dict_get(self, tmp_path): + """Make sure core config can be fetched with a dict [] access.""" + conf = ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV) + params = conf["parameters"] + catalog = conf["catalog"] + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["boats"]["type"] == "MemoryDataSet" + assert not catalog["cars"]["save_args"]["index"] + @use_config_dir def test_load_local_config(self, tmp_path): """Make sure that configs from `local/` override the ones @@ -270,7 +234,7 @@ def test_same_key_in_same_dir(self, tmp_path, base_config): def test_empty_patterns(self, tmp_path): """Check the error if no config patterns were specified""" pattern = ( - r"`patterns` must contain at least one glob pattern " + r"'patterns' must contain at least one glob pattern " r"to match config filenames against" ) with pytest.raises(ValueError, match=pattern): @@ -289,6 +253,27 @@ def test_no_files_found(self, tmp_path): with pytest.raises(MissingConfigException, match=pattern): ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV).get("non-existent-pattern") + @use_config_dir + def test_key_not_found_dict_get(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + with pytest.raises(KeyError): + # pylint: disable=expression-not-assigned + ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV)["non-existent-pattern"] + + @use_config_dir + def test_no_files_found_dict_get(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + pattern = ( + r"No files found in " + r"\[\'.*base\', " + r"\'.*local\'\] " + r"matching the glob pattern\(s\): " + r"\[\'credentials\*\', \'credentials\*/\**\', \'\**/credentials\*\'\]" + ) + with pytest.raises(MissingConfigException, match=pattern): + # pylint: disable=expression-not-assigned + ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV)["credentials"] + def test_duplicate_paths(self, tmp_path, caplog): """Check that trying to load the same environment config multiple times logs a warning and skips the reload""" @@ -332,3 +317,58 @@ def test_overlapping_patterns(self, tmp_path, caplog): f"Config file(s): {expected_path} already processed, skipping loading..." ) assert expected_message in log_messages + + def test_yaml_parser_error(self, tmp_path): + conf_path = tmp_path / _BASE_ENV + conf_path.mkdir(parents=True, exist_ok=True) + + example_catalog = """ + example_iris_data: + type: pandas.CSVDataSet + filepath: data/01_raw/iris.csv + """ + + (conf_path / "catalog.yml").write_text(example_catalog) + + msg = f"Invalid YAML file {conf_path / 'catalog.yml'}, unable to read line 3, position 10." 
+ with pytest.raises(ParserError, match=re.escape(msg)): + ConfigLoader(str(tmp_path)).get("catalog*.yml") + + def test_customised_config_patterns(self, tmp_path): + config_loader = ConfigLoader( + conf_source=str(tmp_path), + config_patterns={ + "spark": ["spark*/"], + "parameters": ["params*", "params*/**", "**/params*"], + }, + ) + assert config_loader.config_patterns["catalog"] == [ + "catalog*", + "catalog*/**", + "**/catalog*", + ] + assert config_loader.config_patterns["spark"] == ["spark*/"] + assert config_loader.config_patterns["parameters"] == [ + "params*", + "params*/**", + "**/params*", + ] + + @use_config_dir + def test_adding_extra_keys_to_confloader(self, tmp_path): + """Make sure extra keys can be added directly to the config loader instance.""" + conf = ConfigLoader(str(tmp_path)) + catalog = conf["catalog"] + conf["spark"] = {"spark_config": "emr.blabla"} + + assert catalog["trains"]["type"] == "MemoryDataSet" + assert conf["spark"] == {"spark_config": "emr.blabla"} + + @use_config_dir + def test_bypass_catalog_config_loading(self, tmp_path): + """Make sure core config loading can be bypassed by setting the key and values + directly on the config loader instance.""" + conf = ConfigLoader(str(tmp_path)) + conf["catalog"] = {"catalog_config": "something_new"} + + assert conf["catalog"] == {"catalog_config": "something_new"} diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py new file mode 100644 index 0000000000..6358c9e6ac --- /dev/null +++ b/tests/config/test_omegaconf_config.py @@ -0,0 +1,649 @@ +# pylint: disable=expression-not-assigned, pointless-statement +from __future__ import annotations + +import configparser +import json +import os +import re +import subprocess +import zipfile +from pathlib import Path + +import pytest +import yaml +from omegaconf import OmegaConf, errors +from omegaconf.resolvers import oc +from yaml.parser import ParserError + +from kedro.config import MissingConfigException, OmegaConfigLoader + +_DEFAULT_RUN_ENV = "local" +_BASE_ENV = "base" + + +def _write_yaml(filepath: Path, config: dict): + filepath.parent.mkdir(parents=True, exist_ok=True) + yaml_str = yaml.dump(config) + filepath.write_text(yaml_str) + + +def _write_json(filepath: Path, config: dict): + filepath.parent.mkdir(parents=True, exist_ok=True) + json_str = json.dumps(config) + filepath.write_text(json_str) + + +def _write_dummy_ini(filepath: Path): + filepath.parent.mkdir(parents=True, exist_ok=True) + config = configparser.ConfigParser() + config["prod"] = {"url": "postgresql://user:pass@url_prod/db"} + config["staging"] = {"url": "postgresql://user:pass@url_staging/db"} + with filepath.open("wt") as configfile: # save + config.write(configfile) + + +@pytest.fixture +def base_config(tmp_path): + filepath = str(tmp_path / "cars.csv") + return { + "trains": {"type": "MemoryDataSet"}, + "cars": { + "type": "pandas.CSVDataSet", + "filepath": filepath, + "save_args": {"index": True}, + }, + } + + +@pytest.fixture +def local_config(tmp_path): + filepath = str(tmp_path / "cars.csv") + return { + "cars": { + "type": "pandas.CSVDataSet", + "filepath": filepath, + "save_args": {"index": False}, + }, + "boats": {"type": "MemoryDataSet"}, + } + + +@pytest.fixture +def create_config_dir(tmp_path, base_config, local_config): + base_catalog = tmp_path / _BASE_ENV / "catalog.yml" + base_logging = tmp_path / _BASE_ENV / "logging.yml" + base_spark = tmp_path / _BASE_ENV / "spark.yml" + + local_catalog = tmp_path / _DEFAULT_RUN_ENV / "catalog.yml" + + 
parameters = tmp_path / _BASE_ENV / "parameters.json" + base_parameters = {"param1": 1, "param2": 2, "interpolated_param": "${test_env}"} + base_global_parameters = {"test_env": "base"} + local_global_parameters = {"test_env": "local"} + + _write_yaml(base_catalog, base_config) + _write_yaml(local_catalog, local_config) + + # Empty Config + _write_yaml(base_logging, {"version": 1}) + _write_yaml(base_spark, {"dummy": 1}) + + _write_json(parameters, base_parameters) + _write_json(tmp_path / _BASE_ENV / "parameters_global.json", base_global_parameters) + _write_json( + tmp_path / _DEFAULT_RUN_ENV / "parameters_global.json", local_global_parameters + ) + + +@pytest.fixture +def proj_catalog(tmp_path, base_config): + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" + _write_yaml(proj_catalog, base_config) + + +@pytest.fixture +def proj_catalog_nested(tmp_path): + path = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(path, {"nested": {"type": "MemoryDataSet"}}) + + +@pytest.fixture +def proj_catalog_env_variable(tmp_path): + path = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(path, {"test": {"file_path": "${oc.env:TEST_FILE_PATH}"}}) + + +@pytest.fixture +def proj_credentials_env_variable(tmp_path): + path = tmp_path / _DEFAULT_RUN_ENV / "credentials.yml" + _write_yaml( + path, {"user": {"name": "${oc.env:TEST_USERNAME}", "key": "${oc.env:TEST_KEY}"}} + ) + + +use_config_dir = pytest.mark.usefixtures("create_config_dir") +use_proj_catalog = pytest.mark.usefixtures("proj_catalog") +use_credentials_env_variable_yml = pytest.mark.usefixtures( + "proj_credentials_env_variable" +) +use_catalog_env_variable_yml = pytest.mark.usefixtures("proj_catalog_env_variable") + + +class TestOmegaConfigLoader: + @use_config_dir + def test_load_core_config_dict_syntax(self, tmp_path): + """Make sure core config can be fetched with a dict [] access.""" + conf = OmegaConfigLoader(str(tmp_path)) + params = conf["parameters"] + catalog = conf["catalog"] + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_load_core_config_get_syntax(self, tmp_path): + """Make sure core config can be fetched with .get()""" + conf = OmegaConfigLoader(str(tmp_path)) + params = conf.get("parameters") + catalog = conf.get("catalog") + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_load_local_config_overrides_base(self, tmp_path): + """Make sure that configs from `local/` override the ones + from `base/`""" + conf = OmegaConfigLoader(str(tmp_path)) + params = conf["parameters"] + catalog = conf["catalog"] + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["boats"]["type"] == "MemoryDataSet" + assert not catalog["cars"]["save_args"]["index"] + + @use_proj_catalog + def test_load_base_config(self, tmp_path, base_config): + """Test config loading if `local/` directory is empty""" + (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) + catalog = OmegaConfigLoader(str(tmp_path))["catalog"] + assert catalog == base_config + + @use_proj_catalog + def test_duplicate_patterns(self, tmp_path, base_config): + """Test config loading if the glob patterns cover the same file""" + (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) + conf = OmegaConfigLoader(str(tmp_path)) + catalog1 = conf["catalog"] + catalog2 = conf["catalog"] + assert catalog1 == catalog2 == 
base_config + + def test_subdirs_dont_exist(self, tmp_path, base_config): + """Check the error when config paths don't exist""" + pattern = ( + r"Given configuration path either does not exist " + r"or is not a valid directory\: {}" + ) + with pytest.raises(MissingConfigException, match=pattern.format(".*base")): + OmegaConfigLoader(str(tmp_path))["catalog"] + with pytest.raises(MissingConfigException, match=pattern.format(".*local")): + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" + _write_yaml(proj_catalog, base_config) + OmegaConfigLoader(str(tmp_path))["catalog"] + + @pytest.mark.usefixtures("create_config_dir", "proj_catalog", "proj_catalog_nested") + def test_nested(self, tmp_path): + """Test loading the config from subdirectories""" + config_loader = OmegaConfigLoader(str(tmp_path)) + config_loader.default_run_env = "prod" + + prod_catalog = tmp_path / "prod" / "catalog.yml" + _write_yaml(prod_catalog, {}) + + catalog = config_loader["catalog"] + assert catalog.keys() == {"cars", "trains", "nested"} + assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["cars"]["save_args"]["index"] is True + assert catalog["nested"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_nested_subdirs_duplicate(self, tmp_path, base_config): + """Check the error when the configs from subdirectories contain + duplicate keys""" + nested = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(nested, base_config) + + pattern = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*nested\.yml|.*nested\.yml and .*catalog\.yml)" + r"\: cars, trains" + ) + with pytest.raises(ValueError, match=pattern): + OmegaConfigLoader(str(tmp_path))["catalog"] + + @use_config_dir + def test_multiple_nested_subdirs_duplicates( + self, tmp_path, base_config, local_config + ): + """Check the error when several config files from subdirectories contain + duplicate keys""" + nested = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(nested, base_config) + + local = tmp_path / _BASE_ENV / "catalog" / "dir" / "local.yml" + _write_yaml(local, local_config) + + pattern_catalog_nested = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*nested\.yml|.*nested\.yml and .*catalog\.yml)" + r"\: cars, trains" + ) + pattern_catalog_local = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*local\.yml|.*local\.yml and .*catalog\.yml)" + r"\: cars" + ) + pattern_nested_local = ( + r"Duplicate keys found in " + r"(.*nested\.yml and .*local\.yml|.*local\.yml and .*nested\.yml)" + r"\: cars" + ) + + with pytest.raises(ValueError) as exc: + OmegaConfigLoader(str(tmp_path))["catalog"] + assert re.search(pattern_catalog_nested, str(exc.value)) + assert re.search(pattern_catalog_local, str(exc.value)) + assert re.search(pattern_nested_local, str(exc.value)) + + @use_config_dir + def test_bad_config_syntax(self, tmp_path): + conf_path = tmp_path / _BASE_ENV + conf_path.mkdir(parents=True, exist_ok=True) + (conf_path / "catalog.yml").write_text("bad:\nconfig") + + pattern = f"Invalid YAML or JSON file {conf_path.as_posix()}" + with pytest.raises(ParserError, match=re.escape(pattern)): + OmegaConfigLoader(str(tmp_path))["catalog"] + + def test_lots_of_duplicates(self, tmp_path): + data = {str(i): i for i in range(100)} + _write_yaml(tmp_path / _BASE_ENV / "catalog1.yml", data) + _write_yaml(tmp_path / _BASE_ENV / "catalog2.yml", data) + + conf = OmegaConfigLoader(str(tmp_path)) + pattern = ( + r"Duplicate keys found in " + r"(.*catalog2\.yml and 
.*catalog1\.yml|.*catalog1\.yml and .*catalog2\.yml)" + r"\: .*\.\.\.$" + ) + with pytest.raises(ValueError, match=pattern): + conf["catalog"] + + @use_config_dir + def test_same_key_in_same_dir(self, tmp_path, base_config): + """Check the error if 2 files in the same config dir contain + the same top-level key""" + dup_json = tmp_path / _BASE_ENV / "catalog.json" + _write_json(dup_json, base_config) + + pattern = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*catalog\.json|.*catalog\.json and .*catalog\.yml)" + r"\: cars, trains" + ) + with pytest.raises(ValueError, match=pattern): + OmegaConfigLoader(str(tmp_path))["catalog"] + + @use_config_dir + def test_pattern_key_not_found(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + key = "non-existent-pattern" + pattern = f"No config patterns were found for '{key}' in your config loader" + with pytest.raises(KeyError, match=pattern): + OmegaConfigLoader(str(tmp_path))[key] + + @use_config_dir + def test_cannot_load_non_yaml_or_json_files(self, tmp_path): + db_patterns = {"db": ["db*"]} + db_config_path = tmp_path / _BASE_ENV / "db.ini" + _write_dummy_ini(db_config_path) + + conf = OmegaConfigLoader(str(tmp_path), config_patterns=db_patterns) + pattern = ( + r"No files of YAML or JSON format found in " + r".*base or " + r".*local " + r"matching the glob pattern\(s\): " + r"\[\'db\*\'\]" + ) + with pytest.raises(MissingConfigException, match=pattern): + conf["db"] + + @use_config_dir + def test_no_files_found(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + pattern = ( + r"No files of YAML or JSON format found in " + r".*base or " + r".*local " + r"matching the glob pattern\(s\): " + r"\[\'credentials\*\', \'credentials\*/\**\', \'\**/credentials\*\'\]" + ) + with pytest.raises(MissingConfigException, match=pattern): + OmegaConfigLoader(str(tmp_path))["credentials"] + + def test_empty_catalog_file(self, tmp_path): + """Check that empty catalog file is read and returns an empty dict""" + _write_yaml(tmp_path / _BASE_ENV / "catalog_empty.yml", {}) + catalog_patterns = {"catalog": ["catalog*", "catalog*/**", "**/catalog*"]} + catalog = OmegaConfigLoader( + conf_source=tmp_path, env="base", config_patterns=catalog_patterns + )["catalog"] + assert catalog == {} + + def test_overlapping_patterns(self, tmp_path, mocker): + """Check that same configuration file is not loaded more than once.""" + _write_yaml( + tmp_path / _BASE_ENV / "catalog0.yml", + {"env": _BASE_ENV, "common": "common"}, + ) + _write_yaml( + tmp_path / "dev" / "catalog1.yml", {"env": "dev", "dev_specific": "wiz"} + ) + _write_yaml(tmp_path / "dev" / "user1" / "catalog2.yml", {"user1_c2": True}) + + catalog_patterns = { + "catalog": [ + "catalog*", + "catalog*/**", + "../**/user1/catalog2*", + "../**/catalog2*", + ] + } + + catalog = OmegaConfigLoader( + conf_source=str(tmp_path), env="dev", config_patterns=catalog_patterns + )["catalog"] + expected_catalog = { + "env": "dev", + "common": "common", + "dev_specific": "wiz", + "user1_c2": True, + } + assert catalog == expected_catalog + + mocked_load = mocker.patch("omegaconf.OmegaConf.load") + expected_path = (tmp_path / "dev" / "user1" / "catalog2.yml").resolve() + assert mocked_load.called_once_with(expected_path) + + def test_yaml_parser_error(self, tmp_path): + conf_path = tmp_path / _BASE_ENV + conf_path.mkdir(parents=True, exist_ok=True) + + example_catalog = """ + example_iris_data: + type: pandas.CSVDataSet + filepath: data/01_raw/iris.csv + """ 
+ + (conf_path / "catalog.yml").write_text(example_catalog) + + msg = ( + f"Invalid YAML or JSON file {Path(conf_path, 'catalog.yml').as_posix()}, unable to read" + f" line 3, position 10." + ) + with pytest.raises(ParserError, match=re.escape(msg)): + OmegaConfigLoader(str(tmp_path))["catalog"] + + def test_customised_config_patterns(self, tmp_path): + config_loader = OmegaConfigLoader( + conf_source=str(tmp_path), + config_patterns={ + "spark": ["spark*/"], + "parameters": ["params*", "params*/**", "**/params*"], + }, + ) + assert config_loader.config_patterns["catalog"] == [ + "catalog*", + "catalog*/**", + "**/catalog*", + ] + assert config_loader.config_patterns["spark"] == ["spark*/"] + assert config_loader.config_patterns["parameters"] == [ + "params*", + "params*/**", + "**/params*", + ] + + def test_destructive_merging_strategy(self, tmp_path): + mlflow_patterns = {"mlflow": ["mlflow*", "mlflow*/**", "**/mlflow*"]} + base_mlflow = tmp_path / _BASE_ENV / "mlflow.yml" + base_config = { + "tracking": { + "disable_tracking": {"pipelines": "[on_exit_notification]"}, + "experiment": { + "name": "name-of-local-experiment", + }, + "params": {"long_params_strategy": "tag"}, + } + } + local_mlflow = tmp_path / _DEFAULT_RUN_ENV / "mlflow.yml" + local_config = { + "tracking": { + "experiment": { + "name": "name-of-prod-experiment", + }, + } + } + + _write_yaml(base_mlflow, base_config) + _write_yaml(local_mlflow, local_config) + + conf = OmegaConfigLoader(str(tmp_path), config_patterns=mlflow_patterns)[ + "mlflow" + ] + + assert conf == { + "tracking": { + "experiment": { + "name": "name-of-prod-experiment", + }, + } + } + + @use_config_dir + def test_adding_extra_keys_to_confloader(self, tmp_path): + """Make sure extra keys can be added directly to the config loader instance.""" + conf = OmegaConfigLoader(str(tmp_path)) + catalog = conf["catalog"] + conf["spark"] = {"spark_config": "emr.blabla"} + + assert catalog["trains"]["type"] == "MemoryDataSet" + assert conf["spark"] == {"spark_config": "emr.blabla"} + + @use_config_dir + def test_bypass_catalog_config_loading(self, tmp_path): + """Make sure core config loading can be bypassed by setting the key and values + directly on the config loader instance.""" + conf = OmegaConfigLoader(str(tmp_path)) + conf["catalog"] = {"catalog_config": "something_new"} + + assert conf["catalog"] == {"catalog_config": "something_new"} + + @use_config_dir + @use_credentials_env_variable_yml + def test_load_credentials_from_env_variables(self, tmp_path): + """Load credentials from environment variables""" + conf = OmegaConfigLoader(str(tmp_path)) + os.environ["TEST_USERNAME"] = "test_user" + os.environ["TEST_KEY"] = "test_key" + assert conf["credentials"]["user"]["name"] == "test_user" + assert conf["credentials"]["user"]["key"] == "test_key" + + @use_config_dir + @use_catalog_env_variable_yml + def test_env_resolver_not_used_for_catalog(self, tmp_path): + """Check that the oc.env resolver is not used for catalog loading""" + conf = OmegaConfigLoader(str(tmp_path)) + os.environ["TEST_DATASET"] = "test_dataset" + with pytest.raises(errors.UnsupportedInterpolationType): + conf["catalog"]["test"]["file_path"] + + @use_config_dir + @use_credentials_env_variable_yml + def test_env_resolver_is_cleared_after_loading(self, tmp_path): + """Check that the ``oc.env`` resolver is cleared after loading credentials + in the case that it was not registered beforehand.""" + conf = OmegaConfigLoader(str(tmp_path)) + os.environ["TEST_USERNAME"] = "test_user" + 
os.environ["TEST_KEY"] = "test_key" + assert conf["credentials"]["user"]["name"] == "test_user" + assert not OmegaConf.has_resolver("oc.env") + + @use_config_dir + @use_credentials_env_variable_yml + def test_env_resolver_is_registered_after_loading(self, tmp_path): + """Check that the ``oc.env`` resolver is registered after loading credentials + in the case that it was registered beforehand""" + conf = OmegaConfigLoader(str(tmp_path)) + OmegaConf.register_new_resolver("oc.env", oc.env) + os.environ["TEST_USERNAME"] = "test_user" + os.environ["TEST_KEY"] = "test_key" + assert conf["credentials"]["user"]["name"] == "test_user" + assert OmegaConf.has_resolver("oc.env") + OmegaConf.clear_resolver("oc.env") + + @use_config_dir + def test_load_config_from_tar_file(self, tmp_path): + subprocess.run( # pylint: disable=subprocess-run-check + [ + "tar", + "--exclude=local/*.yml", + "-czf", + f"{tmp_path}/tar_conf.tar.gz", + f"--directory={str(tmp_path.parent)}", + f"{tmp_path.name}", + ] + ) + + conf = OmegaConfigLoader(conf_source=f"{tmp_path}/tar_conf.tar.gz") + catalog = conf["catalog"] + assert catalog["trains"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_load_config_from_zip_file(self, tmp_path): + def zipdir(path, ziph): + # This is a helper method to zip up a directory without keeping the complete directory + # structure with all parent paths. + # ziph is zipfile handle + for root, _, files in os.walk(path): + for file in files: + ziph.write( + os.path.join(root, file), + os.path.relpath( + os.path.join(root, file), os.path.join(path, "..") + ), + ) + + with zipfile.ZipFile( + f"{tmp_path}/Python.zip", "w", zipfile.ZIP_DEFLATED + ) as zipf: + zipdir(tmp_path, zipf) + + conf = OmegaConfigLoader(conf_source=f"{tmp_path}/Python.zip") + catalog = conf["catalog"] + assert catalog["trains"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_variable_interpolation_with_correct_env(self, tmp_path): + """Make sure the parameters is interpolated with the correct environment""" + conf = OmegaConfigLoader(str(tmp_path)) + params = conf["parameters"] + # Making sure it is not override by local/parameters_global.yml + assert params["interpolated_param"] == "base" + + @use_config_dir + def test_runtime_params_override_interpolated_value(self, tmp_path): + """Make sure interpolated value is updated correctly with runtime_params""" + conf = OmegaConfigLoader(str(tmp_path), runtime_params={"test_env": "dummy"}) + params = conf["parameters"] + assert params["interpolated_param"] == "dummy" + + @use_config_dir + @use_credentials_env_variable_yml + def test_runtime_params_not_propogate_non_parameters_config(self, tmp_path): + """Make sure `catalog`, `credentials`, `logging` or any config other than + `parameters` are not updated by `runtime_params`.""" + # https://github.com/kedro-org/kedro/pull/2467 + key = "test_env" + runtime_params = {key: "dummy"} + conf = OmegaConfigLoader( + str(tmp_path), + config_patterns={"spark": ["spark*", "spark*/**", "**/spark*"]}, + runtime_params=runtime_params, + ) + parameters = conf["parameters"] + catalog = conf["catalog"] + credentials = conf["credentials"] + spark = conf["spark"] + + assert key in parameters + assert key not in catalog + assert key not in credentials + assert key not in spark + + def test_ignore_hidden_keys(self, tmp_path): + """Check that the config key starting with `_` are ignored and also + don't cause a config merge error""" + _write_yaml(tmp_path / _BASE_ENV / "catalog1.yml", {"k1": "v1", "_k2": "v2"}) + _write_yaml(tmp_path / 
_BASE_ENV / "catalog2.yml", {"k3": "v3", "_k2": "v4"}) + + conf = OmegaConfigLoader(str(tmp_path)) + conf.default_run_env = "" + catalog = conf["catalog"] + assert catalog.keys() == {"k1", "k3"} + + _write_yaml(tmp_path / _BASE_ENV / "catalog3.yml", {"k1": "dup", "_k2": "v5"}) + pattern = ( + r"Duplicate keys found in " + r"(.*catalog1\.yml and .*catalog3\.yml|.*catalog3\.yml and .*catalog1\.yml)" + r"\: k1" + ) + with pytest.raises(ValueError, match=pattern): + conf["catalog"] + + def test_variable_interpolation_in_catalog_with_templates(self, tmp_path): + base_catalog = tmp_path / _BASE_ENV / "catalog.yml" + catalog_config = { + "companies": { + "type": "${_pandas.type}", + "filepath": "data/01_raw/companies.csv", + }, + "_pandas": {"type": "pandas.CSVDataSet"}, + } + _write_yaml(base_catalog, catalog_config) + + conf = OmegaConfigLoader(str(tmp_path)) + conf.default_run_env = "" + assert conf["catalog"]["companies"]["type"] == "pandas.CSVDataSet" + + def test_variable_interpolation_in_catalog_with_separate_templates_file( + self, tmp_path + ): + base_catalog = tmp_path / _BASE_ENV / "catalog.yml" + catalog_config = { + "companies": { + "type": "${_pandas.type}", + "filepath": "data/01_raw/companies.csv", + } + } + tmp_catalog = tmp_path / _BASE_ENV / "catalog_temp.yml" + template = {"_pandas": {"type": "pandas.CSVDataSet"}} + _write_yaml(base_catalog, catalog_config) + _write_yaml(tmp_catalog, template) + + conf = OmegaConfigLoader(str(tmp_path)) + conf.default_run_env = "" + assert conf["catalog"]["companies"]["type"] == "pandas.CSVDataSet" diff --git a/tests/config/test_templated_config.py b/tests/config/test_templated_config.py index a7bdd83e07..9a8edbd0d4 100644 --- a/tests/config/test_templated_config.py +++ b/tests/config/test_templated_config.py @@ -1,33 +1,6 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
+from __future__ import annotations from pathlib import Path -from typing import Dict import pytest import yaml @@ -39,7 +12,7 @@ _BASE_ENV = "base" -def _write_yaml(filepath: Path, config: Dict): +def _write_yaml(filepath: Path, config: dict): filepath.parent.mkdir(parents=True, exist_ok=True) yaml_str = yaml.dump(config) filepath.write_text(yaml_str) @@ -235,6 +208,15 @@ def proj_catalog_param_with_default(tmp_path, param_config_with_default): class TestTemplatedConfigLoader: + @pytest.mark.usefixtures("proj_catalog_param") + def test_get_catalog_config_with_dict_get(self, tmp_path, template_config): + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=template_config + ) + config_loader.default_run_env = "" + catalog = config_loader["catalog"] + assert catalog["boats"]["type"] == "SparkDataSet" + @pytest.mark.usefixtures("proj_catalog_param") def test_catalog_parameterized_w_dict(self, tmp_path, template_config): """Test parameterized config with input from dictionary with values""" @@ -281,7 +263,7 @@ def test_catalog_parameterized_no_params_no_default(self, tmp_path): @pytest.mark.usefixtures("proj_catalog_param_with_default") def test_catalog_parameterized_empty_params_with_default(self, tmp_path): """Test parameterized config with empty globals dictionary""" - config_loader = TemplatedConfigLoader(str(tmp_path), globals_dict=dict()) + config_loader = TemplatedConfigLoader(str(tmp_path), globals_dict={}) config_loader.default_run_env = "" catalog = config_loader.get("catalog*.yml") @@ -395,6 +377,33 @@ def test_catalog_with_jinja2_syntax(self, tmp_path, template_config): } assert catalog == expected_catalog + @pytest.mark.usefixtures("proj_catalog_globals", "catalog_with_jinja2_syntax") + def test_catalog_with_jinja2_syntax_and_globals_file(self, tmp_path): + """Test catalog with jinja2 syntax with globals yaml file""" + proj_catalog = tmp_path / _DEFAULT_RUN_ENV / "catalog.yml" + _write_yaml(proj_catalog, {}) + config_loader = TemplatedConfigLoader( + str(tmp_path), + globals_pattern="*globals.yml", + ) + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*.yml") + expected_catalog = { + "fast-trains": {"type": "MemoryDataSet"}, + "fast-cars": { + "type": "pandas.CSVDataSet", + "filepath": "s3a://boat-and-car-bucket/fast-cars.csv", + "save_args": {"index": True}, + }, + "slow-trains": {"type": "MemoryDataSet"}, + "slow-cars": { + "type": "pandas.CSVDataSet", + "filepath": "s3a://boat-and-car-bucket/slow-cars.csv", + "save_args": {"index": True}, + }, + } + assert catalog == expected_catalog + class TestFormatObject: @pytest.mark.parametrize( @@ -432,7 +441,7 @@ class TestFormatObject: (["${a}", "X${a}"], {"a": "A"}, ["A", "XA"]), (["${b|D}"], {"a": "A"}, ["D"]), (["${b|abcDEF_.<>/@$%^&!}"], {"a": "A"}, ["abcDEF_.<>/@$%^&!"]), - # Dicts + # dicts ({"key": "${a}"}, {"a": "A"}, {"key": "A"}), ({"${a}": "value"}, {"a": "A"}, {"A": "value"}), ({"${a|D}": "value"}, {}, {"D": "value"}), @@ -460,3 +469,37 @@ def test_simple_replace(self, val, format_dict, expected): def test_raises_error(self, val, format_dict, expected_error_message): with pytest.raises(ValueError, match=expected_error_message): _format_object(val, format_dict) + + def test_customised_patterns(self, tmp_path): + config_loader = TemplatedConfigLoader( + str(tmp_path), + config_patterns={"spark": ["spark*/"]}, + ) + assert config_loader.config_patterns["catalog"] == [ + "catalog*", + "catalog*/**", + "**/catalog*", + ] + assert config_loader.config_patterns["spark"] == ["spark*/"] + + 
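The format-object cases and the globals tests above show the "${...}" placeholders that TemplatedConfigLoader resolves. A minimal sketch of that substitution, with a hypothetical globals dictionary and conf directory, is:

    # Sketch of the "${...}" substitution tested above; the globals values are hypothetical.
    from pathlib import Path

    import yaml

    from kedro.config import TemplatedConfigLoader

    conf_dir = Path("conf")
    (conf_dir / "base").mkdir(parents=True, exist_ok=True)
    (conf_dir / "base" / "catalog.yml").write_text(
        yaml.dump({"boats": {"type": "${dataset_type}", "filepath": "${bucket}/boats.csv"}})
    )

    loader = TemplatedConfigLoader(
        str(conf_dir),
        globals_dict={"dataset_type": "SparkDataSet", "bucket": "s3a://boat-and-car-bucket"},
    )
    loader.default_run_env = ""
    catalog = loader["catalog"]
    assert catalog["boats"]["type"] == "SparkDataSet"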
@pytest.mark.usefixtures("proj_catalog_param") + def test_adding_extra_keys_to_confloader(self, tmp_path, template_config): + """Make sure extra keys can be added directly to the config loader instance.""" + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=template_config + ) + config_loader.default_run_env = "" + catalog = config_loader["catalog"] + config_loader["spark"] = {"spark_config": "emr.blabla"} + + assert catalog["boats"]["type"] == "SparkDataSet" + assert config_loader["spark"] == {"spark_config": "emr.blabla"} + + @pytest.mark.usefixtures("proj_catalog_param") + def test_bypass_catalog_config_loading(self, tmp_path): + """Make sure core config loading can be bypassed by setting the key and values + directly on the config loader instance.""" + conf = TemplatedConfigLoader(str(tmp_path)) + conf["catalog"] = {"catalog_config": "something_new"} + + assert conf["catalog"] == {"catalog_config": "something_new"} diff --git a/tests/conftest.py b/tests/conftest.py index 6057d22d6e..255cd9c395 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """ This file contains the fixtures that are reusable by any tests within this directory. You don't need to import the fixtures as pytest will diff --git a/tests/extras/datasets/api/test_api_dataset.py b/tests/extras/datasets/api/test_api_dataset.py deleted file mode 100644 index aca092e271..0000000000 --- a/tests/extras/datasets/api/test_api_dataset.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint: disable=no-member -import json -import socket - -import pytest -import requests -import requests_mock - -from kedro.extras.datasets.api import APIDataSet -from kedro.io.core import DataSetError - -POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] - -TEST_URL = "http://example.com/api/test" -TEST_TEXT_RESPONSE_DATA = "This is a response." -TEST_JSON_RESPONSE_DATA = [{"key": "value"}] - -TEST_PARAMS = {"param": "value"} -TEST_URL_WITH_PARAMS = TEST_URL + "?param=value" - -TEST_HEADERS = {"key": "value"} - - -@pytest.mark.parametrize("method", POSSIBLE_METHODS) -class TestAPIDataSet: - @pytest.fixture - def requests_mocker(self): - with requests_mock.Mocker() as mock: - yield mock - - def test_successfully_load_with_response(self, requests_mocker, method): - api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS - ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text=TEST_TEXT_RESPONSE_DATA, - ) - - response = api_data_set.load() - assert isinstance(response, requests.Response) - assert response.text == TEST_TEXT_RESPONSE_DATA - - def test_successful_json_load_with_response(self, requests_mocker, method): - api_data_set = APIDataSet( - url=TEST_URL, - method=method, - json=TEST_JSON_RESPONSE_DATA, - headers=TEST_HEADERS, - ) - requests_mocker.register_uri( - method, - TEST_URL, - headers=TEST_HEADERS, - text=json.dumps(TEST_JSON_RESPONSE_DATA), - ) - - response = api_data_set.load() - assert isinstance(response, requests.Response) - assert response.json() == TEST_JSON_RESPONSE_DATA - - def test_http_error(self, requests_mocker, method): - api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS - ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text="Nope, not found", - status_code=requests.codes.FORBIDDEN, - ) - - with pytest.raises(DataSetError, match="Failed to fetch data"): - api_data_set.load() - - def test_socket_error(self, requests_mocker, method): - api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS - ) - requests_mocker.register_uri(method, TEST_URL_WITH_PARAMS, exc=socket.error) - - with pytest.raises(DataSetError, match="Failed to connect"): - api_data_set.load() - - def test_read_only_mode(self, method): - """ - Saving is disabled on the data set. 
- """ - api_data_set = APIDataSet(url=TEST_URL, method=method) - with pytest.raises(DataSetError, match="is a read only data set type"): - api_data_set.save({}) - - def test_exists_http_error(self, requests_mocker, method): - """ - In case of an unexpected HTTP error, - ``exists()`` should not silently catch it. - """ - api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS - ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text="Nope, not found", - status_code=requests.codes.FORBIDDEN, - ) - with pytest.raises(DataSetError, match="Failed to fetch data"): - api_data_set.exists() - - def test_exists_ok(self, requests_mocker, method): - """ - If the file actually exists and server responds 200, - ``exists()`` should return True - """ - api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS - ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text=TEST_TEXT_RESPONSE_DATA, - ) - - assert api_data_set.exists() diff --git a/tests/extras/datasets/bioinformatics/test_biosequence_dataset.py b/tests/extras/datasets/bioinformatics/test_biosequence_dataset.py index f0818df038..b26271cb36 100644 --- a/tests/extras/datasets/bioinformatics/test_biosequence_dataset.py +++ b/tests/extras/datasets/bioinformatics/test_biosequence_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- from io import StringIO from pathlib import PurePosixPath @@ -37,7 +9,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.biosequence import BioSequenceDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER LOAD_ARGS = {"format": "fasta"} @@ -105,7 +77,7 @@ def test_open_extra_args(self, biosequence_data_set, fs_args): def test_load_missing_file(self, biosequence_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set BioSequenceDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): biosequence_data_set.load() @pytest.mark.parametrize( diff --git a/tests/extras/datasets/conftest.py b/tests/extras/datasets/conftest.py index 3504600cb0..b9fddb3f88 100644 --- a/tests/extras/datasets/conftest.py +++ b/tests/extras/datasets/conftest.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """ This file contains the fixtures that are reusable by any tests within this directory. You don't need to import the fixtures as pytest will diff --git a/tests/extras/datasets/dask/test_parquet_dataset.py b/tests/extras/datasets/dask/test_parquet_dataset.py index 9c45d66df9..597d8c40a4 100644 --- a/tests/extras/datasets/dask/test_parquet_dataset.py +++ b/tests/extras/datasets/dask/test_parquet_dataset.py @@ -1,32 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - - import boto3 import dask.dataframe as dd import pandas as pd @@ -38,7 +9,7 @@ from s3fs import S3FileSystem from kedro.extras.datasets.dask import ParquetDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError FILE_NAME = "test.parquet" BUCKET_NAME = "test_bucket" @@ -105,7 +76,7 @@ class TestParquetDataSet: def test_incorrect_credentials_load(self): """Test that incorrect credential keys won't instantiate dataset.""" pattern = r"unexpected keyword argument" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): ParquetDataSet( filepath=S3_PATH, credentials={ @@ -117,7 +88,7 @@ def test_incorrect_credentials_load(self): def test_empty_credentials_load(self, bad_credentials): parquet_data_set = ParquetDataSet(filepath=S3_PATH, credentials=bad_credentials) pattern = r"Failed while loading data from data set ParquetDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): parquet_data_set.load().compute() def test_pass_credentials(self, mocker): @@ -126,7 +97,7 @@ def test_pass_credentials(self, mocker): client_mock = mocker.patch("botocore.session.Session.create_client") s3_data_set = ParquetDataSet(filepath=S3_PATH, credentials=AWS_CREDENTIALS) pattern = r"Failed while loading data from data set ParquetDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): s3_data_set.load().compute() assert client_mock.call_count == 1 @@ -184,8 +155,69 @@ def test_load_extra_params(self, s3_data_set, load_args): ) def test_save_extra_params(self, s3_data_set, save_args): """Test overriding the default save arguments.""" + s3_data_set._process_schema() + assert s3_data_set._save_args.get("schema") is None + for key, value in save_args.items(): assert s3_data_set._save_args[key] == value for key, value in s3_data_set.DEFAULT_SAVE_ARGS.items(): assert s3_data_set._save_args[key] == value + + @pytest.mark.parametrize( + "save_args", + [{"schema": {"col1": "[[int64]]", "col2": "string"}}], + indirect=True, + ) + def test_save_extra_params_schema_dict(self, s3_data_set, save_args): + """Test setting the schema as dictionary of pyarrow column types + in save arguments.""" + + for key, value in save_args["schema"].items(): + assert s3_data_set._save_args["schema"][key] == value + + s3_data_set._process_schema() + + for field in s3_data_set._save_args["schema"].values(): + assert isinstance(field, pa.DataType) + + @pytest.mark.parametrize( + "save_args", + [ + { + "schema": { + "col1": "[[int64]]", 
+ "col2": "string", + "col3": float, + "col4": pa.int64(), + } + } + ], + indirect=True, + ) + def test_save_extra_params_schema_dict_mixed_types(self, s3_data_set, save_args): + """Test setting the schema as dictionary of mixed value types + in save arguments.""" + + for key, value in save_args["schema"].items(): + assert s3_data_set._save_args["schema"][key] == value + + s3_data_set._process_schema() + + for field in s3_data_set._save_args["schema"].values(): + assert isinstance(field, pa.DataType) + + @pytest.mark.parametrize( + "save_args", + [{"schema": "c1:[int64],c2:int64"}], + indirect=True, + ) + def test_save_extra_params_schema_str_schema_fields(self, s3_data_set, save_args): + """Test setting the schema as string pyarrow schema (list of fields) + in save arguments.""" + + assert s3_data_set._save_args["schema"] == save_args["schema"] + + s3_data_set._process_schema() + + assert isinstance(s3_data_set._save_args["schema"], pa.Schema) diff --git a/tests/extras/datasets/email/test_message_dataset.py b/tests/extras/datasets/email/test_message_dataset.py index 1eb55c1615..9eab39be4d 100644 --- a/tests/extras/datasets/email/test_message_dataset.py +++ b/tests/extras/datasets/email/test_message_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- from email.message import EmailMessage from email.policy import default from pathlib import Path, PurePosixPath @@ -37,7 +9,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.email import EmailMessageDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version @@ -121,7 +93,7 @@ def test_open_extra_args(self, message_data_set, fs_args): def test_load_missing_file(self, message_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set EmailMessageDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): message_data_set.load() @pytest.mark.parametrize( @@ -186,7 +158,7 @@ def test_save_and_load(self, versioned_message_data_set, dummy_msg): def test_no_versions(self, versioned_message_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for EmailMessageDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_message_data_set.load() def test_exists(self, versioned_message_data_set, dummy_msg): @@ -200,10 +172,10 @@ def test_prevent_overwrite(self, versioned_message_data_set, dummy_msg): corresponding text file for a given save version already exists.""" versioned_message_data_set.save(dummy_msg) pattern = ( - r"Save path \`.+\` for EmailMessageDataSet\(.+\) must " + r"Save path \'.+\' for EmailMessageDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_message_data_set.save(dummy_msg) @pytest.mark.parametrize( @@ -218,17 +190,17 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - f"Save version `{save_version}` did not match " - f"load version `{load_version}` for " + f"Save version '{save_version}' did not match " + f"load version '{load_version}' for " r"EmailMessageDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_message_data_set.save(dummy_msg) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): EmailMessageDataSet( filepath="https://example.com/file", version=Version(None, None) ) @@ -245,7 +217,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_message_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_message_data_set.save(dummy_msg) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/geojson/test_geojson_dataset.py b/tests/extras/datasets/geojson/test_geojson_dataset.py index 83f6df96e4..5a2669964c 100644 --- a/tests/extras/datasets/geojson/test_geojson_dataset.py +++ b/tests/extras/datasets/geojson/test_geojson_dataset.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. from pathlib import Path, PurePosixPath import geopandas as gpd @@ -37,7 +10,7 @@ from shapely.geometry import Point from kedro.extras.datasets.geopandas import GeoJSONDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp @@ -101,7 +74,7 @@ def test_save_and_load(self, geojson_data_set, dummy_dataframe): def test_load_missing_file(self, geojson_data_set): """Check the error while trying to load from missing source.""" pattern = r"Failed while loading data from data set GeoJSONDataSet" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): geojson_data_set.load() def test_exists(self, geojson_data_set, dummy_dataframe): @@ -192,7 +165,7 @@ def test_save_and_load(self, versioned_geojson_data_set, dummy_dataframe): def test_no_versions(self, versioned_geojson_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for GeoJSONDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_geojson_data_set.load() def test_exists(self, versioned_geojson_data_set, dummy_dataframe): @@ -206,10 +179,10 @@ def test_prevent_override(self, versioned_geojson_data_set, dummy_dataframe): version.""" versioned_geojson_data_set.save(dummy_dataframe) pattern = ( - r"Save path \`.+\` for GeoJSONDataSet\(.+\) must not " + r"Save path \'.+\' for GeoJSONDataSet\(.+\) must not " r"exist if versioning is enabled" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_geojson_data_set.save(dummy_dataframe) @pytest.mark.parametrize( @@ -224,16 +197,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for GeoJSONDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for GeoJSONDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_geojson_data_set.save(dummy_dataframe) def test_http_filesystem_no_versioning(self): - pattern = 
r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): GeoJSONDataSet( filepath="https://example/file.geojson", version=Version(None, None) ) @@ -250,7 +223,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_geojson_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_geojson_data_set.save(dummy_dataframe) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/holoviews/test_holoviews_writer.py b/tests/extras/datasets/holoviews/test_holoviews_writer.py index 8257648217..24fb7f6c0f 100644 --- a/tests/extras/datasets/holoviews/test_holoviews_writer.py +++ b/tests/extras/datasets/holoviews/test_holoviews_writer.py @@ -1,31 +1,4 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- +import sys from pathlib import Path, PurePosixPath import holoviews as hv @@ -37,7 +10,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.holoviews import HoloviewsWriter -from kedro.io import DataSetError, Version +from kedro.io import DatasetError, Version from kedro.io.core import PROTOCOL_DELIMITER @@ -61,6 +34,10 @@ def versioned_hv_writer(filepath_png, load_version, save_version): return HoloviewsWriter(filepath_png, version=Version(load_version, save_version)) +@pytest.mark.skipif( + sys.version_info.minor == 10, + reason="Python 3.10 needs matplotlib>=3.5 which breaks holoviews.", +) class TestHoloviewsWriter: def test_save_data(self, tmp_path, dummy_hv_object, hv_writer): """Test saving Holoviews object.""" @@ -91,8 +68,8 @@ def test_open_extra_args(self, tmp_path, fs_args, mocker): assert writer._fs_open_args_save == fs_args["open_args_save"] def test_load_fail(self, hv_writer): - pattern = r"Loading not supported for `HoloviewsWriter`" - with pytest.raises(DataSetError, match=pattern): + pattern = r"Loading not supported for 'HoloviewsWriter'" + with pytest.raises(DatasetError, match=pattern): hv_writer.load() def test_exists(self, dummy_hv_object, hv_writer): @@ -140,6 +117,10 @@ def test_protocol_usage(self, filepath, instance_type, credentials): assert isinstance(data_set._filepath, PurePosixPath) +@pytest.mark.skipif( + sys.version_info.minor == 10, + reason="Python 3.10 needs matplotlib>=3.5 which breaks holoviews.", +) class TestHoloviewsWriterVersioned: def test_version_str_repr(self, hv_writer, versioned_hv_writer): """Test that version is in string representation of the class instance @@ -161,10 +142,10 @@ def test_prevent_overwrite(self, dummy_hv_object, versioned_hv_writer): corresponding file for a given save version already exists.""" versioned_hv_writer.save(dummy_hv_object) pattern = ( - r"Save path \`.+\` for HoloviewsWriter\(.+\) must " + r"Save path \'.+\' for HoloviewsWriter\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_hv_writer.save(dummy_hv_object) @pytest.mark.parametrize( @@ -179,24 +160,26 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - fr"Save version `{save_version}` did not match load version " - fr"`{load_version}` for HoloviewsWriter\(.+\)" + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for HoloviewsWriter\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_hv_writer.save(dummy_hv_object) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." 
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): HoloviewsWriter( filepath="https://example.com/file.png", version=Version(None, None) ) - def test_no_versions(self, versioned_hv_writer): + def test_load_not_supported(self, versioned_hv_writer): """Check the error if no versions are available for load.""" - pattern = r"Did not find any versions for HoloviewsWriter\(.+\)" - with pytest.raises(DataSetError, match=pattern): + pattern = ( + rf"Loading not supported for '{versioned_hv_writer.__class__.__name__}'" + ) + with pytest.raises(DatasetError, match=pattern): versioned_hv_writer.load() def test_exists(self, versioned_hv_writer, dummy_hv_object): @@ -228,7 +211,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_hv_writer._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_hv_writer.save(dummy_hv_object) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/json/test_json_dataset.py b/tests/extras/datasets/json/test_json_dataset.py index 6e74d8cdda..531fd007b7 100644 --- a/tests/extras/datasets/json/test_json_dataset.py +++ b/tests/extras/datasets/json/test_json_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
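The `@pytest.mark.skipif(sys.version_info.minor == 10, ...)` markers added above sit on `TestHoloviewsWriter` and `TestHoloviewsWriterVersioned` as class-level decorators, so every test collected from those classes is skipped on Python 3.10 in one place. A minimal sketch of the same mechanism, with an invented class and reason:

import sys
import pytest

@pytest.mark.skipif(
    sys.version_info.minor == 10,
    reason="Example: dependency X has no Python 3.10-compatible release yet.",
)
class TestNeedsOlderPython:
    # A class-level marker applies to every test method collected from the class.
    def test_one(self):
        assert sys.version_info.minor != 10

    def test_two(self):
        assert True

Note that the condition checks only the minor version, exactly as in the markers above; a range check such as `sys.version_info >= (3, 10)` would also cover later releases.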
- from pathlib import Path, PurePosixPath import pytest @@ -35,7 +7,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.json import JSONDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version @@ -97,7 +69,7 @@ def test_open_extra_args(self, json_data_set, fs_args): def test_load_missing_file(self, json_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set JSONDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): json_data_set.load() @pytest.mark.parametrize( @@ -160,7 +132,7 @@ def test_save_and_load(self, versioned_json_data_set, dummy_data): def test_no_versions(self, versioned_json_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for JSONDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_json_data_set.load() def test_exists(self, versioned_json_data_set, dummy_data): @@ -174,10 +146,10 @@ def test_prevent_overwrite(self, versioned_json_data_set, dummy_data): corresponding json file for a given save version already exists.""" versioned_json_data_set.save(dummy_data) pattern = ( - r"Save path \`.+\` for JSONDataSet\(.+\) must " + r"Save path \'.+\' for JSONDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_json_data_set.save(dummy_data) @pytest.mark.parametrize( @@ -192,17 +164,17 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - f"Save version `{save_version}` did not match " - f"load version `{load_version}` for " + f"Save version '{save_version}' did not match " + f"load version '{load_version}' for " r"JSONDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_json_data_set.save(dummy_data) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." 
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): JSONDataSet( filepath="https://example.com/file.json", version=Version(None, None) ) @@ -219,7 +191,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_json_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_json_data_set.save(dummy_data) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/api/__init__.py b/tests/extras/datasets/libsvm/__init__.py similarity index 100% rename from tests/extras/datasets/api/__init__.py rename to tests/extras/datasets/libsvm/__init__.py diff --git a/tests/extras/datasets/libsvm/test_svmlight_dataset.py b/tests/extras/datasets/libsvm/test_svmlight_dataset.py new file mode 100644 index 0000000000..52bfba394d --- /dev/null +++ b/tests/extras/datasets/libsvm/test_svmlight_dataset.py @@ -0,0 +1,214 @@ +from pathlib import Path, PurePosixPath + +import numpy as np +import pytest +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.svmlight import SVMLightDataSet +from kedro.io import DatasetError +from kedro.io.core import PROTOCOL_DELIMITER, Version + + +@pytest.fixture +def filepath_svm(tmp_path): + return (tmp_path / "test.svm").as_posix() + + +@pytest.fixture +def svm_data_set(filepath_svm, save_args, load_args, fs_args): + return SVMLightDataSet( + filepath=filepath_svm, save_args=save_args, load_args=load_args, fs_args=fs_args + ) + + +@pytest.fixture +def versioned_svm_data_set(filepath_svm, load_version, save_version): + return SVMLightDataSet( + filepath=filepath_svm, version=Version(load_version, save_version) + ) + + +@pytest.fixture +def dummy_data(): + features = np.array([[1, 2, 10], [1, 0.4, 3.2], [0, 0, 0]]) + label = np.array([1, 0, 3]) + return features, label + + +class TestSVMLightDataSet: + def test_save_and_load(self, svm_data_set, dummy_data): + """Test saving and reloading the data set.""" + svm_data_set.save(dummy_data) + reloaded_features, reloaded_label = svm_data_set.load() + original_features, original_label = dummy_data + assert (original_features == reloaded_features).all() + assert (original_label == reloaded_label).all() + assert svm_data_set._fs_open_args_load == {"mode": "rb"} + assert svm_data_set._fs_open_args_save == {"mode": "wb"} + + def test_exists(self, svm_data_set, dummy_data): + """Test `exists` method invocation for both existing and + nonexistent data set.""" + assert not svm_data_set.exists() + svm_data_set.save(dummy_data) + assert svm_data_set.exists() + + @pytest.mark.parametrize( + "save_args", [{"zero_based": False, "comment": "comment"}], indirect=True + ) + def test_save_extra_save_args(self, svm_data_set, save_args): + """Test overriding the default save arguments.""" + for key, value in save_args.items(): + assert svm_data_set._save_args[key] == value + + @pytest.mark.parametrize( + "load_args", [{"zero_based": False, "n_features": 3}], indirect=True + ) + def test_save_extra_load_args(self, svm_data_set, load_args): + """Test overriding the default load arguments.""" + for key, value in load_args.items(): + assert svm_data_set._load_args[key] == value + + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"mode": "rb", "compression": 
"gzip"}}], + indirect=True, + ) + def test_open_extra_args(self, svm_data_set, fs_args): + assert svm_data_set._fs_open_args_load == fs_args["open_args_load"] + assert svm_data_set._fs_open_args_save == {"mode": "wb"} # default unchanged + + def test_load_missing_file(self, svm_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set SVMLightDataSet\(.*\)" + with pytest.raises(DatasetError, match=pattern): + svm_data_set.load() + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/file.svm", S3FileSystem), + ("file:///tmp/test.svm", LocalFileSystem), + ("/tmp/test.svm", LocalFileSystem), + ("gcs://bucket/file.svm", GCSFileSystem), + ("https://example.com/file.svm", HTTPFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + data_set = SVMLightDataSet(filepath=filepath) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.svm" + data_set = SVMLightDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestSVMLightDataSetVersioned: + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.svm" + ds = SVMLightDataSet(filepath=filepath) + ds_versioned = SVMLightDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "SVMLightDataSet" in str(ds_versioned) + assert "SVMLightDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + + def test_save_and_load(self, versioned_svm_data_set, dummy_data): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_svm_data_set.save(dummy_data) + reloaded_features, reloaded_label = versioned_svm_data_set.load() + original_features, original_label = dummy_data + assert (original_features == reloaded_features).all() + assert (original_label == reloaded_label).all() + + def test_no_versions(self, versioned_svm_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for SVMLightDataSet\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_svm_data_set.load() + + def test_exists(self, versioned_svm_data_set, dummy_data): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_svm_data_set.exists() + versioned_svm_data_set.save(dummy_data) + assert versioned_svm_data_set.exists() + + def test_prevent_overwrite(self, versioned_svm_data_set, dummy_data): + """Check the error when attempting to override the data set if the + corresponding json file for a given save version already exists.""" + versioned_svm_data_set.save(dummy_data) + pattern = ( + r"Save path \'.+\' for SVMLightDataSet\(.+\) must " + r"not exist if versioning is enabled\." 
+ ) + with pytest.raises(DatasetError, match=pattern): + versioned_svm_data_set.save(dummy_data) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_svm_data_set, load_version, save_version, dummy_data + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + f"Save version '{save_version}' did not match " + f"load version '{load_version}' for " + r"SVMLightDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_svm_data_set.save(dummy_data) + + def test_http_filesystem_no_versioning(self): + pattern = "Versioning is not supported for HTTP protocols." + + with pytest.raises(DatasetError, match=pattern): + SVMLightDataSet( + filepath="https://example.com/file.svm", version=Version(None, None) + ) + + def test_versioning_existing_dataset( + self, svm_data_set, versioned_svm_data_set, dummy_data + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + svm_data_set.save(dummy_data) + assert svm_data_set.exists() + assert svm_data_set._filepath == versioned_svm_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_svm_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_svm_data_set.save(dummy_data) + + # Remove non-versioned dataset and try again + Path(svm_data_set._filepath.as_posix()).unlink() + versioned_svm_data_set.save(dummy_data) + assert versioned_svm_data_set.exists() diff --git a/tests/extras/datasets/matplotlib/test_matplotlib_writer.py b/tests/extras/datasets/matplotlib/test_matplotlib_writer.py index aae1155802..e6ee5be83b 100644 --- a/tests/extras/datasets/matplotlib/test_matplotlib_writer.py +++ b/tests/extras/datasets/matplotlib/test_matplotlib_writer.py @@ -1,32 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
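The save arguments (`zero_based`, `comment`) and load arguments (`zero_based`, `n_features`) exercised by the new `SVMLightDataSet` tests mirror the signatures of scikit-learn's `dump_svmlight_file` and `load_svmlight_file`, which suggests the dataset delegates to them. A standalone roundtrip in that format, independent of Kedro and assuming scikit-learn is installed, might look like:

import tempfile
from pathlib import Path

import numpy as np
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

features = np.array([[1, 2, 10], [1, 0.4, 3.2], [0, 0, 0]])
label = np.array([1, 0, 3])

with tempfile.TemporaryDirectory() as tmp_dir:
    svm_path = str(Path(tmp_dir) / "example.svm")

    # Write the feature matrix and labels in svmlight/libsvm text format.
    dump_svmlight_file(features, label, svm_path, zero_based=False, comment="example data")

    # Reading it back yields a sparse matrix plus the label vector.
    loaded_features, loaded_label = load_svmlight_file(
        svm_path, n_features=3, zero_based=False
    )

assert (loaded_features.toarray() == features).all()
assert (loaded_label == label).all()

This matches the equality checks in `test_save_and_load` above, which compare the reloaded features and labels element-wise against the originals.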
- - import json from pathlib import Path @@ -38,7 +9,7 @@ from s3fs import S3FileSystem from kedro.extras.datasets.matplotlib import MatplotlibWriter -from kedro.io import DataSetError, Version +from kedro.io import DatasetError, Version BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "testing", "secret": "testing"} @@ -126,15 +97,21 @@ def s3fs_cleanup(): S3FileSystem.cachable = False +@pytest.fixture(params=[False]) +def overwrite(request): + return request.param + + @pytest.fixture def plot_writer( - mocked_s3_bucket, fs_args, save_args + mocked_s3_bucket, fs_args, save_args, overwrite ): # pylint: disable=unused-argument return MatplotlibWriter( filepath=FULL_PATH, credentials=AWS_CREDENTIALS, fs_args=fs_args, save_args=save_args, + overwrite=overwrite, ) @@ -205,8 +182,30 @@ def test_dict_save(self, tmp_path, mock_dict_plot, plot_writer, mocked_s3_bucket assert actual_filepath.read_bytes() == download_path.read_bytes() + @pytest.mark.parametrize( + "overwrite,expected_num_plots", [(False, 8), (True, 3)], indirect=["overwrite"] + ) + def test_overwrite( + self, + mock_list_plot, + mock_dict_plot, + plot_writer, + mocked_s3_bucket, + expected_num_plots, + ): + """Test saving dictionary of plots after list of plots to S3.""" + + plot_writer.save(mock_list_plot) + plot_writer.save(mock_dict_plot) + + response = mocked_s3_bucket.list_objects(Bucket=BUCKET_NAME) + saved_plots = {obj["Key"] for obj in response["Contents"]} + + assert {f"{KEY_PATH}/{colour}" for colour in COLOUR_LIST} <= saved_plots + assert len(saved_plots) == expected_num_plots + def test_fs_args(self, tmp_path, mock_single_plot, mocked_encrypted_s3_bucket): - """Test writing to encrypted bucket""" + """Test writing to encrypted bucket.""" normal_encryped_writer = MatplotlibWriter( fs_args={"s3_additional_kwargs": {"ServerSideEncryption": "AES256"}}, filepath=FULL_PATH, @@ -235,8 +234,8 @@ def test_open_extra_args(self, plot_writer, fs_args): assert plot_writer._fs_open_args_save == fs_args["open_args_save"] def test_load_fail(self, plot_writer): - pattern = r"Loading not supported for `MatplotlibWriter`" - with pytest.raises(DataSetError, match=pattern): + pattern = r"Loading not supported for 'MatplotlibWriter'" + with pytest.raises(DatasetError, match=pattern): plot_writer.load() @pytest.mark.usefixtures("s3fs_cleanup") @@ -279,12 +278,26 @@ def test_prevent_overwrite(self, mock_single_plot, versioned_plot_writer): corresponding matplotlib file for a given save version already exists.""" versioned_plot_writer.save(mock_single_plot) pattern = ( - r"Save path \`.+\` for MatplotlibWriter\(.+\) must " + r"Save path \'.+\' for MatplotlibWriter\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_plot_writer.save(mock_single_plot) + def test_ineffective_overwrite(self, load_version, save_version): + pattern = ( + "Setting 'overwrite=True' is ineffective if versioning " + "is enabled, since the versioned path must not already " + "exist; overriding flag with 'overwrite=False' instead." 
+ ) + with pytest.warns(UserWarning, match=pattern): + versioned_plot_writer = MatplotlibWriter( + filepath="/tmp/file.txt", + version=Version(load_version, save_version), + overwrite=True, + ) + assert not versioned_plot_writer._overwrite + @pytest.mark.parametrize( "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True ) @@ -297,24 +310,26 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for MatplotlibWriter\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for MatplotlibWriter\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_plot_writer.save(mock_single_plot) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): MatplotlibWriter( filepath="https://example.com/file.png", version=Version(None, None) ) - def test_no_versions(self, versioned_plot_writer): + def test_load_not_supported(self, versioned_plot_writer): """Check the error if no versions are available for load.""" - pattern = r"Did not find any versions for MatplotlibWriter\(.+\)" - with pytest.raises(DataSetError, match=pattern): + pattern = ( + rf"Loading not supported for '{versioned_plot_writer.__class__.__name__}'" + ) + with pytest.raises(DatasetError, match=pattern): versioned_plot_writer.load() def test_exists(self, versioned_plot_writer, mock_single_plot): @@ -384,7 +399,7 @@ def test_versioning_existing_dataset_single_plot( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_plot_writer._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_plot_writer.save(mock_single_plot) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/networkx/test_gml_dataset.py b/tests/extras/datasets/networkx/test_gml_dataset.py new file mode 100644 index 0000000000..88f7b18a77 --- /dev/null +++ b/tests/extras/datasets/networkx/test_gml_dataset.py @@ -0,0 +1,188 @@ +from pathlib import Path, PurePosixPath + +import networkx +import pytest +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.networkx import GMLDataSet +from kedro.io import DatasetError, Version +from kedro.io.core import PROTOCOL_DELIMITER + +ATTRS = { + "source": "from", + "target": "to", + "name": "fake_id", + "key": "fake_key", + "link": "fake_link", +} + + +@pytest.fixture +def filepath_gml(tmp_path): + return (tmp_path / "some_dir" / "test.gml").as_posix() + + +@pytest.fixture +def gml_data_set(filepath_gml): + return GMLDataSet( + filepath=filepath_gml, + load_args={"destringizer": int}, + save_args={"stringizer": str}, + ) + + +@pytest.fixture +def versioned_gml_data_set(filepath_gml, load_version, save_version): + return GMLDataSet( + filepath=filepath_gml, + version=Version(load_version, save_version), + load_args={"destringizer": int}, + save_args={"stringizer": str}, + ) + + +@pytest.fixture() +def dummy_graph_data(): + return networkx.complete_graph(3) + + +class TestGMLDataSet: + def 
test_save_and_load(self, gml_data_set, dummy_graph_data): + """Test saving and reloading the data set.""" + gml_data_set.save(dummy_graph_data) + reloaded = gml_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + assert gml_data_set._fs_open_args_load == {"mode": "rb"} + assert gml_data_set._fs_open_args_save == {"mode": "wb"} + + def test_load_missing_file(self, gml_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set GMLDataSet\(.*\)" + with pytest.raises(DatasetError, match=pattern): + assert gml_data_set.load() + + def test_exists(self, gml_data_set, dummy_graph_data): + """Test `exists` method invocation.""" + assert not gml_data_set.exists() + gml_data_set.save(dummy_graph_data) + assert gml_data_set.exists() + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/file.gml", S3FileSystem), + ("file:///tmp/test.gml", LocalFileSystem), + ("/tmp/test.gml", LocalFileSystem), + ("gcs://bucket/file.gml", GCSFileSystem), + ("https://example.com/file.gml", HTTPFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + data_set = GMLDataSet(filepath=filepath) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.gml" + data_set = GMLDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestGMLDataSetVersioned: + def test_save_and_load(self, versioned_gml_data_set, dummy_graph_data): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_gml_data_set.save(dummy_graph_data) + reloaded = versioned_gml_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + assert versioned_gml_data_set._fs_open_args_load == {"mode": "rb"} + assert versioned_gml_data_set._fs_open_args_save == {"mode": "wb"} + + def test_no_versions(self, versioned_gml_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for GMLDataSet\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_gml_data_set.load() + + def test_exists(self, versioned_gml_data_set, dummy_graph_data): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_gml_data_set.exists() + versioned_gml_data_set.save(dummy_graph_data) + assert versioned_gml_data_set.exists() + + def test_prevent_override(self, versioned_gml_data_set, dummy_graph_data): + """Check the error when attempt to override the same data set + version.""" + versioned_gml_data_set.save(dummy_graph_data) + pattern = ( + r"Save path \'.+\' for GMLDataSet\(.+\) must not " + r"exist if versioning is enabled" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_gml_data_set.save(dummy_graph_data) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_gml_data_set, load_version, save_version, dummy_graph_data + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + rf"Save 
version '{save_version}' did not match " + rf"load version '{load_version}' for GMLDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_gml_data_set.save(dummy_graph_data) + + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.gml" + ds = GMLDataSet(filepath=filepath) + ds_versioned = GMLDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "GMLDataSet" in str(ds_versioned) + assert "GMLDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + + def test_versioning_existing_dataset( + self, gml_data_set, versioned_gml_data_set, dummy_graph_data + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + gml_data_set.save(dummy_graph_data) + assert gml_data_set.exists() + assert gml_data_set._filepath == versioned_gml_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_gml_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_gml_data_set.save(dummy_graph_data) + + # Remove non-versioned dataset and try again + Path(gml_data_set._filepath.as_posix()).unlink() + versioned_gml_data_set.save(dummy_graph_data) + assert versioned_gml_data_set.exists() diff --git a/tests/extras/datasets/networkx/test_graphml_dataset.py b/tests/extras/datasets/networkx/test_graphml_dataset.py new file mode 100644 index 0000000000..1d744a61cb --- /dev/null +++ b/tests/extras/datasets/networkx/test_graphml_dataset.py @@ -0,0 +1,188 @@ +from pathlib import Path, PurePosixPath + +import networkx +import pytest +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.networkx import GraphMLDataSet +from kedro.io import DatasetError, Version +from kedro.io.core import PROTOCOL_DELIMITER + +ATTRS = { + "source": "from", + "target": "to", + "name": "fake_id", + "key": "fake_key", + "link": "fake_link", +} + + +@pytest.fixture +def filepath_graphml(tmp_path): + return (tmp_path / "some_dir" / "test.graphml").as_posix() + + +@pytest.fixture +def graphml_data_set(filepath_graphml): + return GraphMLDataSet( + filepath=filepath_graphml, + load_args={"node_type": int}, + save_args={}, + ) + + +@pytest.fixture +def versioned_graphml_data_set(filepath_graphml, load_version, save_version): + return GraphMLDataSet( + filepath=filepath_graphml, + version=Version(load_version, save_version), + load_args={"node_type": int}, + save_args={}, + ) + + +@pytest.fixture() +def dummy_graph_data(): + return networkx.complete_graph(3) + + +class TestGraphMLDataSet: + def test_save_and_load(self, graphml_data_set, dummy_graph_data): + """Test saving and reloading the data set.""" + graphml_data_set.save(dummy_graph_data) + reloaded = graphml_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + assert graphml_data_set._fs_open_args_load == {"mode": "rb"} + assert graphml_data_set._fs_open_args_save == {"mode": "wb"} + + def 
test_load_missing_file(self, graphml_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set GraphMLDataSet\(.*\)" + with pytest.raises(DatasetError, match=pattern): + assert graphml_data_set.load() + + def test_exists(self, graphml_data_set, dummy_graph_data): + """Test `exists` method invocation.""" + assert not graphml_data_set.exists() + graphml_data_set.save(dummy_graph_data) + assert graphml_data_set.exists() + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/file.graphml", S3FileSystem), + ("file:///tmp/test.graphml", LocalFileSystem), + ("/tmp/test.graphml", LocalFileSystem), + ("gcs://bucket/file.graphml", GCSFileSystem), + ("https://example.com/file.graphml", HTTPFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + data_set = GraphMLDataSet(filepath=filepath) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.graphml" + data_set = GraphMLDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestGraphMLDataSetVersioned: + def test_save_and_load(self, versioned_graphml_data_set, dummy_graph_data): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_graphml_data_set.save(dummy_graph_data) + reloaded = versioned_graphml_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + assert versioned_graphml_data_set._fs_open_args_load == {"mode": "rb"} + assert versioned_graphml_data_set._fs_open_args_save == {"mode": "wb"} + + def test_no_versions(self, versioned_graphml_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for GraphMLDataSet\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_graphml_data_set.load() + + def test_exists(self, versioned_graphml_data_set, dummy_graph_data): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_graphml_data_set.exists() + versioned_graphml_data_set.save(dummy_graph_data) + assert versioned_graphml_data_set.exists() + + def test_prevent_override(self, versioned_graphml_data_set, dummy_graph_data): + """Check the error when attempt to override the same data set + version.""" + versioned_graphml_data_set.save(dummy_graph_data) + pattern = ( + r"Save path \'.+\' for GraphMLDataSet\(.+\) must not " + r"exist if versioning is enabled" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_graphml_data_set.save(dummy_graph_data) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_graphml_data_set, load_version, save_version, dummy_graph_data + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + rf"Save version '{save_version}' did not match " + rf"load version '{load_version}' for GraphMLDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_graphml_data_set.save(dummy_graph_data) + + def test_version_str_repr(self, 
load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.graphml" + ds = GraphMLDataSet(filepath=filepath) + ds_versioned = GraphMLDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "GraphMLDataSet" in str(ds_versioned) + assert "GraphMLDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + + def test_versioning_existing_dataset( + self, graphml_data_set, versioned_graphml_data_set, dummy_graph_data + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + graphml_data_set.save(dummy_graph_data) + assert graphml_data_set.exists() + assert graphml_data_set._filepath == versioned_graphml_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_graphml_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_graphml_data_set.save(dummy_graph_data) + + # Remove non-versioned dataset and try again + Path(graphml_data_set._filepath.as_posix()).unlink() + versioned_graphml_data_set.save(dummy_graph_data) + assert versioned_graphml_data_set.exists() diff --git a/tests/extras/datasets/networkx/test_json_dataset.py b/tests/extras/datasets/networkx/test_json_dataset.py new file mode 100644 index 0000000000..55c7ebd213 --- /dev/null +++ b/tests/extras/datasets/networkx/test_json_dataset.py @@ -0,0 +1,226 @@ +from pathlib import Path, PurePosixPath + +import networkx +import pytest +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.networkx import JSONDataSet +from kedro.io import DatasetError, Version +from kedro.io.core import PROTOCOL_DELIMITER + +ATTRS = { + "source": "from", + "target": "to", + "name": "fake_id", + "key": "fake_key", + "link": "fake_link", +} + + +@pytest.fixture +def filepath_json(tmp_path): + return (tmp_path / "some_dir" / "test.json").as_posix() + + +@pytest.fixture +def json_data_set(filepath_json, fs_args): + return JSONDataSet(filepath=filepath_json, fs_args=fs_args) + + +@pytest.fixture +def versioned_json_data_set(filepath_json, load_version, save_version): + return JSONDataSet( + filepath=filepath_json, version=Version(load_version, save_version) + ) + + +@pytest.fixture +def json_data_set_args(filepath_json): + return JSONDataSet( + filepath=filepath_json, load_args={"attrs": ATTRS}, save_args={"attrs": ATTRS} + ) + + +@pytest.fixture() +def dummy_graph_data(): + return networkx.complete_graph(3) + + +class TestJSONDataSet: + def test_save_and_load(self, json_data_set, dummy_graph_data): + """Test saving and reloading the data set.""" + json_data_set.save(dummy_graph_data) + reloaded = json_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + assert json_data_set._fs_open_args_load == {} + assert json_data_set._fs_open_args_save == {"mode": "w"} + + def test_load_missing_file(self, json_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set 
JSONDataSet\(.*\)" + with pytest.raises(DatasetError, match=pattern): + assert json_data_set.load() + + def test_load_args_save_args(self, mocker, json_data_set_args, dummy_graph_data): + """Test saving and reloading with save and load arguments.""" + patched_save = mocker.patch( + "networkx.node_link_data", wraps=networkx.node_link_data + ) + json_data_set_args.save(dummy_graph_data) + patched_save.assert_called_once_with(dummy_graph_data, attrs=ATTRS) + + patched_load = mocker.patch( + "networkx.node_link_graph", wraps=networkx.node_link_graph + ) + # load args need to be the same attrs as the ones used for saving + # in order to successfully retrieve data + reloaded = json_data_set_args.load() + + patched_load.assert_called_once_with( + { + "directed": False, + "multigraph": False, + "graph": {}, + "nodes": [{"fake_id": 0}, {"fake_id": 1}, {"fake_id": 2}], + "fake_link": [ + {"from": 0, "to": 1}, + {"from": 0, "to": 2}, + {"from": 1, "to": 2}, + ], + }, + attrs=ATTRS, + ) + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], + indirect=True, + ) + def test_open_extra_args(self, json_data_set, fs_args): + assert json_data_set._fs_open_args_load == fs_args["open_args_load"] + assert json_data_set._fs_open_args_save == {"mode": "w"} # default unchanged + + def test_exists(self, json_data_set, dummy_graph_data): + """Test `exists` method invocation.""" + assert not json_data_set.exists() + json_data_set.save(dummy_graph_data) + assert json_data_set.exists() + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/file.json", S3FileSystem), + ("file:///tmp/test.json", LocalFileSystem), + ("/tmp/test.json", LocalFileSystem), + ("gcs://bucket/file.json", GCSFileSystem), + ("https://example.com/file.json", HTTPFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + data_set = JSONDataSet(filepath=filepath) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.json" + data_set = JSONDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestJSONDataSetVersioned: + def test_save_and_load(self, versioned_json_data_set, dummy_graph_data): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_json_data_set.save(dummy_graph_data) + reloaded = versioned_json_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + + def test_no_versions(self, versioned_json_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for JSONDataSet\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_json_data_set.load() + + def test_exists(self, versioned_json_data_set, dummy_graph_data): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_json_data_set.exists() + versioned_json_data_set.save(dummy_graph_data) + assert versioned_json_data_set.exists() + + def test_prevent_override(self, versioned_json_data_set, dummy_graph_data): + """Check the error when attempt to override the same data set + version.""" + 
versioned_json_data_set.save(dummy_graph_data) + pattern = ( + r"Save path \'.+\' for JSONDataSet\(.+\) must not " + r"exist if versioning is enabled" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_json_data_set.save(dummy_graph_data) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_json_data_set, load_version, save_version, dummy_graph_data + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for JSONDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_json_data_set.save(dummy_graph_data) + + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.json" + ds = JSONDataSet(filepath=filepath) + ds_versioned = JSONDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "JSONDataSet" in str(ds_versioned) + assert "JSONDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + + def test_versioning_existing_dataset( + self, json_data_set, versioned_json_data_set, dummy_graph_data + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + json_data_set.save(dummy_graph_data) + assert json_data_set.exists() + assert json_data_set._filepath == versioned_json_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_json_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_json_data_set.save(dummy_graph_data) + + # Remove non-versioned dataset and try again + Path(json_data_set._filepath.as_posix()).unlink() + versioned_json_data_set.save(dummy_graph_data) + assert versioned_json_data_set.exists() diff --git a/tests/extras/datasets/networkx/test_networkx_dataset.py b/tests/extras/datasets/networkx/test_networkx_dataset.py deleted file mode 100644 index bd55f40f5e..0000000000 --- a/tests/extras/datasets/networkx/test_networkx_dataset.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path, PurePosixPath - -import networkx -import pytest -from fsspec.implementations.http import HTTPFileSystem -from fsspec.implementations.local import LocalFileSystem -from gcsfs import GCSFileSystem -from s3fs.core import S3FileSystem - -from kedro.extras.datasets.networkx import NetworkXDataSet -from kedro.io import DataSetError, Version -from kedro.io.core import PROTOCOL_DELIMITER - -ATTRS = { - "source": "from", - "target": "to", - "name": "fake_id", - "key": "fake_key", - "link": "fake_link", -} - - -@pytest.fixture -def filepath_json(tmp_path): - return (tmp_path / "some_dir" / "test.json").as_posix() - - -@pytest.fixture -def networkx_data_set(filepath_json, fs_args): - return NetworkXDataSet(filepath=filepath_json, fs_args=fs_args) - - -@pytest.fixture -def versioned_networkx_data_set(filepath_json, load_version, save_version): - return NetworkXDataSet( - filepath=filepath_json, version=Version(load_version, save_version) - ) - - -@pytest.fixture -def networkx_data_set_args(filepath_json): - return NetworkXDataSet( - filepath=filepath_json, load_args={"attrs": ATTRS}, save_args={"attrs": ATTRS} - ) - - -@pytest.fixture() -def dummy_graph_data(): - return networkx.complete_graph(3) - - -class TestNetworkXDataSet: - def test_save_and_load(self, networkx_data_set, dummy_graph_data): - """Test saving and reloading the data set.""" - networkx_data_set.save(dummy_graph_data) - reloaded = networkx_data_set.load() - assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) - assert networkx_data_set._fs_open_args_load == {} - assert networkx_data_set._fs_open_args_save == {"mode": "w"} - - def test_load_missing_file(self, networkx_data_set): - """Check the error when trying to load missing file.""" - pattern = r"Failed while loading data from data set NetworkXDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): - assert networkx_data_set.load() - - def test_load_args_save_args( - self, mocker, networkx_data_set_args, dummy_graph_data - ): - """Test saving and reloading with save and load arguments.""" - patched_save = mocker.patch( - "networkx.node_link_data", wraps=networkx.node_link_data - ) - networkx_data_set_args.save(dummy_graph_data) - patched_save.assert_called_once_with(dummy_graph_data, attrs=ATTRS) - - patched_load = mocker.patch( - "networkx.node_link_graph", wraps=networkx.node_link_graph - ) - # load args need to be the same attrs as the ones used for saving - # in order to successfully retrieve data - reloaded = networkx_data_set_args.load() - - patched_load.assert_called_once_with( - { - "directed": False, - "multigraph": False, - "graph": {}, - "nodes": [{"fake_id": 0}, {"fake_id": 1}, {"fake_id": 2}], - "fake_link": [ - {"from": 0, "to": 1}, - {"from": 0, "to": 2}, - {"from": 1, "to": 2}, - ], - }, - attrs=ATTRS, - ) - assert dummy_graph_data.nodes(data=True) == 
reloaded.nodes(data=True) - - @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, - ) - def test_open_extra_args(self, networkx_data_set, fs_args): - assert networkx_data_set._fs_open_args_load == fs_args["open_args_load"] - assert networkx_data_set._fs_open_args_save == { - "mode": "w" - } # default unchanged - - def test_exists(self, networkx_data_set, dummy_graph_data): - """Test `exists` method invocation.""" - assert not networkx_data_set.exists() - networkx_data_set.save(dummy_graph_data) - assert networkx_data_set.exists() - - @pytest.mark.parametrize( - "filepath,instance_type", - [ - ("s3://bucket/file.json", S3FileSystem), - ("file:///tmp/test.json", LocalFileSystem), - ("/tmp/test.json", LocalFileSystem), - ("gcs://bucket/file.json", GCSFileSystem), - ("https://example.com/file.json", HTTPFileSystem), - ], - ) - def test_protocol_usage(self, filepath, instance_type): - data_set = NetworkXDataSet(filepath=filepath) - assert isinstance(data_set._fs, instance_type) - - path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] - - assert str(data_set._filepath) == path - assert isinstance(data_set._filepath, PurePosixPath) - - def test_catalog_release(self, mocker): - fs_mock = mocker.patch("fsspec.filesystem").return_value - filepath = "test.json" - data_set = NetworkXDataSet(filepath=filepath) - data_set.release() - fs_mock.invalidate_cache.assert_called_once_with(filepath) - - -class TestNetworkXDataSetVersioned: - def test_save_and_load(self, versioned_networkx_data_set, dummy_graph_data): - """Test that saved and reloaded data matches the original one for - the versioned data set.""" - versioned_networkx_data_set.save(dummy_graph_data) - reloaded = versioned_networkx_data_set.load() - assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) - - def test_no_versions(self, versioned_networkx_data_set): - """Check the error if no versions are available for load.""" - pattern = r"Did not find any versions for NetworkXDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): - versioned_networkx_data_set.load() - - def test_exists(self, versioned_networkx_data_set, dummy_graph_data): - """Test `exists` method invocation for versioned data set.""" - assert not versioned_networkx_data_set.exists() - versioned_networkx_data_set.save(dummy_graph_data) - assert versioned_networkx_data_set.exists() - - def test_prevent_override(self, versioned_networkx_data_set, dummy_graph_data): - """Check the error when attempt to override the same data set - version.""" - versioned_networkx_data_set.save(dummy_graph_data) - pattern = ( - r"Save path \`.+\` for NetworkXDataSet\(.+\) must not " - r"exist if versioning is enabled" - ) - with pytest.raises(DataSetError, match=pattern): - versioned_networkx_data_set.save(dummy_graph_data) - - @pytest.mark.parametrize( - "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True - ) - @pytest.mark.parametrize( - "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True - ) - def test_save_version_warning( - self, versioned_networkx_data_set, load_version, save_version, dummy_graph_data - ): - """Check the warning when saving to the path that differs from - the subsequent load path.""" - pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for NetworkXDataSet\(.+\)".format(save_version, load_version) - ) - with pytest.warns(UserWarning, match=pattern): - versioned_networkx_data_set.save(dummy_graph_data) - - def test_version_str_repr(self, load_version, 
save_version): - """Test that version is in string representation of the class instance - when applicable.""" - filepath = "test.json" - ds = NetworkXDataSet(filepath=filepath) - ds_versioned = NetworkXDataSet( - filepath=filepath, version=Version(load_version, save_version) - ) - assert filepath in str(ds) - assert "version" not in str(ds) - - assert filepath in str(ds_versioned) - ver_str = f"version=Version(load={load_version}, save='{save_version}')" - assert ver_str in str(ds_versioned) - assert "NetworkXDataSet" in str(ds_versioned) - assert "NetworkXDataSet" in str(ds) - assert "protocol" in str(ds_versioned) - assert "protocol" in str(ds) - - def test_versioning_existing_dataset( - self, networkx_data_set, versioned_networkx_data_set, dummy_graph_data - ): - """Check the error when attempting to save a versioned dataset on top of an - already existing (non-versioned) dataset.""" - networkx_data_set.save(dummy_graph_data) - assert networkx_data_set.exists() - assert networkx_data_set._filepath == versioned_networkx_data_set._filepath - pattern = ( - f"(?=.*file with the same name already exists in the directory)" - f"(?=.*{versioned_networkx_data_set._filepath.parent.as_posix()})" - ) - with pytest.raises(DataSetError, match=pattern): - versioned_networkx_data_set.save(dummy_graph_data) - - # Remove non-versioned dataset and try again - Path(networkx_data_set._filepath.as_posix()).unlink() - versioned_networkx_data_set.save(dummy_graph_data) - assert versioned_networkx_data_set.exists() diff --git a/tests/extras/datasets/pandas/test_csv_dataset.py b/tests/extras/datasets/pandas/test_csv_dataset.py index 2592e8afef..a2a15f5938 100644 --- a/tests/extras/datasets/pandas/test_csv_dataset.py +++ b/tests/extras/datasets/pandas/test_csv_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- from pathlib import Path, PurePosixPath from time import sleep @@ -39,7 +11,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.pandas import CSVDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp @@ -112,8 +84,8 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): records = [r for r in caplog.records if r.levelname == "WARNING"] expected_log_message = ( - f"Dropping `storage_options` for {filepath}, " - f"please specify them under `fs_args` or `credentials`." + f"Dropping 'storage_options' for {filepath}, " + f"please specify them under 'fs_args' or 'credentials'." ) assert records[0].getMessage() == expected_log_message assert "storage_options" not in ds._save_args @@ -122,7 +94,7 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): def test_load_missing_file(self, csv_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set CSVDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): csv_data_set.load() @pytest.mark.parametrize( @@ -261,7 +233,7 @@ def test_release_instance_cache(self, dummy_dataframe, filepath_csv): def test_no_versions(self, versioned_csv_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for CSVDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_csv_data_set.load() def test_exists(self, versioned_csv_data_set, dummy_dataframe): @@ -275,10 +247,10 @@ def test_prevent_overwrite(self, versioned_csv_data_set, dummy_dataframe): corresponding CSV file for a given save version already exists.""" versioned_csv_data_set.save(dummy_dataframe) pattern = ( - r"Save path \`.+\` for CSVDataSet\(.+\) must " + r"Save path \'.+\' for CSVDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_csv_data_set.save(dummy_dataframe) @pytest.mark.parametrize( @@ -293,16 +265,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for CSVDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for CSVDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_csv_data_set.save(dummy_dataframe) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." 
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): CSVDataSet( filepath="https://example.com/file.csv", version=Version(None, None) ) @@ -319,7 +291,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_csv_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_csv_data_set.save(dummy_dataframe) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/pandas/test_excel_dataset.py b/tests/extras/datasets/pandas/test_excel_dataset.py index a9b30643f3..d558d3b22f 100644 --- a/tests/extras/datasets/pandas/test_excel_dataset.py +++ b/tests/extras/datasets/pandas/test_excel_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- from pathlib import Path, PurePosixPath import pandas as pd @@ -37,7 +9,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.pandas import ExcelDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version @@ -56,6 +28,17 @@ def excel_data_set(filepath_excel, load_args, save_args, fs_args): ) +@pytest.fixture +def excel_multisheet_data_set(filepath_excel, save_args, fs_args): + load_args = {"sheet_name": None} + return ExcelDataSet( + filepath=filepath_excel, + load_args=load_args, + save_args=save_args, + fs_args=fs_args, + ) + + @pytest.fixture def versioned_excel_data_set(filepath_excel, load_version, save_version): return ExcelDataSet( @@ -68,6 +51,11 @@ def dummy_dataframe(): return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) +@pytest.fixture +def another_dummy_dataframe(): + return pd.DataFrame({"x": [10, 20], "y": ["hello", "world"]}) + + class TestExcelDataSet: def test_save_and_load(self, excel_data_set, dummy_dataframe): """Test saving and reloading the data set.""" @@ -75,6 +63,19 @@ def test_save_and_load(self, excel_data_set, dummy_dataframe): reloaded = excel_data_set.load() assert_frame_equal(dummy_dataframe, reloaded) + def test_save_and_load_multiple_sheets( + self, excel_multisheet_data_set, dummy_dataframe, another_dummy_dataframe + ): + """Test saving and reloading the data set with multiple sheets.""" + dummy_multisheet = { + "sheet 1": dummy_dataframe, + "sheet 2": another_dummy_dataframe, + } + excel_multisheet_data_set.save(dummy_multisheet) + reloaded = excel_multisheet_data_set.load() + assert_frame_equal(dummy_multisheet["sheet 1"], reloaded["sheet 1"]) + assert_frame_equal(dummy_multisheet["sheet 2"], reloaded["sheet 2"]) + def test_exists(self, excel_data_set, dummy_dataframe): """Test `exists` method invocation for both existing and nonexistent data set.""" @@ -113,8 +114,8 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): records = [r for r in caplog.records if r.levelname == "WARNING"] expected_log_message = ( - f"Dropping `storage_options` for {filepath}, " - f"please specify them under `fs_args` or `credentials`." + f"Dropping 'storage_options' for {filepath}, " + f"please specify them under 'fs_args' or 'credentials'." ) assert records[0].getMessage() == expected_log_message assert "storage_options" not in ds._save_args @@ -123,7 +124,7 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): def test_load_missing_file(self, excel_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set ExcelDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): excel_data_set.load() @pytest.mark.parametrize( @@ -199,7 +200,7 @@ def test_save_and_load(self, versioned_excel_data_set, dummy_dataframe): def test_no_versions(self, versioned_excel_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for ExcelDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_excel_data_set.load() def test_versioning_not_supported_in_append_mode( @@ -208,8 +209,8 @@ def test_versioning_not_supported_in_append_mode( filepath = str(tmp_path / "test.xlsx") save_args = {"writer": {"mode": "a"}} - pattern = "`ExcelDataSet` doesn't support versioning in append mode." 
- with pytest.raises(DataSetError, match=pattern): + pattern = "'ExcelDataSet' doesn't support versioning in append mode." + with pytest.raises(DatasetError, match=pattern): ExcelDataSet( filepath=filepath, version=Version(load_version, save_version), @@ -227,10 +228,10 @@ def test_prevent_overwrite(self, versioned_excel_data_set, dummy_dataframe): corresponding Excel file for a given save version already exists.""" versioned_excel_data_set.save(dummy_dataframe) pattern = ( - r"Save path \`.+\` for ExcelDataSet\(.+\) must " + r"Save path \'.+\' for ExcelDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_excel_data_set.save(dummy_dataframe) @pytest.mark.parametrize( @@ -245,16 +246,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for ExcelDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for ExcelDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_excel_data_set.save(dummy_dataframe) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): ExcelDataSet( filepath="https://example.com/file.xlsx", version=Version(None, None) ) @@ -271,7 +272,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_excel_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_excel_data_set.save(dummy_dataframe) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/pandas/test_feather_dataset.py b/tests/extras/datasets/pandas/test_feather_dataset.py index 988c7eaca4..8637bd2bcf 100644 --- a/tests/extras/datasets/pandas/test_feather_dataset.py +++ b/tests/extras/datasets/pandas/test_feather_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - from pathlib import Path, PurePosixPath import pandas as pd @@ -37,7 +9,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.pandas import FeatherDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version @@ -102,8 +74,8 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): records = [r for r in caplog.records if r.levelname == "WARNING"] expected_log_message = ( - f"Dropping `storage_options` for {filepath}, " - f"please specify them under `fs_args` or `credentials`." + f"Dropping 'storage_options' for {filepath}, " + f"please specify them under 'fs_args' or 'credentials'." ) assert records[0].getMessage() == expected_log_message assert "storage_options" not in ds._save_args @@ -112,7 +84,7 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): def test_load_missing_file(self, feather_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set FeatherDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): feather_data_set.load() @pytest.mark.parametrize( @@ -181,7 +153,7 @@ def test_save_and_load(self, versioned_feather_data_set, dummy_dataframe): def test_no_versions(self, versioned_feather_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for FeatherDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_feather_data_set.load() def test_exists(self, versioned_feather_data_set, dummy_dataframe): @@ -195,10 +167,10 @@ def test_prevent_overwrite(self, versioned_feather_data_set, dummy_dataframe): corresponding feather file for a given save version already exists.""" versioned_feather_data_set.save(dummy_dataframe) pattern = ( - r"Save path \`.+\` for FeatherDataSet\(.+\) must " + r"Save path \'.+\' for FeatherDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_feather_data_set.save(dummy_dataframe) @pytest.mark.parametrize( @@ -213,16 +185,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for FeatherDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for FeatherDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_feather_data_set.save(dummy_dataframe) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." 
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): FeatherDataSet( filepath="https://example.com/file.feather", version=Version(None, None) ) @@ -239,7 +211,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_feather_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_feather_data_set.save(dummy_dataframe) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/pandas/test_gbq_dataset.py b/tests/extras/datasets/pandas/test_gbq_dataset.py index 9a889a7e28..475f25c93b 100644 --- a/tests/extras/datasets/pandas/test_gbq_dataset.py +++ b/tests/extras/datasets/pandas/test_gbq_dataset.py @@ -1,42 +1,17 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
+from pathlib import PosixPath import pandas as pd import pytest from google.cloud.exceptions import NotFound from pandas.testing import assert_frame_equal -from kedro.extras.datasets.pandas import GBQTableDataSet -from kedro.io.core import DataSetError +from kedro.extras.datasets.pandas import GBQQueryDataSet, GBQTableDataSet +from kedro.io.core import DatasetError DATASET = "dataset" TABLE_NAME = "table_name" PROJECT = "project" +SQL_QUERY = "SELECT * FROM table_a" @pytest.fixture @@ -64,6 +39,35 @@ def gbq_dataset( ) +@pytest.fixture(params=[{}]) +def gbq_sql_dataset(load_args, mock_bigquery_client): # pylint: disable=unused-argument + return GBQQueryDataSet( + sql=SQL_QUERY, + project=PROJECT, + credentials=None, + load_args=load_args, + ) + + +@pytest.fixture +def sql_file(tmp_path: PosixPath): + file = tmp_path / "test.sql" + file.write_text(SQL_QUERY) + return file.as_posix() + + +@pytest.fixture(params=[{}]) +def gbq_sql_file_dataset( + load_args, sql_file, mock_bigquery_client +): # pylint: disable=unused-argument + return GBQQueryDataSet( + filepath=sql_file, + project=PROJECT, + credentials=None, + load_args=load_args, + ) + + class TestGBQDataSet: def test_exists(self, mock_bigquery_client): """Test `exists` method invocation.""" @@ -99,18 +103,16 @@ def test_load_missing_file(self, gbq_dataset, mocker): "kedro.extras.datasets.pandas.gbq_dataset.pd.read_gbq" ) mocked_read_gbq.side_effect = ValueError - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): gbq_dataset.load() @pytest.mark.parametrize("load_args", [{"location": "l1"}], indirect=True) @pytest.mark.parametrize("save_args", [{"location": "l2"}], indirect=True) def test_invalid_location(self, save_args, load_args): """Check the error when initializing instance if save_args and load_args - `location` are different.""" - pattern = ( - r"`load_args\['location'\]` is different from `save_args\['location'\]`." - ) - with pytest.raises(DataSetError, match=pattern): + 'location' are different.""" + pattern = r""""load_args\['location'\]" is different from "save_args\['location'\]".""" + with pytest.raises(DatasetError, match=pattern): GBQTableDataSet( dataset=DATASET, table_name=TABLE_NAME, @@ -180,7 +182,7 @@ def test_read_gbq_with_query(self, gbq_dataset, dummy_dataframe, mocker, load_ar ) def test_validation_of_dataset_and_table_name(self, dataset, table_name): pattern = "Neither white-space nor semicolon are allowed.*" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): GBQTableDataSet(dataset=dataset, table_name=table_name) def test_credentials_propagation(self, mocker): @@ -206,3 +208,108 @@ def test_credentials_propagation(self, mocker): mocked_bigquery.Client.assert_called_once_with( project=PROJECT, credentials=credentials_obj, location=None ) + + +class TestGBQQueryDataSet: + def test_empty_query_error(self): + """Check the error when instantiating with empty query or file""" + pattern = ( + r"'sql' and 'filepath' arguments cannot both be empty\." + r"Please provide a sql query or path to a sql query file\." 
+ ) + with pytest.raises(DatasetError, match=pattern): + GBQQueryDataSet(sql="", filepath="", credentials=None) + + @pytest.mark.parametrize( + "load_args", [{"k1": "v1", "index": "value"}], indirect=True + ) + def test_load_extra_params(self, gbq_sql_dataset, load_args): + """Test overriding the default load arguments.""" + for key, value in load_args.items(): + assert gbq_sql_dataset._load_args[key] == value + + def test_credentials_propagation(self, mocker): + credentials = {"token": "my_token"} + credentials_obj = "credentials" + mocked_credentials = mocker.patch( + "kedro.extras.datasets.pandas.gbq_dataset.Credentials", + return_value=credentials_obj, + ) + mocked_bigquery = mocker.patch( + "kedro.extras.datasets.pandas.gbq_dataset.bigquery" + ) + + data_set = GBQQueryDataSet( + sql=SQL_QUERY, + credentials=credentials, + project=PROJECT, + ) + + assert data_set._credentials == credentials_obj + mocked_credentials.assert_called_once_with(**credentials) + mocked_bigquery.Client.assert_called_once_with( + project=PROJECT, credentials=credentials_obj, location=None + ) + + def test_load(self, mocker, gbq_sql_dataset, dummy_dataframe): + """Test `load` method invocation""" + mocked_read_gbq = mocker.patch( + "kedro.extras.datasets.pandas.gbq_dataset.pd.read_gbq" + ) + mocked_read_gbq.return_value = dummy_dataframe + + loaded_data = gbq_sql_dataset.load() + + mocked_read_gbq.assert_called_once_with( + project_id=PROJECT, credentials=None, query=SQL_QUERY + ) + + assert_frame_equal(dummy_dataframe, loaded_data) + + def test_load_query_file(self, mocker, gbq_sql_file_dataset, dummy_dataframe): + """Test `load` method invocation using a file as input query""" + mocked_read_gbq = mocker.patch( + "kedro.extras.datasets.pandas.gbq_dataset.pd.read_gbq" + ) + mocked_read_gbq.return_value = dummy_dataframe + + loaded_data = gbq_sql_file_dataset.load() + + mocked_read_gbq.assert_called_once_with( + project_id=PROJECT, credentials=None, query=SQL_QUERY + ) + + assert_frame_equal(dummy_dataframe, loaded_data) + + def test_save_error(self, gbq_sql_dataset, dummy_dataframe): + """Check the error when trying to save to the data set""" + pattern = r"'save' is not supported on GBQQueryDataSet" + with pytest.raises(DatasetError, match=pattern): + gbq_sql_dataset.save(dummy_dataframe) + + def test_str_representation_sql(self, gbq_sql_dataset, sql_file): + """Test the data set instance string representation""" + str_repr = str(gbq_sql_dataset) + assert ( + f"GBQQueryDataSet(filepath=None, load_args={{}}, sql={SQL_QUERY})" + in str_repr + ) + assert sql_file not in str_repr + + def test_str_representation_filepath(self, gbq_sql_file_dataset, sql_file): + """Test the data set instance string representation with filepath arg.""" + str_repr = str(gbq_sql_file_dataset) + assert ( + f"GBQQueryDataSet(filepath={str(sql_file)}, load_args={{}}, sql=None)" + in str_repr + ) + assert SQL_QUERY not in str_repr + + def test_sql_and_filepath_args(self, sql_file): + """Test that an error is raised when both `sql` and `filepath` args are given.""" + pattern = ( + r"'sql' and 'filepath' arguments cannot both be provided." + r"Please only provide one." 
+ ) + with pytest.raises(DatasetError, match=pattern): + GBQQueryDataSet(sql=SQL_QUERY, filepath=sql_file) diff --git a/tests/extras/datasets/pandas/test_generic_dataset.py b/tests/extras/datasets/pandas/test_generic_dataset.py new file mode 100644 index 0000000000..23feb861e8 --- /dev/null +++ b/tests/extras/datasets/pandas/test_generic_dataset.py @@ -0,0 +1,383 @@ +from pathlib import Path, PurePosixPath +from time import sleep + +import pandas as pd +import pytest +from adlfs import AzureBlobFileSystem +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from pandas._testing import assert_frame_equal +from s3fs import S3FileSystem + +from kedro.extras.datasets.pandas import GenericDataSet +from kedro.io import DatasetError, Version +from kedro.io.core import PROTOCOL_DELIMITER, generate_timestamp + + +@pytest.fixture +def filepath_sas(tmp_path): + return tmp_path / "test.sas7bdat" + + +@pytest.fixture +def filepath_csv(tmp_path): + return tmp_path / "test.csv" + + +@pytest.fixture +def filepath_html(tmp_path): + return tmp_path / "test.html" + + +# pylint: disable = line-too-long +@pytest.fixture() +def sas_binary(): + return b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc2\xea\x81`\xb3\x14\x11\xcf\xbd\x92\x08\x00\t\xc71\x8c\x18\x1f\x10\x11""\x002"\x01\x022\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x01\x18\x1f\x10\x11""\x002"\x01\x022\x042\x01""\x00\x00\x00\x00\x10\x03\x01\x00\x00\x00\x00\x00\x00\x00\x00SAS FILEAIRLINE DATA \x00\x00\xc0\x95j\xbe\xd6A\x00\x00\xc0\x95j\xbe\xd6A\x00\x00\x00\x00\x00 \xbc@\x00\x00\x00\x00\x00 \xbc@\x00\x04\x00\x00\x00\x10\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x009.0000M0WIN\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00WIN\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc0\x95LN\xaf\xf0LN\xaf\xf0LN\xaf\xf0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00jIW-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00kIW-\x00\x00\x00\x00\x00\x00\x00\x00<\x04\x00\x00\x00\x02-\x00\r\x00\x00\x00 \x0e\x00\x00\xe0\x01\x00\x00\x00\x00\x00\x00\x14\x0e\x00\x00\x0c\x00\x00\x00\x00\x00\x00\x00\xe4\x0c\x00\x000\x01\x00\x00\x00\x00\x00\x00H\x0c\x00\x00\x9c\x00\x00\x00\x00\x01\x00\x00\x04\x0c\x00\x00D\x00\x00\x00\x00\x01\x00\x00\xa8\x0b\x00\x00\\\x00\x00\x00\x00\x01\x00\x00t\x0b\x00\x004\x00\x00\x00\x00\x00\x00\x00@\x0b\x00\x004\x00\x00\x00\x00\x00\x00\x00\x0c\x0b\x00\x004\x00\x00\x00\x00\x00\x00\x00\xd8\n\x00\x004\x00\x00\x00\x00\x00\x00\x00\xa4\n\x00\x004\x00\x00\x00\x00\x00\x00\x00p\n\x00\x004\x00\x00\x00\x00\x00\x00\x00p\n\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00p\x9e@\x00\x00\x00@\x8bl\xf3?\x00\x00\x00\xc0\x9f\x1a\xcf?\x00\x00\x00\xa0w\x9c\xc2?\x00\x00\x00\x00\xd7\xa3\xf6?\x00\x00\x00\x00\x81\x95\xe3?\x00t\x9e@\x00\x00\x00\xe0\xfb\xa9\xf5?\x00\x00\x00\x00\xd7\xa3\xd0?\x00\x00\x00`\xb3\xea\xcb?\x00\x00\x00 \xdd$\xf6?\x00\x00\x00\x00T\xe3\xe1?\x00x\x9e@\x00\x00\x00\xc0\x9f\x1a\xf9?\x00\x00\x00\x80\xc0\xca\xd1?\x00\x00\x00\xc0m4\xd4?\x00\x00\x00\x80?5\xf6?\x00\x00\x00 \x04V\xe2?\x00|\x9e@\x00\x00\x00\x00\x02+\xff?\x00\x00\x00@\x0c\x02\xd3?\x00\x00\x00\xc0K7\xd9?\x00\x00\x00\xc0\xcc\xcc\xf8?\x00\x00\x00\xc0I\x0c\xe2?\x00\x80\x9e@\x00\x00\x00`\xb8\x1e\x02@\x00\x00\x00@\n\xd7\xd3?\x00\x00\x00\xc0\x10\xc7\xd6?\x00\x00\x00\x00\xfe\xd4\xfc?\x00\x00\x00@5^\xe2?\x00\x84\x9e@\x00\x00\x00\x80\x16\xd9\x05@\x00\x00\x00\xe0\xa5\x9b\xd4?\x00\x00\x00`\xc5\xfe\xd6?\x00\x00\x00`\xe5\xd0\xfe?\x00\x00\x00 \x83\xc0\xe6?\x00\x88\x9e@\x00\x00\x00@33\x08@\x00\x00\x00\xe0\xa3p\xd5?\x00\x00\x00`\x8f\xc2\xd9?\x00\x00\x00@\x8bl\xff?\x00\x00\x00\x00\xfe\xd4\xe8?\x00\x8c\x9e@\x00\x00\x00\xe0\xf9~\x0c@\x00\x00\x00`ff\xd6?\x00\x00\x00\xe0\xb3Y\xd9?\x00\x00\x00`\x91\xed\x00@\x00\x00\x00\xc0\xc8v\xea?\x00\x90\x9e@\x00\x00\x00\x00\xfe\xd4\x0f@\x00\x00\x00\xc0\x9f\x1a\xd7?\x00\x00\x00\x00\xf7u\xd8?\x00\x00\x00@\xe1z\x03@\x00\x00\x00\xa0\x99\x99\xe9?\x00\x94\x9e@\x00\x00\x00\x80\x14\xae\x11@\x00\x00\x00@\x89A\xd8?\x00\x00\x00\xa0\xed|\xd3?\x00\x00\x00\xa0\xef\xa7\x05@\x00\x00\x00\x00\xd5x\xed?\x00\x98\x9e@\x00\x00\x00 
\x83@\x12@\x00\x00\x00\xe0$\x06\xd9?\x00\x00\x00`\x81\x04\xd5?\x00\x00\x00`\xe3\xa5\x05@\x00\x00\x00\xa0n\x12\xf1?\x00\x9c\x9e@\x00\x00\x00\x80=\x8a\x15@\x00\x00\x00\x80\x95C\xdb?\x00\x00\x00\xa0\xab\xad\xd8?\x00\x00\x00\xa0\x9b\xc4\x06@\x00\x00\x00\xc0\xf7S\xf1?\x00\xa0\x9e@\x00\x00\x00\xc0K7\x16@\x00\x00\x00 X9\xdc?\x00\x00\x00@io\xd4?\x00\x00\x00\xa0E\xb6\x08@\x00\x00\x00\x00-\xb2\xf7?\x00\xa4\x9e@\x00\x00\x00\x00)\xdc\x15@\x00\x00\x00\xe0\xa3p\xdd?\x00\x00\x00@\xa2\xb4\xd3?\x00\x00\x00 \xdb\xf9\x08@\x00\x00\x00\xe0\xa7\xc6\xfb?\x00\xa8\x9e@\x00\x00\x00\xc0\xccL\x17@\x00\x00\x00\x80=\n\xdf?\x00\x00\x00@\x116\xd8?\x00\x00\x00\x00\xd5x\t@\x00\x00\x00`\xe5\xd0\xfe?\x00\xac\x9e@\x00\x00\x00 \x06\x81\x1b@\x00\x00\x00\xe0&1\xe0?\x00\x00\x00 \x83\xc0\xda?\x00\x00\x00\xc0\x9f\x1a\n@\x00\x00\x00\xc0\xf7S\x00@\x00\xb0\x9e@\x00\x00\x00\x80\xc0J\x1f@\x00\x00\x00\xc0K7\xe1?\x00\x00\x00\xa0\x87\x85\xe0?\x00\x00\x00\xa0\xc6K\x0b@\x00\x00\x00@\xb6\xf3\xff?\x00\xb4\x9e@\x00\x00\x00\xa0p="@\x00\x00\x00\xc0I\x0c\xe2?\x00\x00\x00\xa0\x13\xd0\xe2?\x00\x00\x00`\xe7\xfb\x0c@\x00\x00\x00\x00V\x0e\x02@\x00\xb8\x9e@\x00\x00\x00\xe0$\x06%@\x00\x00\x00 \x83\xc0\xe2?\x00\x00\x00\xe0H.\xe1?\x00\x00\x00\xa0\xc6K\x10@\x00\x00\x00\xc0\x9d\xef\x05@\x00\xbc\x9e@\x00\x00\x00\x80=\n*@\x00\x00\x00\x80l\xe7\xe3?\x00\x00\x00@io\xdc?\x00\x00\x00@\n\xd7\x12@\x00\x00\x00`\x12\x83\x0c@\x00\xc0\x9e@\x00\x00\x00\xc0\xa1\x85.@\x00\x00\x00@\xdfO\xe5?\x00\x00\x00\xa0e\x88\xd3?\x00\x00\x00@5\xde\x14@\x00\x00\x00\x80h\x11\x13@\x00\xc4\x9e@\x00\x00\x00\xc0 P0@\x00\x00\x00 Zd\xe7?\x00\x00\x00`\x7f\xd9\xcd?\x00\x00\x00\xe0\xa7F\x16@\x00\x00\x00\xa0C\x0b\x1a@\x00\xc8\x9e@\x00\x00\x00 \x83\x000@\x00\x00\x00@\x8d\x97\xea?\x00\x00\x00\xe06\x1a\xc8?\x00\x00\x00@\xe1\xfa\x15@\x00\x00\x00@\x0c\x82\x1e@\x00\xcc\x9e@\x00\x00\x00 \x83\xc0/@\x00\x00\x00\xc0\xf3\xfd\xec?\x00\x00\x00`\xf7\xe4\xc9?\x00\x00\x00 \x04V\x15@\x00\x00\x00\x80\x93X!@\x00\xd0\x9e@\x00\x00\x00\xe0x\xa90@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\xa0\xd4\t\xd0?\x00\x00\x00\xa0Ga\x15@\x00\x00\x00\xe0x\xa9 @\x00\xd4\x9e@\x00\x00\x00\x80\x95\x031@\x00\x00\x00@`\xe5\xf0?\x00\x00\x00@@\x13\xd1?\x00\x00\x00`\xe3\xa5\x16@\x00\x00\x00 /\x1d!@\x00\xd8\x9e@\x00\x00\x00\x80\x14N3@\x00\x00\x00\x80\x93\x18\xf2?\x00\x00\x00\xa0\xb2\x0c\xd1?\x00\x00\x00\x00\x7f\xea\x16@\x00\x00\x00\xa0\x18\x04#@\x00\xdc\x9e@\x00\x00\x00\x80\x93\xb82@\x00\x00\x00@\xb6\xf3\xf3?\x00\x00\x00\xc0\xeas\xcd?\x00\x00\x00\x00T\xe3\x16@\x00\x00\x00\x80\xbe\x1f"@\x00\xe0\x9e@\x00\x00\x00\x00\x00@3@\x00\x00\x00\x00\x00\x00\xf6?\x00\x00\x00\xc0\xc1\x17\xd6?\x00\x00\x00\xc0I\x0c\x17@\x00\x00\x00\xe0$\x86 @\x00\xe4\x9e@\x00\x00\x00\xc0\xa1\xa54@\x00\x00\x00`9\xb4\xf8?\x00\x00\x00@\xe8\xd9\xdc?\x00\x00\x00@\x0c\x82\x17@\x00\x00\x00@`\xe5\x1d@\x00\xe8\x9e@\x00\x00\x00 
\xdb\xb96@\x00\x00\x00\xe0|?\xfb?\x00\x00\x00@p\xce\xe2?\x00\x00\x00\x80\x97n\x18@\x00\x00\x00\x00\x7fj\x1c@\x00\xec\x9e@\x00\x00\x00\xc0v\x9e7@\x00\x00\x00\xc0\xc8v\xfc?\x00\x00\x00\x80q\x1b\xe1?\x00\x00\x00\xc0rh\x1b@\x00\x00\x00\xe0\xf9~\x1b@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00p\x00\r\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00`\x00\x0b\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00L\x00\r\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00<\x00\t\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00(\x00\x0f\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00 \x00\x04\x00\x00\x00\x00\x00\x00\x00\xfc\xff\xff\xffP\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x04\x01\x00\x04\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00\x0c\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00\x14\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00\x1c\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00$\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff8\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c\x00\x04\x00\x00\x00\x00\x00$\x00\x01\x00\x00\x00\x00\x008\x00\x01\x00\x00\x00\x00\x00H\x00\x01\x00\x00\x00\x00\x00\\\x00\x01\x00\x00\x00\x00\x00l\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfd\xff\xff\xff\x90\x00\x10\x00\x80\x00\x00\x00\x00\x00\x00\x00Written by SAS\x00\x00YEARyearY\x00\x00\x00level of output\x00W\x00\x00\x00wage rate\x00\x00\x00R\x00\x00\x00interest rate\x00\x00\x00L\x00\x00\x00labor input\x00K\x00\x00\x00capital 
input\x00\x00\x00\x01\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfc\xff\xff0\x00\x00\x00\x04\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x07\x00\x00\x00\x00\x00\x00\xfc\xff\xff\xff\x01\x00\x00\x00\x06\x00\x00\x00\x01\x00\x00\x00\x06\x00\x00\x00\xfd\xff\xff\xff\x01\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\xff\xff\xff\xff\x01\x00\x00\x00\x05\x00\x00\x00\x01\x00\x00\x00\x05\x00\x00\x00\xfe\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfb\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfa\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf9\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf6\xf6\xf6\xf6\x06\x00\x00\x00\x00\x00\x00\x00\xf7\xf7\xf7\xf7\xcd\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x110\x02\x00,\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00.\x00\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00 \x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00kIW-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x0e\x00\x00\x00\x01\x00\x00\x00-\x00\x00\x00\x01\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x0c\x00\x10\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x08\x00\x00\x00\x1c\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' + + +@pytest.fixture +def sas_data_set(filepath_sas, fs_args): + return GenericDataSet( + filepath=filepath_sas.as_posix(), + file_format="sas", + load_args={"format": "sas7bdat"}, + fs_args=fs_args, + ) + + +@pytest.fixture +def html_data_set(filepath_html, fs_args): + return 
GenericDataSet( + filepath=filepath_html.as_posix(), + file_format="html", + fs_args=fs_args, + save_args={"index": False}, + ) + + +@pytest.fixture +def sas_data_set_bad_config(filepath_sas, fs_args): + return GenericDataSet( + filepath=filepath_sas.as_posix(), + file_format="sas", + load_args={}, # SAS reader requires a type param + fs_args=fs_args, + ) + + +@pytest.fixture +def versioned_csv_data_set(filepath_csv, load_version, save_version): + return GenericDataSet( + filepath=filepath_csv.as_posix(), + file_format="csv", + version=Version(load_version, save_version), + save_args={"index": False}, + ) + + +@pytest.fixture +def csv_data_set(filepath_csv): + return GenericDataSet( + filepath=filepath_csv.as_posix(), + file_format="csv", + save_args={"index": False}, + ) + + +@pytest.fixture +def dummy_dataframe(): + return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + + +class TestGenericSasDataSet: + def test_load(self, sas_binary, sas_data_set, filepath_sas): + filepath_sas.write_bytes(sas_binary) + df = sas_data_set.load() + assert df.shape == (32, 6) + + def test_save_fail(self, sas_data_set, dummy_dataframe): + pattern = ( + "Unable to retrieve 'pandas.DataFrame.to_sas' method, please ensure that your " + "'file_format' parameter has been defined correctly as per the Pandas API " + "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html" + ) + with pytest.raises(DatasetError, match=pattern): + sas_data_set.save(dummy_dataframe) + # Pandas does not implement a SAS writer + + def test_bad_load(self, sas_data_set_bad_config, sas_binary, filepath_sas): + # SAS reader requires a format param e.g. sas7bdat + filepath_sas.write_bytes(sas_binary) + pattern = "you must specify a format string" + with pytest.raises(DatasetError, match=pattern): + sas_data_set_bad_config.load() + + @pytest.mark.parametrize( + "filepath,instance_type,credentials", + [ + ("s3://bucket/file.sas7bdat", S3FileSystem, {}), + ("file:///tmp/test.sas7bdat", LocalFileSystem, {}), + ("/tmp/test.sas7bdat", LocalFileSystem, {}), + ("gcs://bucket/file.sas7bdat", GCSFileSystem, {}), + ("https://example.com/file.sas7bdat", HTTPFileSystem, {}), + ( + "abfs://bucket/file.sas7bdat", + AzureBlobFileSystem, + {"account_name": "test", "account_key": "test"}, + ), + ], + ) + def test_protocol_usage(self, filepath, instance_type, credentials): + data_set = GenericDataSet( + filepath=filepath, file_format="sas", credentials=credentials + ) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.csv" + data_set = GenericDataSet(filepath=filepath, file_format="sas") + assert data_set._version_cache.currsize == 0 # no cache if unversioned + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + assert data_set._version_cache.currsize == 0 + + +class TestGenericCSVDataSetVersioned: + def test_version_str_repr(self, filepath_csv, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = filepath_csv.as_posix() + ds = GenericDataSet(filepath=filepath, file_format="csv") + ds_versioned = GenericDataSet( + filepath=filepath, + file_format="csv", + version=Version(load_version, save_version), + ) + assert filepath in str(ds) + assert filepath in 
str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "GenericDataSet" in str(ds_versioned) + assert "GenericDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + + def test_save_and_load(self, versioned_csv_data_set, dummy_dataframe): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_csv_data_set.save(dummy_dataframe) + reloaded_df = versioned_csv_data_set.load() + assert_frame_equal(dummy_dataframe, reloaded_df) + + def test_multiple_loads( + self, versioned_csv_data_set, dummy_dataframe, filepath_csv + ): + """Test that if a new version is created mid-run, by an + external system, it won't be loaded in the current run.""" + versioned_csv_data_set.save(dummy_dataframe) + versioned_csv_data_set.load() + v1 = versioned_csv_data_set.resolve_load_version() + + sleep(0.5) + # force-drop a newer version into the same location + v_new = generate_timestamp() + GenericDataSet( + filepath=filepath_csv.as_posix(), + file_format="csv", + version=Version(v_new, v_new), + ).save(dummy_dataframe) + + versioned_csv_data_set.load() + v2 = versioned_csv_data_set.resolve_load_version() + + assert v2 == v1 # v2 should not be v_new! + ds_new = GenericDataSet( + filepath=filepath_csv.as_posix(), + file_format="csv", + version=Version(None, None), + ) + assert ( + ds_new.resolve_load_version() == v_new + ) # new version is discoverable by a new instance + + def test_multiple_saves(self, dummy_dataframe, filepath_csv): + """Test multiple cycles of save followed by load for the same dataset""" + ds_versioned = GenericDataSet( + filepath=filepath_csv.as_posix(), + file_format="csv", + version=Version(None, None), + ) + + # first save + ds_versioned.save(dummy_dataframe) + first_save_version = ds_versioned.resolve_save_version() + first_load_version = ds_versioned.resolve_load_version() + assert first_load_version == first_save_version + + # second save + sleep(0.5) + ds_versioned.save(dummy_dataframe) + second_save_version = ds_versioned.resolve_save_version() + second_load_version = ds_versioned.resolve_load_version() + assert second_load_version == second_save_version + assert second_load_version > first_load_version + + # another dataset + ds_new = GenericDataSet( + filepath=filepath_csv.as_posix(), + file_format="csv", + version=Version(None, None), + ) + assert ds_new.resolve_load_version() == second_load_version + + def test_release_instance_cache(self, dummy_dataframe, filepath_csv): + """Test that cache invalidation does not affect other instances""" + ds_a = GenericDataSet( + filepath=filepath_csv.as_posix(), + file_format="csv", + version=Version(None, None), + ) + assert ds_a._version_cache.currsize == 0 + ds_a.save(dummy_dataframe) # create a version + assert ds_a._version_cache.currsize == 2 + + ds_b = GenericDataSet( + filepath=filepath_csv.as_posix(), + file_format="csv", + version=Version(None, None), + ) + assert ds_b._version_cache.currsize == 0 + ds_b.resolve_save_version() + assert ds_b._version_cache.currsize == 1 + ds_b.resolve_load_version() + assert ds_b._version_cache.currsize == 2 + + ds_a.release() + + # dataset A cache is cleared + assert ds_a._version_cache.currsize == 0 + + # dataset B cache is unaffected + assert ds_b._version_cache.currsize == 2 + + def test_no_versions(self, versioned_csv_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any 
versions for GenericDataSet\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_csv_data_set.load() + + def test_exists(self, versioned_csv_data_set, dummy_dataframe): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_csv_data_set.exists() + versioned_csv_data_set.save(dummy_dataframe) + assert versioned_csv_data_set.exists() + + def test_prevent_overwrite(self, versioned_csv_data_set, dummy_dataframe): + """Check the error when attempting to override the data set if the + corresponding Generic (csv) file for a given save version already exists.""" + versioned_csv_data_set.save(dummy_dataframe) + pattern = ( + r"Save path \'.+\' for GenericDataSet\(.+\) must " + r"not exist if versioning is enabled\." + ) + with pytest.raises(DatasetError, match=pattern): + versioned_csv_data_set.save(dummy_dataframe) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_csv_data_set, load_version, save_version, dummy_dataframe + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for GenericDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_csv_data_set.save(dummy_dataframe) + + def test_versioning_existing_dataset( + self, csv_data_set, versioned_csv_data_set, dummy_dataframe + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + csv_data_set.save(dummy_dataframe) + assert csv_data_set.exists() + assert csv_data_set._filepath == versioned_csv_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_csv_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_csv_data_set.save(dummy_dataframe) + + # Remove non-versioned dataset and try again + Path(csv_data_set._filepath.as_posix()).unlink() + versioned_csv_data_set.save(dummy_dataframe) + assert versioned_csv_data_set.exists() + + +class TestGenericHtmlDataSet: + def test_save_and_load(self, dummy_dataframe, html_data_set): + html_data_set.save(dummy_dataframe) + df = html_data_set.load() + assert_frame_equal(dummy_dataframe, df[0]) + + +class TestBadGenericDataSet: + def test_bad_file_format_argument(self): + ds = GenericDataSet(filepath="test.kedro", file_format="kedro") + + pattern = ( + "Unable to retrieve 'pandas.read_kedro' method, please ensure that your 'file_format' " + "parameter has been defined correctly as per the Pandas API " + "https://pandas.pydata.org/docs/reference/io.html" + ) + + with pytest.raises(DatasetError, match=pattern): + _ = ds.load() + + pattern2 = ( + "Unable to retrieve 'pandas.DataFrame.to_kedro' method, please ensure that your 'file_format' " + "parameter has been defined correctly as per the Pandas API " + "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html" + ) + with pytest.raises(DatasetError, match=pattern2): + ds.save(pd.DataFrame([1])) + + @pytest.mark.parametrize( + "file_format", + [ + "clipboard", + "sql_table", + "sql", + "numpy", + "records", + ], + ) + def test_generic_no_filepaths(self, file_format): + error = ( + "Cannot create a dataset of file_format " + f"'{file_format}' as it does not 
support a filepath target/source" + ) + + with pytest.raises(DatasetError, match=error): + _ = GenericDataSet( + filepath="/file/thing.file", file_format=file_format + ).load() + with pytest.raises(DatasetError, match=error): + GenericDataSet(filepath="/file/thing.file", file_format=file_format).save( + pd.DataFrame([1]) + ) diff --git a/tests/extras/datasets/pandas/test_hdf_dataset.py b/tests/extras/datasets/pandas/test_hdf_dataset.py index 2d7b178145..0580e510b4 100644 --- a/tests/extras/datasets/pandas/test_hdf_dataset.py +++ b/tests/extras/datasets/pandas/test_hdf_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- from pathlib import Path, PurePosixPath import pandas as pd @@ -37,7 +9,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.pandas import HDFDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version HDF_KEY = "data" @@ -116,7 +88,7 @@ def test_open_extra_args(self, hdf_data_set, fs_args): def test_load_missing_file(self, hdf_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set HDFDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): hdf_data_set.load() @pytest.mark.parametrize( @@ -161,7 +133,10 @@ def test_thread_lock_usage(self, hdf_data_set, dummy_dataframe, mocker): mocked_lock.assert_not_called() hdf_data_set.save(dummy_dataframe) - calls = [mocker.call.__enter__(), mocker.call.__exit__(None, None, None)] + calls = [ + mocker.call.__enter__(), # pylint: disable=unnecessary-dunder-call + mocker.call.__exit__(None, None, None), + ] mocked_lock.assert_has_calls(calls) mocked_lock.reset_mock() @@ -201,7 +176,7 @@ def test_save_and_load(self, versioned_hdf_data_set, dummy_dataframe): def test_no_versions(self, versioned_hdf_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for HDFDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_hdf_data_set.load() def test_exists(self, versioned_hdf_data_set, dummy_dataframe): @@ -215,10 +190,10 @@ def test_prevent_overwrite(self, versioned_hdf_data_set, dummy_dataframe): corresponding hdf file for a given save version already exists.""" versioned_hdf_data_set.save(dummy_dataframe) pattern = ( - r"Save path \`.+\` for HDFDataSet\(.+\) must " + r"Save path \'.+\' for HDFDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_hdf_data_set.save(dummy_dataframe) @pytest.mark.parametrize( @@ -233,16 +208,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for HDFDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for HDFDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_hdf_data_set.save(dummy_dataframe) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." 
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): HDFDataSet( filepath="https://example.com/file.h5", key=HDF_KEY, @@ -261,7 +236,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_hdf_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_hdf_data_set.save(dummy_dataframe) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/pandas/test_json_dataset.py b/tests/extras/datasets/pandas/test_json_dataset.py index 9727deacc8..fe5c7f8c42 100644 --- a/tests/extras/datasets/pandas/test_json_dataset.py +++ b/tests/extras/datasets/pandas/test_json_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - from pathlib import Path, PurePosixPath import pandas as pd @@ -38,7 +10,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.pandas import JSONDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version @@ -114,8 +86,8 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): records = [r for r in caplog.records if r.levelname == "WARNING"] expected_log_message = ( - f"Dropping `storage_options` for {filepath}, " - f"please specify them under `fs_args` or `credentials`." + f"Dropping 'storage_options' for {filepath}, " + f"please specify them under 'fs_args' or 'credentials'." 
) assert records[0].getMessage() == expected_log_message assert "storage_options" not in ds._save_args @@ -124,7 +96,7 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): def test_load_missing_file(self, json_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set JSONDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): json_data_set.load() @pytest.mark.parametrize( @@ -202,7 +174,7 @@ def test_save_and_load(self, versioned_json_data_set, dummy_dataframe): def test_no_versions(self, versioned_json_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for JSONDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_json_data_set.load() def test_exists(self, versioned_json_data_set, dummy_dataframe): @@ -216,10 +188,10 @@ def test_prevent_overwrite(self, versioned_json_data_set, dummy_dataframe): corresponding hdf file for a given save version already exists.""" versioned_json_data_set.save(dummy_dataframe) pattern = ( - r"Save path \`.+\` for JSONDataSet\(.+\) must " + r"Save path \'.+\' for JSONDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_json_data_set.save(dummy_dataframe) @pytest.mark.parametrize( @@ -234,16 +206,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for JSONDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for JSONDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_json_data_set.save(dummy_dataframe) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): JSONDataSet( filepath="https://example.com/file.json", version=Version(None, None) ) @@ -260,7 +232,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_json_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_json_data_set.save(dummy_dataframe) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/pandas/test_parquet_dataset.py b/tests/extras/datasets/pandas/test_parquet_dataset.py index 8e545131bc..5e415bd75b 100644 --- a/tests/extras/datasets/pandas/test_parquet_dataset.py +++ b/tests/extras/datasets/pandas/test_parquet_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - from pathlib import Path, PurePosixPath import pandas as pd @@ -39,7 +11,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.pandas import ParquetDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version FILENAME = "test.parquet" @@ -140,8 +112,8 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): records = [r for r in caplog.records if r.levelname == "WARNING"] expected_log_message = ( - f"Dropping `storage_options` for {filepath}, " - f"please specify them under `fs_args` or `credentials`." + f"Dropping 'storage_options' for {filepath}, " + f"please specify them under 'fs_args' or 'credentials'." ) assert records[0].getMessage() == expected_log_message assert "storage_options" not in ds._save_args @@ -150,7 +122,7 @@ def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): def test_load_missing_file(self, parquet_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set ParquetDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): parquet_data_set.load() @pytest.mark.parametrize( @@ -216,7 +188,7 @@ def test_write_to_dir(self, dummy_dataframe, tmp_path): data_set = ParquetDataSet(filepath=tmp_path.as_posix()) pattern = "Saving ParquetDataSet to a directory is not supported" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): data_set.save(dummy_dataframe) def test_read_from_non_local_dir(self, mocker): @@ -247,9 +219,9 @@ def test_arg_partition_cols(self, dummy_dataframe, tmp_path): filepath=(tmp_path / FILENAME).as_posix(), save_args={"partition_cols": ["col2"]}, ) - pattern = "does not support save argument `partition_cols`" + pattern = "does not support save argument 'partition_cols'" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): data_set.save(dummy_dataframe) @@ -286,7 +258,7 @@ def test_save_and_load(self, versioned_parquet_data_set, dummy_dataframe, mocker def test_no_versions(self, versioned_parquet_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for ParquetDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_parquet_data_set.load() def test_exists(self, versioned_parquet_data_set, dummy_dataframe, mocker): @@ -310,10 +282,10 @@ def 
test_prevent_overwrite( ) versioned_parquet_data_set.save(dummy_dataframe) pattern = ( - r"Save path \`.+\` for ParquetDataSet\(.+\) must " + r"Save path \'.+\' for ParquetDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_parquet_data_set.save(dummy_dataframe) @pytest.mark.parametrize( @@ -333,8 +305,8 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for ParquetDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for ParquetDataSet\(.+\)" ) mocker.patch( "pyarrow.fs._ensure_filesystem", @@ -344,9 +316,9 @@ def test_save_version_warning( versioned_parquet_data_set.save(dummy_dataframe) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): ParquetDataSet( filepath="https://example.com/test.parquet", version=Version(None, None) ) @@ -363,7 +335,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_parquet_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_parquet_data_set.save(dummy_dataframe) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/pandas/test_sql_dataset.py b/tests/extras/datasets/pandas/test_sql_dataset.py index e0644259ef..d80ee12090 100644 --- a/tests/extras/datasets/pandas/test_sql_dataset.py +++ b/tests/extras/datasets/pandas/test_sql_dataset.py @@ -1,191 +1,179 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- # pylint: disable=no-member - -from typing import Any +from pathlib import PosixPath +from unittest.mock import ANY import pandas as pd import pytest import sqlalchemy from kedro.extras.datasets.pandas import SQLQueryDataSet, SQLTableDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError TABLE_NAME = "table_a" CONNECTION = "sqlite:///kedro.db" SQL_QUERY = "SELECT * FROM table_a" +EXECUTION_OPTIONS = {"stream_results": True} FAKE_CONN_STR = "some_sql://scott:tiger@localhost/foo" ERROR_PREFIX = ( r"A module\/driver is missing when connecting to your SQL server\.(.|\n)*" ) +@pytest.fixture(autouse=True) +def cleanup_engines(): + yield + SQLTableDataSet.engines = {} + SQLQueryDataSet.engines = {} + + @pytest.fixture def dummy_dataframe(): return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) -@pytest.fixture(params=[dict()]) +@pytest.fixture +def sql_file(tmp_path: PosixPath): + file = tmp_path / "test.sql" + file.write_text(SQL_QUERY) + return file.as_posix() + + +@pytest.fixture(params=[{}]) def table_data_set(request): - kwargs = dict(table_name=TABLE_NAME, credentials=dict(con=CONNECTION)) + kwargs = {"table_name": TABLE_NAME, "credentials": {"con": CONNECTION}} kwargs.update(request.param) return SQLTableDataSet(**kwargs) -@pytest.fixture(params=[dict()]) +@pytest.fixture(params=[{}]) def query_data_set(request): - kwargs = dict(sql=SQL_QUERY, credentials=dict(con=CONNECTION)) + kwargs = {"sql": SQL_QUERY, "credentials": {"con": CONNECTION}} kwargs.update(request.param) return SQLQueryDataSet(**kwargs) -class TestSQLTableDataSetLoad: +@pytest.fixture(params=[{}]) +def query_file_data_set(request, sql_file): + kwargs = {"filepath": sql_file, "credentials": {"con": CONNECTION}} + kwargs.update(request.param) + return SQLQueryDataSet(**kwargs) + + +class TestSQLTableDataSet: + _unknown_conn = "mysql+unknown_module://scott:tiger@localhost/foo" + @staticmethod - def _assert_pd_called_once(): - pd.read_sql_table.assert_called_once_with(table_name=TABLE_NAME, con=CONNECTION) + def _assert_sqlalchemy_called_once(*args): + _callable = sqlalchemy.engine.Engine.table_names + if args: + _callable.assert_called_once_with(*args) + else: + assert _callable.call_count == 1 def test_empty_table_name(self): """Check the error when instantiating with an empty table""" - pattern = r"`table\_name` argument cannot be empty\." - with pytest.raises(DataSetError, match=pattern): - SQLTableDataSet(table_name="", credentials=dict(con=CONNECTION)) + pattern = r"'table\_name' argument cannot be empty\." + with pytest.raises(DatasetError, match=pattern): + SQLTableDataSet(table_name="", credentials={"con": CONNECTION}) def test_empty_connection(self): """Check the error when instantiating with an empty connection string""" pattern = ( - r"`con` argument cannot be empty\. " + r"'con' argument cannot be empty\. " r"Please provide a SQLAlchemy connection string\." 
) - with pytest.raises(DataSetError, match=pattern): - SQLTableDataSet(table_name=TABLE_NAME, credentials=dict(con="")) + with pytest.raises(DatasetError, match=pattern): + SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": ""}) - def test_load_sql_params(self, mocker, table_data_set): - """Test `load` method invocation""" - mocker.patch("pandas.read_sql_table") - table_data_set.load() - self._assert_pd_called_once() - - def test_load_driver_missing(self, mocker, table_data_set): + def test_driver_missing(self, mocker): """Check the error when the sql driver is missing""" mocker.patch( - "pandas.read_sql_table", + "kedro.extras.datasets.pandas.sql_dataset.create_engine", side_effect=ImportError("No module named 'mysqldb'"), ) - with pytest.raises(DataSetError, match=ERROR_PREFIX + "mysqlclient"): - table_data_set.load() - self._assert_pd_called_once() - - def test_invalid_module(self, mocker, table_data_set): - """Test that if an invalid module/driver is encountered by SQLAlchemy - then the error should contain the original error message""" - _err = ImportError("Invalid module some_module") - mocker.patch("pandas.read_sql_table", side_effect=_err) - pattern = ERROR_PREFIX + r"Invalid module some\_module" - with pytest.raises(DataSetError, match=pattern): - table_data_set.load() - self._assert_pd_called_once() + with pytest.raises(DatasetError, match=ERROR_PREFIX + "mysqlclient"): + SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": CONNECTION}) + + def test_unknown_sql(self): + """Check the error when unknown sql dialect is provided; + this means the error is raised on catalog creation, rather + than on load or save operation. + """ + pattern = r"The SQL dialect in your connection is not supported by SQLAlchemy" + with pytest.raises(DatasetError, match=pattern): + SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": FAKE_CONN_STR}) - def test_load_unknown_module(self, mocker, table_data_set): + def test_unknown_module(self, mocker): """Test that if an unknown module/driver is encountered by SQLAlchemy then the error should contain the original error message""" mocker.patch( - "pandas.read_sql_table", + "kedro.extras.datasets.pandas.sql_dataset.create_engine", side_effect=ImportError("No module named 'unknown_module'"), ) pattern = ERROR_PREFIX + r"No module named \'unknown\_module\'" - with pytest.raises(DataSetError, match=pattern): - table_data_set.load() + with pytest.raises(DatasetError, match=pattern): + SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": CONNECTION}) + + def test_str_representation_table(self, table_data_set): + """Test the data set instance string representation""" + str_repr = str(table_data_set) + assert ( + "SQLTableDataSet(load_args={}, save_args={'index': False}, " + f"table_name={TABLE_NAME})" in str_repr + ) + assert CONNECTION not in str(str_repr) + + def test_table_exists(self, mocker, table_data_set): + """Test `exists` method invocation""" + mocker.patch("sqlalchemy.engine.Engine.table_names") + assert not table_data_set.exists() + self._assert_sqlalchemy_called_once() @pytest.mark.parametrize( - "table_data_set", [{"credentials": dict(con=FAKE_CONN_STR)}], indirect=True + "table_data_set", [{"load_args": {"schema": "ingested"}}], indirect=True ) - def test_load_unknown_sql(self, table_data_set): - """Check the error when unknown sql dialect is provided""" - pattern = r"The SQL dialect in your connection is not supported by SQLAlchemy" - with pytest.raises(DataSetError, match=pattern): - table_data_set.load() - + def 
test_table_exists_schema(self, mocker, table_data_set): + """Test `exists` method invocation with DB schema provided""" + mocker.patch("sqlalchemy.engine.Engine.table_names") + assert not table_data_set.exists() + self._assert_sqlalchemy_called_once("ingested") -class TestSQLTableDataSetSave: - _unknown_conn = "mysql+unknown_module://scott:tiger@localhost/foo" + def test_table_exists_mocked(self, mocker, table_data_set): + """Test `exists` method invocation with mocked list of tables""" + mocker.patch("sqlalchemy.engine.Engine.table_names", return_value=[TABLE_NAME]) + assert table_data_set.exists() + self._assert_sqlalchemy_called_once() - @staticmethod - def _assert_to_sql_called_once(df: Any, index: bool = False): - df.to_sql.assert_called_once_with(name=TABLE_NAME, con=CONNECTION, index=index) + def test_load_sql_params(self, mocker, table_data_set): + """Test `load` method invocation""" + mocker.patch("pandas.read_sql_table") + table_data_set.load() + pd.read_sql_table.assert_called_once_with( + table_name=TABLE_NAME, con=table_data_set.engines[CONNECTION] + ) def test_save_default_index(self, mocker, table_data_set, dummy_dataframe): """Test `save` method invocation""" mocker.patch.object(dummy_dataframe, "to_sql") table_data_set.save(dummy_dataframe) - self._assert_to_sql_called_once(dummy_dataframe) + dummy_dataframe.to_sql.assert_called_once_with( + name=TABLE_NAME, con=table_data_set.engines[CONNECTION], index=False + ) @pytest.mark.parametrize( - "table_data_set", [{"save_args": dict(index=True)}], indirect=True + "table_data_set", [{"save_args": {"index": True}}], indirect=True ) def test_save_overwrite_index(self, mocker, table_data_set, dummy_dataframe): """Test writing DataFrame index as a column""" mocker.patch.object(dummy_dataframe, "to_sql") table_data_set.save(dummy_dataframe) - self._assert_to_sql_called_once(dummy_dataframe, True) - - def test_save_driver_missing(self, mocker, table_data_set, dummy_dataframe): - """Test that if an unknown module/driver is encountered by SQLAlchemy - then the error should contain the original error message""" - _err = ImportError("No module named 'mysqldb'") - mocker.patch.object(dummy_dataframe, "to_sql", side_effect=_err) - with pytest.raises(DataSetError, match=ERROR_PREFIX + "mysqlclient"): - table_data_set.save(dummy_dataframe) - - @pytest.mark.parametrize( - "table_data_set", [{"credentials": dict(con=FAKE_CONN_STR)}], indirect=True - ) - def test_save_unknown_sql(self, table_data_set, dummy_dataframe): - """Check the error when unknown sql dialect is provided""" - pattern = r"The SQL dialect in your connection is not supported by SQLAlchemy" - with pytest.raises(DataSetError, match=pattern): - table_data_set.save(dummy_dataframe) - - @pytest.mark.parametrize( - "table_data_set", [{"credentials": dict(con=_unknown_conn)}], indirect=True - ) - def test_save_unknown_module(self, mocker, table_data_set, dummy_dataframe): - """Test that if an unknown module/driver is encountered by SQLAlchemy - then the error should contain the original error message""" - _err = ImportError("No module named 'unknown_module'") - mocker.patch.object(dummy_dataframe, "to_sql", side_effect=_err) - pattern = r"No module named \'unknown_module\'" - with pytest.raises(DataSetError, match=pattern): - table_data_set.save(dummy_dataframe) + dummy_dataframe.to_sql.assert_called_once_with( + name=TABLE_NAME, con=table_data_set.engines[CONNECTION], index=True + ) @pytest.mark.parametrize( - "table_data_set", [{"save_args": dict(name="TABLE_B")}], indirect=True + 
"table_data_set", [{"save_args": {"name": "TABLE_B"}}], indirect=True ) def test_save_ignore_table_name_override( self, mocker, table_data_set, dummy_dataframe @@ -194,120 +182,244 @@ def test_save_ignore_table_name_override( effect""" mocker.patch.object(dummy_dataframe, "to_sql") table_data_set.save(dummy_dataframe) - self._assert_to_sql_called_once(dummy_dataframe) + dummy_dataframe.to_sql.assert_called_once_with( + name=TABLE_NAME, con=table_data_set.engines[CONNECTION], index=False + ) -class TestSQLTableDataSet: - @staticmethod - def _assert_sqlalchemy_called_once(*args): - _callable = sqlalchemy.engine.Engine.table_names - if args: - _callable.assert_called_once_with(*args) - else: - assert _callable.call_count == 1 +class TestSQLTableDataSetSingleConnection: + def test_single_connection(self, dummy_dataframe, mocker): + """Test to make sure multiple instances use the same connection object.""" + mocker.patch("pandas.read_sql_table") + dummy_to_sql = mocker.patch.object(dummy_dataframe, "to_sql") + kwargs = {"table_name": TABLE_NAME, "credentials": {"con": CONNECTION}} + + first = SQLTableDataSet(**kwargs) + unique_connection = first.engines[CONNECTION] + datasets = [SQLTableDataSet(**kwargs) for _ in range(10)] + + for ds in datasets: + ds.save(dummy_dataframe) + engine = ds.engines[CONNECTION] + assert engine is unique_connection + + expected_call = mocker.call(name=TABLE_NAME, con=unique_connection, index=False) + dummy_to_sql.assert_has_calls([expected_call] * 10) + + for ds in datasets: + ds.load() + engine = ds.engines[CONNECTION] + assert engine is unique_connection + + def test_create_connection_only_once(self, mocker): + """Test that two datasets that need to connect to the same db + (but different tables, for example) only create a connection once. + """ + mock_engine = mocker.patch( + "kedro.extras.datasets.pandas.sql_dataset.create_engine" + ) + first = SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": CONNECTION}) + assert len(first.engines) == 1 - def test_str_representation_table(self, table_data_set): - """Test the data set instance string representation""" - str_repr = str(table_data_set) - assert ( - "SQLTableDataSet(load_args={}, save_args={'index': False}, " - f"table_name={TABLE_NAME})" in str_repr + second = SQLTableDataSet( + table_name="other_table", credentials={"con": CONNECTION} ) - assert CONNECTION not in str(str_repr) + assert len(second.engines) == 1 + assert len(first.engines) == 1 - def test_table_exists(self, mocker, table_data_set): - """Test `exists` method invocation""" - mocker.patch("sqlalchemy.engine.Engine.table_names") - assert not table_data_set.exists() - self._assert_sqlalchemy_called_once() + mock_engine.assert_called_once_with(CONNECTION) - @pytest.mark.parametrize( - "table_data_set", [{"load_args": dict(schema="ingested")}], indirect=True - ) - def test_able_exists_schema(self, mocker, table_data_set): - """Test `exists` method invocation with DB schema provided""" - mocker.patch("sqlalchemy.engine.Engine.table_names") - assert not table_data_set.exists() - self._assert_sqlalchemy_called_once("ingested") + def test_multiple_connections(self, mocker): + """Test that two datasets that need to connect to different dbs + only create one connection per db. 
+ """ + mock_engine = mocker.patch( + "kedro.extras.datasets.pandas.sql_dataset.create_engine" + ) + first = SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": CONNECTION}) + assert len(first.engines) == 1 - def test_table_exists_mocked(self, mocker, table_data_set): - """Test `exists` method invocation with mocked list of tables""" - mocker.patch("sqlalchemy.engine.Engine.table_names", return_value=[TABLE_NAME]) - assert table_data_set.exists() - self._assert_sqlalchemy_called_once() + second_con = f"other_{CONNECTION}" + second = SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": second_con}) + assert len(second.engines) == 2 + assert len(first.engines) == 2 + expected_calls = [mocker.call(CONNECTION), mocker.call(second_con)] + assert mock_engine.call_args_list == expected_calls -class TestSQLQueryDataSet: - @staticmethod - def _assert_pd_called_once(): - _callable = pd.read_sql_query - _callable.assert_called_once_with(sql=SQL_QUERY, con=CONNECTION) +class TestSQLQueryDataSet: def test_empty_query_error(self): - """Check the error when instantiating with empty query""" - pattern = r"`sql` argument cannot be empty\. Please provide a sql query" - with pytest.raises(DataSetError, match=pattern): - SQLQueryDataSet(sql="", credentials=dict(con=CONNECTION)) + """Check the error when instantiating with empty query or file""" + pattern = ( + r"'sql' and 'filepath' arguments cannot both be empty\." + r"Please provide a sql query or path to a sql query file\." + ) + with pytest.raises(DatasetError, match=pattern): + SQLQueryDataSet(sql="", filepath="", credentials={"con": CONNECTION}) def test_empty_con_error(self): """Check the error when instantiating with empty connection string""" pattern = ( - r"`con` argument cannot be empty\. Please provide " + r"'con' argument cannot be empty\. 
Please provide " r"a SQLAlchemy connection string" ) - with pytest.raises(DataSetError, match=pattern): - SQLQueryDataSet(sql=SQL_QUERY, credentials=dict(con="")) + with pytest.raises(DatasetError, match=pattern): + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": ""}) - def test_load(self, mocker, query_data_set): + @pytest.mark.parametrize( + "query_data_set, has_execution_options", + [ + ({"execution_options": EXECUTION_OPTIONS}, True), + ({"execution_options": {}}, False), + ({}, False), + ], + indirect=["query_data_set"], + ) + def test_load(self, mocker, query_data_set, has_execution_options): """Test `load` method invocation""" mocker.patch("pandas.read_sql_query") query_data_set.load() - self._assert_pd_called_once() - def test_load_driver_missing(self, mocker, query_data_set): + # Check that data was loaded with the expected query, connection string and + # execution options: + pd.read_sql_query.assert_called_once_with(sql=SQL_QUERY, con=ANY) + con_arg = pd.read_sql_query.call_args_list[0][1]["con"] + assert str(con_arg.url) == CONNECTION + assert len(con_arg.get_execution_options()) == bool(has_execution_options) + if has_execution_options: + assert con_arg.get_execution_options() == EXECUTION_OPTIONS + + @pytest.mark.parametrize( + "query_file_data_set, has_execution_options", + [ + ({"execution_options": EXECUTION_OPTIONS}, True), + ({"execution_options": {}}, False), + ({}, False), + ], + indirect=["query_file_data_set"], + ) + def test_load_query_file(self, mocker, query_file_data_set, has_execution_options): + """Test `load` method with a query file""" + mocker.patch("pandas.read_sql_query") + query_file_data_set.load() + + # Check that data was loaded with the expected query, connection string and + # execution options: + pd.read_sql_query.assert_called_once_with(sql=SQL_QUERY, con=ANY) + con_arg = pd.read_sql_query.call_args_list[0][1]["con"] + assert str(con_arg.url) == CONNECTION + assert len(con_arg.get_execution_options()) == bool(has_execution_options) + if has_execution_options: + assert con_arg.get_execution_options() == EXECUTION_OPTIONS + + def test_load_driver_missing(self, mocker): """Test that if an unknown module/driver is encountered by SQLAlchemy then the error should contain the original error message""" _err = ImportError("No module named 'mysqldb'") - mocker.patch("pandas.read_sql_query", side_effect=_err) - with pytest.raises(DataSetError, match=ERROR_PREFIX + "mysqlclient"): - query_data_set.load() + mocker.patch( + "kedro.extras.datasets.pandas.sql_dataset.create_engine", side_effect=_err + ) + with pytest.raises(DatasetError, match=ERROR_PREFIX + "mysqlclient"): + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) - def test_invalid_module(self, mocker, query_data_set): + def test_invalid_module(self, mocker): """Test that if an unknown module/driver is encountered by SQLAlchemy then the error should contain the original error message""" _err = ImportError("Invalid module some_module") - mocker.patch("pandas.read_sql_query", side_effect=_err) + mocker.patch( + "kedro.extras.datasets.pandas.sql_dataset.create_engine", side_effect=_err + ) pattern = ERROR_PREFIX + r"Invalid module some\_module" - with pytest.raises(DataSetError, match=pattern): - query_data_set.load() + with pytest.raises(DatasetError, match=pattern): + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) - def test_load_unknown_module(self, mocker, query_data_set): + def test_load_unknown_module(self, mocker): """Test that if an unknown module/driver is 
encountered by SQLAlchemy then the error should contain the original error message""" _err = ImportError("No module named 'unknown_module'") - mocker.patch("pandas.read_sql_query", side_effect=_err) + mocker.patch( + "kedro.extras.datasets.pandas.sql_dataset.create_engine", side_effect=_err + ) pattern = ERROR_PREFIX + r"No module named \'unknown\_module\'" - with pytest.raises(DataSetError, match=pattern): - query_data_set.load() + with pytest.raises(DatasetError, match=pattern): + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) - @pytest.mark.parametrize( - "query_data_set", [{"credentials": dict(con=FAKE_CONN_STR)}], indirect=True - ) - def test_load_unknown_sql(self, query_data_set): + def test_load_unknown_sql(self): """Check the error when unknown SQL dialect is provided in the connection string""" pattern = r"The SQL dialect in your connection is not supported by SQLAlchemy" - with pytest.raises(DataSetError, match=pattern): - query_data_set.load() + with pytest.raises(DatasetError, match=pattern): + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": FAKE_CONN_STR}) def test_save_error(self, query_data_set, dummy_dataframe): """Check the error when trying to save to the data set""" - pattern = r"`save` is not supported on SQLQueryDataSet" - with pytest.raises(DataSetError, match=pattern): + pattern = r"'save' is not supported on SQLQueryDataSet" + with pytest.raises(DatasetError, match=pattern): query_data_set.save(dummy_dataframe) - def test_str_representation_sql(self, query_data_set): + def test_str_representation_sql(self, query_data_set, sql_file): """Test the data set instance string representation""" str_repr = str(query_data_set) - assert f"SQLQueryDataSet(load_args={{}}, sql={SQL_QUERY})" in str_repr + assert ( + "SQLQueryDataSet(execution_options={}, filepath=None, " + f"load_args={{}}, sql={SQL_QUERY})" in str_repr + ) + assert CONNECTION not in str_repr + assert sql_file not in str_repr + + def test_str_representation_filepath(self, query_file_data_set, sql_file): + """Test the data set instance string representation with filepath arg.""" + str_repr = str(query_file_data_set) + assert ( + f"SQLQueryDataSet(execution_options={{}}, filepath={str(sql_file)}, " + "load_args={}, sql=None)" in str_repr + ) assert CONNECTION not in str_repr + assert SQL_QUERY not in str_repr + + def test_sql_and_filepath_args(self, sql_file): + """Test that an error is raised when both `sql` and `filepath` args are given.""" + pattern = ( + r"'sql' and 'filepath' arguments cannot both be provided." + r"Please only provide one." + ) + with pytest.raises(DatasetError, match=pattern): + SQLQueryDataSet(sql=SQL_QUERY, filepath=sql_file) + + def test_create_connection_only_once(self, mocker): + """Test that two datasets that need to connect to the same db (but different + tables and execution options, for example) only create a connection once. 
+ """ + mock_engine = mocker.patch( + "kedro.extras.datasets.pandas.sql_dataset.create_engine" + ) + first = SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) + assert len(first.engines) == 1 + + # second engine has identical params to the first one + # => no new engine should be created + second = SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) + mock_engine.assert_called_once_with(CONNECTION) + assert second.engines == first.engines + assert len(first.engines) == 1 + + # third engine only differs by its query execution options + # => no new engine should be created + third = SQLQueryDataSet( + sql="a different query", + credentials={"con": CONNECTION}, + execution_options=EXECUTION_OPTIONS, + ) + assert mock_engine.call_count == 1 + assert third.engines == first.engines + assert len(first.engines) == 1 + + # fourth engine has a different connection string + # => a new engine has to be created + fourth = SQLQueryDataSet( + sql=SQL_QUERY, credentials={"con": "an other connection string"} + ) + assert mock_engine.call_count == 2 + assert fourth.engines == first.engines + assert len(first.engines) == 2 diff --git a/tests/extras/datasets/pandas/test_xml_dataset.py b/tests/extras/datasets/pandas/test_xml_dataset.py new file mode 100644 index 0000000000..9dc8f47dc1 --- /dev/null +++ b/tests/extras/datasets/pandas/test_xml_dataset.py @@ -0,0 +1,241 @@ +from pathlib import Path, PurePosixPath + +import pandas as pd +import pytest +from adlfs import AzureBlobFileSystem +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from pandas.testing import assert_frame_equal +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.pandas import XMLDataSet +from kedro.io import DatasetError +from kedro.io.core import PROTOCOL_DELIMITER, Version + + +@pytest.fixture +def filepath_xml(tmp_path): + return (tmp_path / "test.xml").as_posix() + + +@pytest.fixture +def xml_data_set(filepath_xml, load_args, save_args, fs_args): + return XMLDataSet( + filepath=filepath_xml, + load_args=load_args, + save_args=save_args, + fs_args=fs_args, + ) + + +@pytest.fixture +def versioned_xml_data_set(filepath_xml, load_version, save_version): + return XMLDataSet( + filepath=filepath_xml, version=Version(load_version, save_version) + ) + + +@pytest.fixture +def dummy_dataframe(): + return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + + +class TestXMLDataSet: + def test_save_and_load(self, xml_data_set, dummy_dataframe): + """Test saving and reloading the data set.""" + xml_data_set.save(dummy_dataframe) + reloaded = xml_data_set.load() + assert_frame_equal(dummy_dataframe, reloaded) + + def test_exists(self, xml_data_set, dummy_dataframe): + """Test `exists` method invocation for both existing and + nonexistent data set.""" + assert not xml_data_set.exists() + xml_data_set.save(dummy_dataframe) + assert xml_data_set.exists() + + @pytest.mark.parametrize( + "load_args", [{"k1": "v1", "index": "value"}], indirect=True + ) + def test_load_extra_params(self, xml_data_set, load_args): + """Test overriding the default load arguments.""" + for key, value in load_args.items(): + assert xml_data_set._load_args[key] == value + + @pytest.mark.parametrize( + "save_args", [{"k1": "v1", "index": "value"}], indirect=True + ) + def test_save_extra_params(self, xml_data_set, save_args): + """Test overriding the default save arguments.""" + for key, value in save_args.items(): + assert 
xml_data_set._save_args[key] == value + + @pytest.mark.parametrize( + "load_args,save_args", + [ + ({"storage_options": {"a": "b"}}, {}), + ({}, {"storage_options": {"a": "b"}}), + ({"storage_options": {"a": "b"}}, {"storage_options": {"x": "y"}}), + ], + ) + def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): + filepath = str(tmp_path / "test.csv") + + ds = XMLDataSet(filepath=filepath, load_args=load_args, save_args=save_args) + + records = [r for r in caplog.records if r.levelname == "WARNING"] + expected_log_message = ( + f"Dropping 'storage_options' for {filepath}, " + f"please specify them under 'fs_args' or 'credentials'." + ) + assert records[0].getMessage() == expected_log_message + assert "storage_options" not in ds._save_args + assert "storage_options" not in ds._load_args + + def test_load_missing_file(self, xml_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set XMLDataSet\(.*\)" + with pytest.raises(DatasetError, match=pattern): + xml_data_set.load() + + @pytest.mark.parametrize( + "filepath,instance_type,credentials,load_path", + [ + ("s3://bucket/file.xml", S3FileSystem, {}, "s3://bucket/file.xml"), + ("file:///tmp/test.xml", LocalFileSystem, {}, "/tmp/test.xml"), + ("/tmp/test.xml", LocalFileSystem, {}, "/tmp/test.xml"), + ("gcs://bucket/file.xml", GCSFileSystem, {}, "gcs://bucket/file.xml"), + ( + "https://example.com/file.xml", + HTTPFileSystem, + {}, + "https://example.com/file.xml", + ), + ( + "abfs://bucket/file.csv", + AzureBlobFileSystem, + {"account_name": "test", "account_key": "test"}, + "abfs://bucket/file.csv", + ), + ], + ) + def test_protocol_usage( + self, filepath, instance_type, credentials, load_path, mocker + ): + data_set = XMLDataSet(filepath=filepath, credentials=credentials) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + mock_pandas_call = mocker.patch("pandas.read_xml") + data_set.load() + assert mock_pandas_call.call_count == 1 + assert mock_pandas_call.call_args_list[0][0][0] == load_path + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.xml" + data_set = XMLDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestXMLDataSetVersioned: + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.xml" + ds = XMLDataSet(filepath=filepath) + ds_versioned = XMLDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "XMLDataSet" in str(ds_versioned) + assert "XMLDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + + def test_save_and_load(self, versioned_xml_data_set, dummy_dataframe): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_xml_data_set.save(dummy_dataframe) + reloaded_df = versioned_xml_data_set.load() + assert_frame_equal(dummy_dataframe, reloaded_df) + + def test_no_versions(self, 
versioned_xml_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for XMLDataSet\(.+\)" + with pytest.raises(DatasetError, match=pattern): + versioned_xml_data_set.load() + + def test_exists(self, versioned_xml_data_set, dummy_dataframe): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_xml_data_set.exists() + versioned_xml_data_set.save(dummy_dataframe) + assert versioned_xml_data_set.exists() + + def test_prevent_overwrite(self, versioned_xml_data_set, dummy_dataframe): + """Check the error when attempting to override the data set if the + corresponding hdf file for a given save version already exists.""" + versioned_xml_data_set.save(dummy_dataframe) + pattern = ( + r"Save path \'.+\' for XMLDataSet\(.+\) must " + r"not exist if versioning is enabled\." + ) + with pytest.raises(DatasetError, match=pattern): + versioned_xml_data_set.save(dummy_dataframe) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_xml_data_set, load_version, save_version, dummy_dataframe + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + rf"Save version '{save_version}' did not match " + rf"load version '{load_version}' for XMLDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_xml_data_set.save(dummy_dataframe) + + def test_http_filesystem_no_versioning(self): + pattern = "Versioning is not supported for HTTP protocols." + + with pytest.raises(DatasetError, match=pattern): + XMLDataSet( + filepath="https://example.com/file.xml", version=Version(None, None) + ) + + def test_versioning_existing_dataset( + self, xml_data_set, versioned_xml_data_set, dummy_dataframe + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + xml_data_set.save(dummy_dataframe) + assert xml_data_set.exists() + assert xml_data_set._filepath == versioned_xml_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_xml_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DatasetError, match=pattern): + versioned_xml_data_set.save(dummy_dataframe) + + # Remove non-versioned dataset and try again + Path(xml_data_set._filepath.as_posix()).unlink() + versioned_xml_data_set.save(dummy_dataframe) + assert versioned_xml_data_set.exists() diff --git a/tests/extras/datasets/pickle/test_pickle_dataset.py b/tests/extras/datasets/pickle/test_pickle_dataset.py index 04aac9d478..65f7495a06 100644 --- a/tests/extras/datasets/pickle/test_pickle_dataset.py +++ b/tests/extras/datasets/pickle/test_pickle_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - import pickle from pathlib import Path, PurePosixPath @@ -38,7 +10,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.pickle import PickleDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version @@ -81,6 +53,7 @@ class TestPickleDataSet: [ ("pickle", None, None), ("joblib", None, None), + ("dill", None, None), ("compress_pickle", {"compression": "lz4"}, {"compression": "lz4"}), ], indirect=True, @@ -126,7 +99,7 @@ def test_open_extra_args(self, pickle_data_set, fs_args): def test_load_missing_file(self, pickle_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set PickleDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): pickle_data_set.load() @pytest.mark.parametrize( @@ -155,30 +128,41 @@ def test_catalog_release(self, mocker): data_set.release() fs_mock.invalidate_cache.assert_called_once_with(filepath) - def test_unserializable_data(self, pickle_data_set, dummy_dataframe, mocker): + def test_unserialisable_data(self, pickle_data_set, dummy_dataframe, mocker): mocker.patch("pickle.dump", side_effect=pickle.PickleError) - pattern = r".+ was not serialized due to:.*" + pattern = r".+ was not serialised due to:.*" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): pickle_data_set.save(dummy_dataframe) - def test_invalid_backend(self): + def test_invalid_backend(self, mocker): pattern = ( - r"'backend' should be one of \['pickle', 'joblib', 'compress_pickle'\], " - r"got 'invalid'\." + r"Selected backend 'invalid' should satisfy the pickle interface. " + r"Missing one of 'load' and 'dump' on the backend." + ) + mocker.patch( + "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module", + return_value=object, ) with pytest.raises(ValueError, match=pattern): PickleDataSet(filepath="test.pkl", backend="invalid") - def test_no_joblib(self, mocker): - mocker.patch.object(PickleDataSet, "BACKENDS", {"joblib": None}) - with pytest.raises(ImportError): - PickleDataSet(filepath="test.pkl", backend="joblib") + def test_no_backend(self, mocker): + pattern = ( + r"Selected backend 'fake.backend.does.not.exist' could not be imported. " + r"Make sure it is installed and importable." 
+ ) + mocker.patch( + "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module", + side_effect=ImportError, + ) + with pytest.raises(ImportError, match=pattern): + PickleDataSet(filepath="test.pkl", backend="fake.backend.does.not.exist") - def test_no_compress_pickle(self, mocker): - mocker.patch.object(PickleDataSet, "BACKENDS", {"compress_pickle": None}) - with pytest.raises(ImportError): - PickleDataSet(filepath="test.pkl", backend="compress_pickle") + def test_copy(self, pickle_data_set): + pickle_data_set_copy = pickle_data_set._copy() + assert pickle_data_set_copy is not pickle_data_set + assert pickle_data_set_copy._describe() == pickle_data_set._describe() class TestPickleDataSetVersioned: @@ -213,7 +197,7 @@ def test_save_and_load(self, versioned_pickle_data_set, dummy_dataframe): def test_no_versions(self, versioned_pickle_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for PickleDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_pickle_data_set.load() def test_exists(self, versioned_pickle_data_set, dummy_dataframe): @@ -227,10 +211,10 @@ def test_prevent_overwrite(self, versioned_pickle_data_set, dummy_dataframe): corresponding Pickle file for a given save version already exists.""" versioned_pickle_data_set.save(dummy_dataframe) pattern = ( - r"Save path \`.+\` for PickleDataSet\(.+\) must " + r"Save path \'.+\' for PickleDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_pickle_data_set.save(dummy_dataframe) @pytest.mark.parametrize( @@ -245,16 +229,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for PickleDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for PickleDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_pickle_data_set.save(dummy_dataframe) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." 
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): PickleDataSet( filepath="https://example.com/file.pkl", version=Version(None, None) ) @@ -271,10 +255,15 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_pickle_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_pickle_data_set.save(dummy_dataframe) # Remove non-versioned dataset and try again Path(pickle_data_set._filepath.as_posix()).unlink() versioned_pickle_data_set.save(dummy_dataframe) assert versioned_pickle_data_set.exists() + + def test_copy(self, versioned_pickle_data_set): + pickle_data_set_copy = versioned_pickle_data_set._copy() + assert pickle_data_set_copy is not versioned_pickle_data_set + assert pickle_data_set_copy._describe() == versioned_pickle_data_set._describe() diff --git a/tests/extras/datasets/pillow/test_image_dataset.py b/tests/extras/datasets/pillow/test_image_dataset.py index daaf0af3f0..d3cb450989 100644 --- a/tests/extras/datasets/pillow/test_image_dataset.py +++ b/tests/extras/datasets/pillow/test_image_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- from pathlib import Path, PurePosixPath from time import sleep @@ -36,7 +8,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.pillow import ImageDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp @@ -110,7 +82,7 @@ def test_open_extra_args(self, image_dataset, fs_args): def test_load_missing_file(self, image_dataset): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set ImageDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): image_dataset.load() @pytest.mark.parametrize( @@ -192,7 +164,7 @@ def test_multiple_loads(self, versioned_image_dataset, image_object, filepath_pn def test_no_versions(self, versioned_image_dataset): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for ImageDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_image_dataset.load() def test_exists(self, versioned_image_dataset, image_object): @@ -206,10 +178,10 @@ def test_prevent_overwrite(self, versioned_image_dataset, image_object): corresponding image file for a given save version already exists.""" versioned_image_dataset.save(image_object) pattern = ( - r"Save path \`.+\` for ImageDataSet\(.+\) must " + r"Save path \'.+\' for ImageDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_image_dataset.save(image_object) @pytest.mark.parametrize( @@ -224,16 +196,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for ImageDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for ImageDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_image_dataset.save(image_object) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." 
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): ImageDataSet( filepath="https://example.com/file.png", version=Version(None, None) ) @@ -250,7 +222,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_image_dataset._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_image_dataset.save(image_object) # Remove non-versioned dataset and try again diff --git a/tests/extras/datasets/plotly/test_json_dataset.py b/tests/extras/datasets/plotly/test_json_dataset.py new file mode 100644 index 0000000000..552d34bb27 --- /dev/null +++ b/tests/extras/datasets/plotly/test_json_dataset.py @@ -0,0 +1,101 @@ +from pathlib import PurePosixPath + +import plotly.express as px +import pytest +from adlfs import AzureBlobFileSystem +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.plotly import JSONDataSet +from kedro.io import DatasetError +from kedro.io.core import PROTOCOL_DELIMITER + + +@pytest.fixture +def filepath_json(tmp_path): + return (tmp_path / "test.json").as_posix() + + +@pytest.fixture +def json_data_set(filepath_json, load_args, save_args, fs_args): + return JSONDataSet( + filepath=filepath_json, + load_args=load_args, + save_args=save_args, + fs_args=fs_args, + ) + + +@pytest.fixture +def dummy_plot(): + return px.scatter(x=[1, 2, 3], y=[1, 3, 2], title="Test") + + +class TestJSONDataSet: + def test_save_and_load(self, json_data_set, dummy_plot): + """Test saving and reloading the data set.""" + json_data_set.save(dummy_plot) + reloaded = json_data_set.load() + assert dummy_plot == reloaded + assert json_data_set._fs_open_args_load == {} + assert json_data_set._fs_open_args_save == {"mode": "w"} + + def test_exists(self, json_data_set, dummy_plot): + """Test `exists` method invocation for both existing and + nonexistent data set.""" + assert not json_data_set.exists() + json_data_set.save(dummy_plot) + assert json_data_set.exists() + + def test_load_missing_file(self, json_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set JSONDataSet\(.*\)" + with pytest.raises(DatasetError, match=pattern): + json_data_set.load() + + @pytest.mark.parametrize("save_args", [{"pretty": True}]) + def test_save_extra_params(self, json_data_set, save_args): + """Test overriding default save args""" + for k, v in save_args.items(): + assert json_data_set._save_args[k] == v + + @pytest.mark.parametrize( + "load_args", [{"output_type": "FigureWidget", "skip_invalid": True}] + ) + def test_load_extra_params(self, json_data_set, load_args): + """Test overriding default save args""" + for k, v in load_args.items(): + assert json_data_set._load_args[k] == v + + @pytest.mark.parametrize( + "filepath,instance_type,credentials", + [ + ("s3://bucket/file.json", S3FileSystem, {}), + ("file:///tmp/test.json", LocalFileSystem, {}), + ("/tmp/test.json", LocalFileSystem, {}), + ("gcs://bucket/file.json", GCSFileSystem, {}), + ("https://example.com/file.json", HTTPFileSystem, {}), + ( + "abfs://bucket/file.csv", + AzureBlobFileSystem, + {"account_name": "test", "account_key": "test"}, + ), + ], + ) + def test_protocol_usage(self, filepath, instance_type, credentials): + data_set = 
JSONDataSet(filepath=filepath, credentials=credentials) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.json" + data_set = JSONDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) diff --git a/tests/extras/datasets/plotly/test_plotly_dataset.py b/tests/extras/datasets/plotly/test_plotly_dataset.py index c93d0661d6..042a414905 100644 --- a/tests/extras/datasets/plotly/test_plotly_dataset.py +++ b/tests/extras/datasets/plotly/test_plotly_dataset.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
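# --- Illustrative sketch (not part of the patch above): a minimal round-trip
# for the new plotly.JSONDataSet covered by test_json_dataset.py. The filepath
# is hypothetical; save_args mirror the parametrised test, and load_args such
# as {"output_type": "FigureWidget"} are forwarded to plotly's JSON reader.
import plotly.express as px
from kedro.extras.datasets.plotly import JSONDataSet

fig = px.scatter(x=[1, 2, 3], y=[1, 3, 2], title="Test")
data_set = JSONDataSet(
    filepath="data/08_reporting/fig.json",  # hypothetical path
    save_args={"pretty": True},             # forwarded to plotly's JSON writer
)
data_set.save(fig)
reloaded = data_set.load()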
from pathlib import PurePosixPath import pandas as pd @@ -38,7 +11,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.plotly import PlotlyDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER @@ -58,17 +31,13 @@ def plotly_data_set(filepath_json, load_args, save_args, fs_args, plotly_args): ) -@pytest.fixture( - params=[ - { - "fig": {"orientation": "h", "x": "col1", "y": "col2"}, - "layout": {"title": "Test", "xaxis_title": "x", "yaxis_title": "y"}, - "type": "scatter", - } - ] -) -def plotly_args(request): - return request.param +@pytest.fixture +def plotly_args(): + return { + "fig": {"orientation": "h", "x": "col1", "y": "col2"}, + "layout": {"title": "Test", "xaxis_title": "x", "yaxis_title": "y"}, + "type": "scatter", + } @pytest.fixture @@ -95,7 +64,7 @@ def test_exists(self, plotly_data_set, dummy_dataframe): def test_load_missing_file(self, plotly_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set PlotlyDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): plotly_data_set.load() @pytest.mark.parametrize( @@ -135,5 +104,5 @@ def test_fail_if_invalid_plotly_args_provided(self): plotly_args = [] filepath = "test.json" data_set = PlotlyDataSet(filepath=filepath, plotly_args=plotly_args) - with pytest.raises(DataSetError): + with pytest.raises(DatasetError): data_set.save(dummy_dataframe) diff --git a/tests/extras/decorators/__init__.py b/tests/extras/datasets/redis/__init__.py similarity index 100% rename from tests/extras/decorators/__init__.py rename to tests/extras/datasets/redis/__init__.py diff --git a/tests/extras/datasets/redis/test_redis_dataset.py b/tests/extras/datasets/redis/test_redis_dataset.py new file mode 100644 index 0000000000..bd4c4da9fa --- /dev/null +++ b/tests/extras/datasets/redis/test_redis_dataset.py @@ -0,0 +1,165 @@ +"""Tests ``PickleDataSet``.""" + +import importlib +import pickle + +import numpy as np +import pandas as pd +import pytest +import redis +from pandas.testing import assert_frame_equal + +from kedro.extras.datasets.redis import PickleDataSet +from kedro.io import DatasetError + + +@pytest.fixture(params=["pickle"]) +def backend(request): + return request.param + + +@pytest.fixture(params=["key"]) +def key(request): + return request.param + + +@pytest.fixture +def redis_args(): + return { + "from_url_args": {"arg1": "1", "arg2": "2", "url": "redis://127.0.0.1:6379"} + } + + +@pytest.fixture +def dummy_object(): + """Test data for saving.""" + return pd.DataFrame(np.random.random((3, 3)), columns=["a", "b", "c"]) + + +@pytest.fixture +def serialised_dummy_object(backend, dummy_object, save_args): + """Serialise test data.""" + imported_backend = importlib.import_module(backend) + save_args = save_args or {} + return imported_backend.dumps(dummy_object, **save_args) + + +@pytest.fixture +def pickle_data_set(mocker, key, backend, load_args, save_args, redis_args): + mocker.patch( + "redis.StrictRedis.from_url", return_value=redis.Redis.from_url("redis://") + ) + return PickleDataSet( + key=key, + backend=backend, + load_args=load_args, + save_args=save_args, + redis_args=redis_args, + ) + + +class TestPickleDataSet: + @pytest.mark.parametrize( + "key,backend,load_args,save_args", + [ + ("a", "pickle", None, None), + (1, "dill", None, None), + ("key", "compress_pickle", {"compression": "lz4"}, {"compression": "lz4"}), + ], + 
indirect=True, + ) + def test_save_and_load( + self, + pickle_data_set, + mocker, + dummy_object, + serialised_dummy_object, + key, + ): + """Test saving and reloading the data set.""" + set_mocker = mocker.patch("redis.StrictRedis.set") + get_mocker = mocker.patch( + "redis.StrictRedis.get", return_value=serialised_dummy_object + ) + pickle_data_set.save(dummy_object) + mocker.patch("redis.StrictRedis.exists", return_value=True) + loaded_dummy_object = pickle_data_set.load() + set_mocker.assert_called_once_with( + key, + serialised_dummy_object, + ) + get_mocker.assert_called_once_with(key) + assert_frame_equal(loaded_dummy_object, dummy_object) + + def test_exists(self, mocker, pickle_data_set, dummy_object, key): + """Test `exists` method invocation for both existing and + nonexistent data set.""" + mocker.patch("redis.StrictRedis.exists", return_value=False) + assert not pickle_data_set.exists() + mocker.patch("redis.StrictRedis.set") + pickle_data_set.save(dummy_object) + exists_mocker = mocker.patch("redis.StrictRedis.exists", return_value=True) + assert pickle_data_set.exists() + exists_mocker.assert_called_once_with(key) + + def test_exists_raises_error(self, pickle_data_set): + """Check the error when trying to assert existence with no redis server.""" + pattern = r"The existence of key " + with pytest.raises(DatasetError, match=pattern): + pickle_data_set.exists() + + @pytest.mark.parametrize( + "load_args", [{"k1": "v1", "errors": "strict"}], indirect=True + ) + def test_load_extra_params(self, pickle_data_set, load_args): + """Test overriding the default load arguments.""" + for key, value in load_args.items(): + assert pickle_data_set._load_args[key] == value + + @pytest.mark.parametrize("save_args", [{"k1": "v1", "protocol": 2}], indirect=True) + def test_save_extra_params(self, pickle_data_set, save_args): + """Test overriding the default save arguments.""" + for key, value in save_args.items(): + assert pickle_data_set._save_args[key] == value + + def test_redis_extra_args(self, pickle_data_set, redis_args): + assert pickle_data_set._redis_from_url_args == redis_args["from_url_args"] + assert pickle_data_set._redis_set_args == {} # default unchanged + + def test_load_missing_key(self, mocker, pickle_data_set): + """Check the error when trying to load missing file.""" + pattern = r"The provided key " + mocker.patch("redis.StrictRedis.exists", return_value=False) + with pytest.raises(DatasetError, match=pattern): + pickle_data_set.load() + + def test_unserialisable_data(self, pickle_data_set, dummy_object, mocker): + mocker.patch("pickle.dumps", side_effect=pickle.PickleError) + pattern = r".+ was not serialised due to:.*" + + with pytest.raises(DatasetError, match=pattern): + pickle_data_set.save(dummy_object) + + def test_invalid_backend(self, mocker): + pattern = ( + r"Selected backend 'invalid' should satisfy the pickle interface. " + r"Missing one of 'loads' and 'dumps' on the backend." + ) + mocker.patch( + "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module", + return_value=object, + ) + with pytest.raises(ValueError, match=pattern): + PickleDataSet(key="key", backend="invalid") + + def test_no_backend(self, mocker): + pattern = ( + r"Selected backend 'fake.backend.does.not.exist' could not be imported. " + r"Make sure it is installed and importable." 
+ ) + mocker.patch( + "kedro.extras.datasets.pickle.pickle_dataset.importlib.import_module", + side_effect=ImportError, + ) + with pytest.raises(ImportError, match=pattern): + PickleDataSet("key", backend="fake.backend.does.not.exist") diff --git a/tests/extras/datasets/spark/__init__.py b/tests/extras/datasets/spark/__init__.py index 188f28003b..e69de29bb2 100644 --- a/tests/extras/datasets/spark/__init__.py +++ b/tests/extras/datasets/spark/__init__.py @@ -1,33 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Unit tests for spark related datasets.""" - -import logging - -logging.getLogger("py4j").setLevel(logging.WARN) diff --git a/tests/extras/datasets/spark/conftest.py b/tests/extras/datasets/spark/conftest.py index 2f0e72106e..3e3ae1c544 100644 --- a/tests/extras/datasets/spark/conftest.py +++ b/tests/extras/datasets/spark/conftest.py @@ -1,83 +1,41 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
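# --- Illustrative sketch (not part of the patch above): what "satisfy the
# pickle interface" means for the backend argument validated in both pickle
# dataset test modules above. Any importable module exposing loads() and
# dumps() (e.g. pickle, dill, compress_pickle) is accepted; it is resolved via
# importlib.import_module. Key and URL values below are hypothetical.
from kedro.extras.datasets.redis import PickleDataSet

data_set = PickleDataSet(
    key="my_key",
    backend="pickle",  # must provide loads() and dumps()
    redis_args={"from_url_args": {"url": "redis://127.0.0.1:6379"}},
)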
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """ This file contains the fixtures that are reusable by any tests within this directory. You don't need to import the fixtures as pytest will discover them automatically. More info here: https://docs.pytest.org/en/latest/fixture.html """ -import gc -from subprocess import Popen - import pytest +from delta import configure_spark_with_delta_pip +from filelock import FileLock try: - from pyspark import SparkContext from pyspark.sql import SparkSession except ImportError: # pragma: no cover pass # this is only for test discovery to succeed on Python 3.8, 3.9 -the_real_getOrCreate = None - - -class UseTheSparkSessionFixtureOrMock: # pylint: disable=too-few-public-methods - pass - -# prevent using spark without going through the spark_session fixture -@pytest.fixture(scope="session", autouse=True) -def replace_spark_default_getorcreate(): - global the_real_getOrCreate # pylint: disable=global-statement - the_real_getOrCreate = SparkSession.builder.getOrCreate - SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock - return the_real_getOrCreate - - -# clean up pyspark after the test module finishes -@pytest.fixture(scope="module") -def spark_session(): # SKIP_IF_NO_SPARK - SparkSession.builder.getOrCreate = the_real_getOrCreate - spark = SparkSession.builder.getOrCreate() +def _setup_spark_session(): + return configure_spark_with_delta_pip( + SparkSession.builder.appName("MyApp") + .master("local[*]") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) + ).getOrCreate() + + +@pytest.fixture(scope="module", autouse=True) +def spark_session(tmp_path_factory): + # When running these spark tests with pytest-xdist, we need to make sure + # that the spark session setup on each test process don't interfere with each other. + # Therefore, we block the process during the spark session setup. 
+ # Locking procedure comes from pytest-xdist's own recommendation: + # https://github.com/pytest-dev/pytest-xdist#making-session-scoped-fixtures-execute-only-once + root_tmp_dir = tmp_path_factory.getbasetemp().parent + lock = root_tmp_dir / "semaphore.lock" + with FileLock(lock): + spark = _setup_spark_session() yield spark spark.stop() - SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock - - # remove the cached JVM vars - SparkContext._jvm = None # pylint: disable=protected-access - SparkContext._gateway = None # pylint: disable=protected-access - - # py4j doesn't shutdown properly so kill the actual JVM process - for obj in gc.get_objects(): - try: - if isinstance(obj, Popen) and "pyspark" in obj.args[0]: - obj.terminate() - except ReferenceError: # pragma: no cover - # gc.get_objects may return dead weak proxy objects that will raise - # ReferenceError when you isinstance them - pass diff --git a/tests/extras/datasets/spark/test_deltatable_dataset.py b/tests/extras/datasets/spark/test_deltatable_dataset.py new file mode 100644 index 0000000000..00eb313f6a --- /dev/null +++ b/tests/extras/datasets/spark/test_deltatable_dataset.py @@ -0,0 +1,90 @@ +import pytest +from delta import DeltaTable +from pyspark.sql import SparkSession +from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from pyspark.sql.utils import AnalysisException + +from kedro.extras.datasets.spark import DeltaTableDataSet, SparkDataSet +from kedro.io import DataCatalog, DatasetError +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline +from kedro.runner import ParallelRunner + + +@pytest.fixture +def sample_spark_df(): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] + + return SparkSession.builder.getOrCreate().createDataFrame(data, schema) + + +class TestDeltaTableDataSet: + def test_load(self, tmp_path, sample_spark_df): + filepath = (tmp_path / "test_data").as_posix() + spark_delta_ds = SparkDataSet(filepath=filepath, file_format="delta") + spark_delta_ds.save(sample_spark_df) + loaded_with_spark = spark_delta_ds.load() + assert loaded_with_spark.exceptAll(sample_spark_df).count() == 0 + + delta_ds = DeltaTableDataSet(filepath=filepath) + delta_table = delta_ds.load() + + assert isinstance(delta_table, DeltaTable) + loaded_with_deltalake = delta_table.toDF() + assert loaded_with_deltalake.exceptAll(loaded_with_spark).count() == 0 + + def test_save(self, tmp_path, sample_spark_df): + filepath = (tmp_path / "test_data").as_posix() + delta_ds = DeltaTableDataSet(filepath=filepath) + assert not delta_ds.exists() + + pattern = "DeltaTableDataSet is a read only dataset type" + with pytest.raises(DatasetError, match=pattern): + delta_ds.save(sample_spark_df) + + # check that indeed nothing is written + assert not delta_ds.exists() + + def test_exists(self, tmp_path, sample_spark_df): + filepath = (tmp_path / "test_data").as_posix() + delta_ds = DeltaTableDataSet(filepath=filepath) + + assert not delta_ds.exists() + + spark_delta_ds = SparkDataSet(filepath=filepath, file_format="delta") + spark_delta_ds.save(sample_spark_df) + + assert delta_ds.exists() + + def test_exists_raises_error(self, mocker): + delta_ds = DeltaTableDataSet(filepath="") + mocker.patch.object( + delta_ds, "_get_spark", side_effect=AnalysisException("Other Exception", []) + ) + + with pytest.raises(DatasetError, 
match="Other Exception"): + delta_ds.exists() + + @pytest.mark.parametrize("is_async", [False, True]) + def test_parallel_runner(self, is_async): + """Test ParallelRunner with SparkDataSet fails.""" + + def no_output(x): + _ = x + 1 # pragma: no cover + + delta_ds = DeltaTableDataSet(filepath="") + catalog = DataCatalog(data_sets={"delta_in": delta_ds}) + pipeline = modular_pipeline([node(no_output, "delta_in", None)]) + pattern = ( + r"The following data sets cannot be used with " + r"multiprocessing: \['delta_in'\]" + ) + with pytest.raises(AttributeError, match=pattern): + ParallelRunner(is_async=is_async).run(pipeline, catalog) diff --git a/tests/extras/datasets/spark/test_memory_dataset.py b/tests/extras/datasets/spark/test_memory_dataset.py index 35780b759f..d678f42d63 100644 --- a/tests/extras/datasets/spark/test_memory_dataset.py +++ b/tests/extras/datasets/spark/test_memory_dataset.py @@ -1,37 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- import pytest from pyspark.sql import DataFrame as SparkDataFrame from pyspark.sql import SparkSession from pyspark.sql.functions import col, when -from kedro.io import MemoryDataSet +from kedro.io import MemoryDataset def _update_spark_df(data, idx, jdx, value): @@ -59,37 +31,37 @@ def spark_data_frame(spark_session): @pytest.fixture -def memory_data_set(spark_data_frame): - return MemoryDataSet(data=spark_data_frame) +def memory_dataset(spark_data_frame): + return MemoryDataset(data=spark_data_frame) -def test_load_modify_original_data(memory_data_set, spark_data_frame): +def test_load_modify_original_data(memory_dataset, spark_data_frame): """Check that the data set object is not updated when the original SparkDataFrame is changed.""" spark_data_frame = _update_spark_df(spark_data_frame, 1, 1, -5) - assert not _check_equals(memory_data_set.load(), spark_data_frame) + assert not _check_equals(memory_dataset.load(), spark_data_frame) def test_save_modify_original_data(spark_data_frame): """Check that the data set object is not updated when the original SparkDataFrame is changed.""" - memory_data_set = MemoryDataSet() - memory_data_set.save(spark_data_frame) + memory_dataset = MemoryDataset() + memory_dataset.save(spark_data_frame) spark_data_frame = _update_spark_df(spark_data_frame, 1, 1, "new value") - assert not _check_equals(memory_data_set.load(), spark_data_frame) + assert not _check_equals(memory_dataset.load(), spark_data_frame) -def test_load_returns_same_spark_object(memory_data_set, spark_data_frame): +def test_load_returns_same_spark_object(memory_dataset, spark_data_frame): """Test that consecutive loads point to the same object in case of a SparkDataFrame""" - loaded_data = memory_data_set.load() - reloaded_data = memory_data_set.load() + loaded_data = memory_dataset.load() + reloaded_data = memory_dataset.load() assert _check_equals(loaded_data, spark_data_frame) assert _check_equals(reloaded_data, spark_data_frame) assert loaded_data is reloaded_data -def test_str_representation(memory_data_set): +def test_str_representation(memory_dataset): """Test string representation of the data set""" - assert "MemoryDataSet(data=)" in str(memory_data_set) + assert "MemoryDataset(data=)" in str(memory_dataset) diff --git a/tests/extras/datasets/spark/test_spark_dataset.py b/tests/extras/datasets/spark/test_spark_dataset.py index b2073ca690..da979901ac 100644 --- a/tests/extras/datasets/spark/test_spark_dataset.py +++ b/tests/extras/datasets/spark/test_spark_dataset.py @@ -1,40 +1,21 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. 
The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - +import re import sys import tempfile from pathlib import Path, PurePosixPath +import boto3 import pandas as pd import pytest +from moto import mock_s3 from pyspark.sql import SparkSession from pyspark.sql.functions import col -from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from pyspark.sql.types import ( + FloatType, + IntegerType, + StringType, + StructField, + StructType, +) from pyspark.sql.utils import AnalysisException from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet @@ -45,14 +26,16 @@ _dbfs_glob, _get_dbutils, ) -from kedro.io import DataCatalog, DataSetError, Version +from kedro.io import DataCatalog, DatasetError, Version from kedro.io.core import generate_timestamp -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner, SequentialRunner FOLDER_NAME = "fake_folder" FILENAME = "test.parquet" BUCKET_NAME = "test_bucket" +SCHEMA_FILE_NAME = "schema.json" AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} HDFS_PREFIX = f"{FOLDER_NAME}/{FILENAME}" @@ -78,12 +61,6 @@ ] -@pytest.fixture(autouse=True) -def spark_session_autouse(spark_session): - # all the tests in this file require Spark - return spark_session - - @pytest.fixture def sample_pandas_df() -> pd.DataFrame: return pd.DataFrame( @@ -133,6 +110,17 @@ def sample_spark_df(): return SparkSession.builder.getOrCreate().createDataFrame(data, schema) +@pytest.fixture +def sample_spark_df_schema() -> StructType: + return StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("height", FloatType(), True), + ] + ) + + def identity(arg): return arg # pragma: no cover @@ -144,6 +132,31 @@ def spark_in(tmp_path, sample_spark_df): return spark_in +@pytest.fixture +def mocked_s3_bucket(): + """Create a bucket for testing using moto.""" + with mock_s3(): + conn = boto3.client( + "s3", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + conn.create_bucket(Bucket=BUCKET_NAME) + yield conn + + +@pytest.fixture +def mocked_s3_schema(tmp_path, mocked_s3_bucket, sample_spark_df_schema: StructType): + """Creates schema file and adds it to mocked S3 bucket.""" + temporary_path = tmp_path / SCHEMA_FILE_NAME + temporary_path.write_text(sample_spark_df_schema.json(), encoding="utf-8") + + mocked_s3_bucket.put_object( + Bucket=BUCKET_NAME, Key=SCHEMA_FILE_NAME, Body=temporary_path.read_bytes() + ) + return mocked_s3_bucket + + class FileInfo: def __init__(self, path): self.path = "dbfs:" + path @@ -192,6 +205,109 @@ def test_load_options_csv(self, tmp_path, sample_pandas_df): spark_df = spark_data_set.load() assert spark_df.filter(col("Name") == "Alex").count() == 1 + def test_load_options_schema_ddl_string( + self, tmp_path, sample_pandas_df, sample_spark_df_schema + ): + filepath = (tmp_path / "data").as_posix() + local_csv_data_set = CSVDataSet(filepath=filepath) + 
local_csv_data_set.save(sample_pandas_df) + spark_data_set = SparkDataSet( + filepath=filepath, + file_format="csv", + load_args={"header": True, "schema": "name STRING, age INT, height FLOAT"}, + ) + spark_df = spark_data_set.load() + assert spark_df.schema == sample_spark_df_schema + + def test_load_options_schema_obj( + self, tmp_path, sample_pandas_df, sample_spark_df_schema + ): + filepath = (tmp_path / "data").as_posix() + local_csv_data_set = CSVDataSet(filepath=filepath) + local_csv_data_set.save(sample_pandas_df) + + spark_data_set = SparkDataSet( + filepath=filepath, + file_format="csv", + load_args={"header": True, "schema": sample_spark_df_schema}, + ) + + spark_df = spark_data_set.load() + assert spark_df.schema == sample_spark_df_schema + + def test_load_options_schema_path( + self, tmp_path, sample_pandas_df, sample_spark_df_schema + ): + filepath = (tmp_path / "data").as_posix() + schemapath = (tmp_path / SCHEMA_FILE_NAME).as_posix() + local_csv_data_set = CSVDataSet(filepath=filepath) + local_csv_data_set.save(sample_pandas_df) + Path(schemapath).write_text(sample_spark_df_schema.json(), encoding="utf-8") + + spark_data_set = SparkDataSet( + filepath=filepath, + file_format="csv", + load_args={"header": True, "schema": {"filepath": schemapath}}, + ) + + spark_df = spark_data_set.load() + assert spark_df.schema == sample_spark_df_schema + + @pytest.mark.usefixtures("mocked_s3_schema") + def test_load_options_schema_path_with_credentials( + self, tmp_path, sample_pandas_df, sample_spark_df_schema + ): + filepath = (tmp_path / "data").as_posix() + local_csv_data_set = CSVDataSet(filepath=filepath) + local_csv_data_set.save(sample_pandas_df) + + spark_data_set = SparkDataSet( + filepath=filepath, + file_format="csv", + load_args={ + "header": True, + "schema": { + "filepath": f"s3://{BUCKET_NAME}/{SCHEMA_FILE_NAME}", + "credentials": AWS_CREDENTIALS, + }, + }, + ) + + spark_df = spark_data_set.load() + assert spark_df.schema == sample_spark_df_schema + + def test_load_options_invalid_schema_file(self, tmp_path): + filepath = (tmp_path / "data").as_posix() + schemapath = (tmp_path / SCHEMA_FILE_NAME).as_posix() + Path(schemapath).write_text("dummy", encoding="utf-8") + + pattern = ( + f"Contents of 'schema.filepath' ({schemapath}) are invalid. Please" + f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." + ) + + with pytest.raises(DatasetError, match=re.escape(pattern)): + SparkDataSet( + filepath=filepath, + file_format="csv", + load_args={"header": True, "schema": {"filepath": schemapath}}, + ) + + def test_load_options_invalid_schema(self, tmp_path): + filepath = (tmp_path / "data").as_posix() + + pattern = ( + "Schema load argument does not specify a 'filepath' attribute. Please" + "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." 
+ ) + + with pytest.raises(DatasetError, match=pattern): + SparkDataSet( + filepath=filepath, + file_format="csv", + load_args={"header": True, "schema": {}}, + ) + def test_save_options_csv(self, tmp_path, sample_spark_df): # To cross check the correct Spark save operation we save to # a single spark partition with csv format and retrieve it with Kedro @@ -220,9 +336,7 @@ def test_str_representation(self): with tempfile.NamedTemporaryFile() as temp_data_file: filepath = Path(temp_data_file.name).as_posix() spark_data_set = SparkDataSet( - filepath=filepath, - file_format="csv", - load_args={"header": True}, + filepath=filepath, file_format="csv", load_args={"header": True} ) assert "SparkDataSet" in str(spark_data_set) assert f"filepath={filepath}" in str(spark_data_set) @@ -233,7 +347,7 @@ def test_save_overwrite_fail(self, tmp_path, sample_spark_df): spark_data_set = SparkDataSet(filepath=filepath) spark_data_set.save(sample_spark_df) - with pytest.raises(DataSetError): + with pytest.raises(DatasetError): spark_data_set.save(sample_spark_df) def test_save_overwrite_mode(self, tmp_path, sample_spark_df): @@ -246,6 +360,20 @@ def test_save_overwrite_mode(self, tmp_path, sample_spark_df): spark_data_set.save(sample_spark_df) spark_data_set.save(sample_spark_df) + @pytest.mark.parametrize("mode", ["merge", "delete", "update"]) + def test_file_format_delta_and_unsupported_mode(self, tmp_path, mode): + filepath = (tmp_path / "test_data").as_posix() + pattern = ( + f"It is not possible to perform 'save()' for file format 'delta' " + f"with mode '{mode}' on 'SparkDataSet'. " + f"Please use 'spark.DeltaTableDataSet' instead." + ) + + with pytest.raises(DatasetError, match=re.escape(pattern)): + _ = SparkDataSet( + filepath=filepath, file_format="delta", save_args={"mode": mode} + ) + def test_save_partition(self, tmp_path, sample_spark_df): # To verify partitioning this test will partition the data by one # of the columns and then check whether partitioned column is added @@ -263,7 +391,7 @@ def test_save_partition(self, tmp_path, sample_spark_df): assert expected_path.exists() - @pytest.mark.parametrize("file_format", ["csv", "parquet"]) + @pytest.mark.parametrize("file_format", ["csv", "parquet", "delta"]) def test_exists(self, file_format, tmp_path, sample_spark_df): filepath = (tmp_path / "test_data").as_posix() spark_data_set = SparkDataSet(filepath=filepath, file_format=file_format) @@ -283,14 +411,14 @@ def test_exists_raises_error(self, mocker): side_effect=AnalysisException("Other Exception", []), ) - with pytest.raises(DataSetError, match="Other Exception"): + with pytest.raises(DatasetError, match="Other Exception"): spark_data_set.exists() @pytest.mark.parametrize("is_async", [False, True]) def test_parallel_runner(self, is_async, spark_in): """Test ParallelRunner with SparkDataSet fails.""" catalog = DataCatalog(data_sets={"spark_in": spark_in}) - pipeline = Pipeline([node(identity, "spark_in", "spark_out")]) + pipeline = modular_pipeline([node(identity, "spark_in", "spark_out")]) pattern = ( r"The following data sets cannot be used with " r"multiprocessing: \['spark_in'\]" @@ -320,7 +448,7 @@ def test_copy(self): class TestSparkDataSetVersionedLocal: def test_no_version(self, versioned_dataset_local): pattern = r"Did not find any versions for SparkDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_dataset_local.load() def test_load_latest(self, versioned_dataset_local, sample_spark_df): @@ -359,8 +487,8 @@ 
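# --- Illustrative sketch (not part of the patch above): the three schema
# formats accepted through load_args that the new SparkDataSet tests exercise.
# Paths, bucket name and credentials are hypothetical.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
from kedro.extras.datasets.spark import SparkDataSet

schema = StructType(
    [StructField("name", StringType(), True), StructField("age", IntegerType(), True)]
)

# 1. Spark DDL string
SparkDataSet(filepath="data/people.csv", file_format="csv",
             load_args={"header": True, "schema": "name STRING, age INT"})
# 2. StructType object
SparkDataSet(filepath="data/people.csv", file_format="csv",
             load_args={"header": True, "schema": schema})
# 3. JSON-serialised StructType read from a filesystem, optionally with credentials
SparkDataSet(filepath="data/people.csv", file_format="csv",
             load_args={"header": True,
                        "schema": {"filepath": "s3://my-bucket/schema.json",
                                   "credentials": {"key": "...", "secret": "..."}}})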
def test_save_version_warning(self, tmp_path, sample_spark_df): ) pattern = ( - r"Save version `{ev.save}` did not match load version " - r"`{ev.load}` for SparkDataSet\(.+\)".format(ev=exact_version) + r"Save version '{ev.save}' did not match load version " + r"'{ev.load}' for SparkDataSet\(.+\)".format(ev=exact_version) ) with pytest.warns(UserWarning, match=pattern): ds_local.save(sample_spark_df) @@ -375,10 +503,10 @@ def test_prevent_overwrite(self, tmp_path, version, sample_spark_df): versioned_local.save(sample_spark_df) pattern = ( - r"Save path `.+` for SparkDataSet\(.+\) must not exist " + r"Save path '.+' for SparkDataSet\(.+\) must not exist " r"if versioning is enabled" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_local.save(sample_spark_df) def test_versioning_existing_dataset( @@ -400,7 +528,7 @@ def test_versioning_existing_dataset( sys.platform.startswith("win"), reason="DBFS doesn't work on Windows" ) class TestSparkDataSetVersionedDBFS: - def test_load_latest( # pylint: disable=too-many-arguments + def test_load_latest( # noqa: too-many-arguments self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df ): mocked_glob = mocker.patch.object(versioned_dataset_dbfs, "_glob_function") @@ -427,7 +555,7 @@ def test_load_exact(self, tmp_path, sample_spark_df): assert reloaded.exceptAll(sample_spark_df).count() == 0 - def test_save( # pylint: disable=too-many-arguments + def test_save( # noqa: too-many-arguments self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df ): mocked_glob = mocker.patch.object(versioned_dataset_dbfs, "_glob_function") @@ -440,7 +568,7 @@ def test_save( # pylint: disable=too-many-arguments ) assert (tmp_path / FILENAME / version.save / FILENAME).exists() - def test_exists( # pylint: disable=too-many-arguments + def test_exists( # noqa: too-many-arguments self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df ): mocked_glob = mocker.patch.object(versioned_dataset_dbfs, "_glob_function") @@ -560,7 +688,7 @@ def test_dbfs_path_in_different_os(self, os_name, mocker): class TestSparkDataSetVersionedS3: def test_no_version(self, versioned_dataset_s3): pattern = r"Did not find any versions for SparkDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_dataset_s3.load() def test_load_latest(self, mocker, versioned_dataset_s3): @@ -588,7 +716,6 @@ def test_load_exact(self, mocker): ds_s3 = SparkDataSet( filepath=f"s3a://{BUCKET_NAME}/{FILENAME}", version=Version(ts, None), - credentials=AWS_CREDENTIALS, ) get_spark = mocker.patch.object(ds_s3, "_get_spark") @@ -623,8 +750,8 @@ def test_save_version_warning(self, mocker): mocked_spark_df = mocker.Mock() pattern = ( - r"Save version `{ev.save}` did not match load version " - r"`{ev.load}` for SparkDataSet\(.+\)".format(ev=exact_version) + r"Save version '{ev.save}' did not match load version " + r"'{ev.load}' for SparkDataSet\(.+\)".format(ev=exact_version) ) with pytest.warns(UserWarning, match=pattern): ds_s3.save(mocked_spark_df) @@ -640,18 +767,18 @@ def test_prevent_overwrite(self, mocker, versioned_dataset_s3): mocker.patch.object(versioned_dataset_s3, "_exists_function", return_value=True) pattern = ( - r"Save path `.+` for SparkDataSet\(.+\) must not exist " + r"Save path '.+' for SparkDataSet\(.+\) must not exist " r"if versioning is enabled" ) - with pytest.raises(DataSetError, match=pattern): + with 
pytest.raises(DatasetError, match=pattern): versioned_dataset_s3.save(mocked_spark_df) mocked_spark_df.write.save.assert_not_called() def test_s3n_warning(self, version): pattern = ( - "`s3n` filesystem has now been deprecated by Spark, " - "please consider switching to `s3a`" + "'s3n' filesystem has now been deprecated by Spark, " + "please consider switching to 's3a'" ) with pytest.warns(DeprecationWarning, match=pattern): SparkDataSet(filepath=f"s3n://{BUCKET_NAME}/{FILENAME}", version=version) @@ -677,7 +804,7 @@ def test_no_version(self, mocker, version): versioned_hdfs = SparkDataSet(filepath=f"hdfs://{HDFS_PREFIX}", version=version) pattern = r"Did not find any versions for SparkDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_hdfs.load() hdfs_walk.assert_called_once_with(HDFS_PREFIX) @@ -756,8 +883,8 @@ def test_save_version_warning(self, mocker): mocked_spark_df = mocker.Mock() pattern = ( - r"Save version `{ev.save}` did not match load version " - r"`{ev.load}` for SparkDataSet\(.+\)".format(ev=exact_version) + r"Save version '{ev.save}' did not match load version " + r"'{ev.load}' for SparkDataSet\(.+\)".format(ev=exact_version) ) with pytest.warns(UserWarning, match=pattern): @@ -780,10 +907,10 @@ def test_prevent_overwrite(self, mocker, version): mocked_spark_df = mocker.Mock() pattern = ( - r"Save path `.+` for SparkDataSet\(.+\) must not exist " + r"Save path '.+' for SparkDataSet\(.+\) must not exist " r"if versioning is enabled" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_hdfs.save(mocked_spark_df) hdfs_status.assert_called_once_with( @@ -795,7 +922,7 @@ def test_prevent_overwrite(self, mocker, version): def test_hdfs_warning(self, version): pattern = ( "HDFS filesystem support for versioned SparkDataSet is in beta " - "and uses `hdfs.client.InsecureClient`, please use with caution" + "and uses 'hdfs.client.InsecureClient', please use with caution" ) with pytest.warns(UserWarning, match=pattern): SparkDataSet(filepath=f"hdfs://{HDFS_PREFIX}", version=version) @@ -828,7 +955,7 @@ def data_catalog(tmp_path): class TestDataFlowSequentialRunner: def test_spark_load_save(self, is_async, data_catalog): """SparkDataSet(load) -> node -> Spark (save).""" - pipeline = Pipeline([node(identity, "spark_in", "spark_out")]) + pipeline = modular_pipeline([node(identity, "spark_in", "spark_out")]) SequentialRunner(is_async=is_async).run(pipeline, data_catalog) save_path = Path(data_catalog._data_sets["spark_out"]._filepath.as_posix()) @@ -837,15 +964,15 @@ def test_spark_load_save(self, is_async, data_catalog): def test_spark_pickle(self, is_async, data_catalog): """SparkDataSet(load) -> node -> PickleDataSet (save)""" - pipeline = Pipeline([node(identity, "spark_in", "pickle_ds")]) - pattern = ".* was not serialized due to.*" - with pytest.raises(DataSetError, match=pattern): + pipeline = modular_pipeline([node(identity, "spark_in", "pickle_ds")]) + pattern = ".* was not serialised due to.*" + with pytest.raises(DatasetError, match=pattern): SequentialRunner(is_async=is_async).run(pipeline, data_catalog) def test_spark_memory_spark(self, is_async, data_catalog): """SparkDataSet(load) -> node -> MemoryDataSet (save and then load) -> node -> SparkDataSet (save)""" - pipeline = Pipeline( + pipeline = modular_pipeline( [ node(identity, "spark_in", "memory_ds"), node(identity, "memory_ds", "spark_out"), diff --git 
a/tests/extras/datasets/spark/test_spark_hive_dataset.py b/tests/extras/datasets/spark/test_spark_hive_dataset.py index 7cd40135be..ba7fc734a6 100644 --- a/tests/extras/datasets/spark/test_spark_hive_dataset.py +++ b/tests/extras/datasets/spark/test_spark_hive_dataset.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. import gc import re from pathlib import Path @@ -37,16 +10,13 @@ from pyspark.sql.types import IntegerType, StringType, StructField, StructType from kedro.extras.datasets.spark import SparkHiveDataSet -from kedro.io import DataSetError -from tests.extras.datasets.spark.conftest import UseTheSparkSessionFixtureOrMock +from kedro.io import DatasetError TESTSPARKDIR = "test_spark_dir" -# clean up pyspark after the test module finishes @pytest.fixture(scope="module") -def spark_hive_session(replace_spark_default_getorcreate): - SparkSession.builder.getOrCreate = replace_spark_default_getorcreate +def spark_session(): try: with TemporaryDirectory(TESTSPARKDIR) as tmpdir: spark = ( @@ -58,9 +28,9 @@ def spark_hive_session(replace_spark_default_getorcreate): ) .config( "javax.jdo.option.ConnectionURL", - "jdbc:derby:;databaseName={metastore_db_path};create=true".format( - metastore_db_path=(Path(tmpdir) / "warehouse_db").absolute() - ), + f"jdbc:derby:;" + f"databaseName={(Path(tmpdir) / 'warehouse_db').absolute()};" + f"create=true", ) .enableHiveSupport() .getOrCreate() @@ -79,8 +49,6 @@ def spark_hive_session(replace_spark_default_getorcreate): # files are still used by Java process. 
pass - SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock - # remove the cached JVM vars SparkContext._jvm = None # pylint: disable=protected-access SparkContext._gateway = None # pylint: disable=protected-access @@ -97,7 +65,7 @@ def spark_hive_session(replace_spark_default_getorcreate): @pytest.fixture(scope="module", autouse=True) -def spark_test_databases(spark_hive_session): +def spark_test_databases(spark_session): """Setup spark test databases for all tests in this module.""" dataset = _generate_spark_df_one() dataset.createOrReplaceTempView("tmp") @@ -105,15 +73,15 @@ def spark_test_databases(spark_hive_session): # Setup the databases and test table before testing for database in databases: - spark_hive_session.sql(f"create database {database}") - spark_hive_session.sql("use default_1") - spark_hive_session.sql("create table table_1 as select * from tmp") + spark_session.sql(f"create database {database}") + spark_session.sql("use default_1") + spark_session.sql("create table table_1 as select * from tmp") - yield spark_hive_session + yield spark_session # Drop the databases after testing for database in databases: - spark_hive_session.sql(f"drop database {database} cascade") + spark_session.sql(f"drop database {database} cascade") def assert_df_equal(expected, result): @@ -181,8 +149,8 @@ def test_read_existing_table(self): ) assert_df_equal(_generate_spark_df_one(), dataset.load()) - def test_overwrite_empty_table(self, spark_hive_session): - spark_hive_session.sql( + def test_overwrite_empty_table(self, spark_session): + spark_session.sql( "create table default_1.test_overwrite_empty_table (name string, age integer)" ).take(1) dataset = SparkHiveDataSet( @@ -193,8 +161,8 @@ def test_overwrite_empty_table(self, spark_hive_session): dataset.save(_generate_spark_df_one()) assert_df_equal(dataset.load(), _generate_spark_df_one()) - def test_overwrite_not_empty_table(self, spark_hive_session): - spark_hive_session.sql( + def test_overwrite_not_empty_table(self, spark_session): + spark_session.sql( "create table default_1.test_overwrite_full_table (name string, age integer)" ).take(1) dataset = SparkHiveDataSet( @@ -206,8 +174,8 @@ def test_overwrite_not_empty_table(self, spark_hive_session): dataset.save(_generate_spark_df_one()) assert_df_equal(dataset.load(), _generate_spark_df_one()) - def test_insert_not_empty_table(self, spark_hive_session): - spark_hive_session.sql( + def test_insert_not_empty_table(self, spark_session): + spark_session.sql( "create table default_1.test_insert_not_empty_table (name string, age integer)" ).take(1) dataset = SparkHiveDataSet( @@ -224,12 +192,12 @@ def test_insert_not_empty_table(self, spark_hive_session): def test_upsert_config_err(self): # no pk provided should prompt config error with pytest.raises( - DataSetError, match="`table_pk` must be set to utilise `upsert` read mode" + DatasetError, match="'table_pk' must be set to utilise 'upsert' read mode" ): SparkHiveDataSet(database="default_1", table="table_1", write_mode="upsert") - def test_upsert_empty_table(self, spark_hive_session): - spark_hive_session.sql( + def test_upsert_empty_table(self, spark_session): + spark_session.sql( "create table default_1.test_upsert_empty_table (name string, age integer)" ).take(1) dataset = SparkHiveDataSet( @@ -243,8 +211,8 @@ def test_upsert_empty_table(self, spark_hive_session): dataset.load().sort("name"), _generate_spark_df_one().sort("name") ) - def test_upsert_not_empty_table(self, spark_hive_session): - spark_hive_session.sql( + def 
test_upsert_not_empty_table(self, spark_session): + spark_session.sql( "create table default_1.test_upsert_not_empty_table (name string, age integer)" ).take(1) dataset = SparkHiveDataSet( @@ -270,7 +238,7 @@ def test_invalid_pk_provided(self): table_pk=_test_columns, ) with pytest.raises( - DataSetError, + DatasetError, match=re.escape( f"Columns {str(_test_columns)} selected as primary key(s) " f"not found in table default_1.table_1", @@ -280,11 +248,11 @@ def test_invalid_pk_provided(self): def test_invalid_write_mode_provided(self): pattern = ( - "Invalid `write_mode` provided: not_a_write_mode. " - "`write_mode` must be one of: " + "Invalid 'write_mode' provided: not_a_write_mode. " + "'write_mode' must be one of: " "append, error, errorifexists, upsert, overwrite" ) - with pytest.raises(DataSetError, match=re.escape(pattern)): + with pytest.raises(DatasetError, match=re.escape(pattern)): SparkHiveDataSet( database="default_1", table="table_1", @@ -292,8 +260,8 @@ def test_invalid_write_mode_provided(self): table_pk=["name"], ) - def test_invalid_schema_insert(self, spark_hive_session): - spark_hive_session.sql( + def test_invalid_schema_insert(self, spark_session): + spark_session.sql( "create table default_1.test_invalid_schema_insert " "(name string, additional_column_on_hive integer)" ).take(1) @@ -303,7 +271,7 @@ def test_invalid_schema_insert(self, spark_hive_session): write_mode="append", ) with pytest.raises( - DataSetError, + DatasetError, match=r"Dataset does not match hive table schema\.\n" r"Present on insert only: \[\('age', 'int'\)\]\n" r"Present on schema only: \[\('additional_column_on_hive', 'int'\)\]", @@ -324,7 +292,7 @@ def test_read_from_non_existent_table(self): database="default_1", table="table_doesnt_exist", write_mode="append" ) with pytest.raises( - DataSetError, + DatasetError, match=r"Failed while loading data from data set " r"SparkHiveDataSet\(database=default_1, format=hive, " r"table=table_doesnt_exist, table_pk=\[\], write_mode=append\)\.\n" @@ -333,3 +301,14 @@ def test_read_from_non_existent_table(self): r"table_doesnt_exist\], \[\], false\n", ): dataset.load() + + def test_save_delta_format(self, mocker): + dataset = SparkHiveDataSet( + database="default_1", table="delta_table", save_args={"format": "delta"} + ) + mocked_save = mocker.patch("pyspark.sql.DataFrameWriter.saveAsTable") + dataset.save(_generate_spark_df_one()) + mocked_save.assert_called_with( + "default_1.delta_table", mode="errorifexists", format="delta" + ) + assert dataset._format == "delta" diff --git a/tests/extras/datasets/spark/test_spark_jdbc_dataset.py b/tests/extras/datasets/spark/test_spark_jdbc_dataset.py index 4088871a75..fa7af0f966 100644 --- a/tests/extras/datasets/spark/test_spark_jdbc_dataset.py +++ b/tests/extras/datasets/spark/test_spark_jdbc_dataset.py @@ -1,36 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
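# --- Illustrative sketch (not part of the patch above): the write_mode contract
# asserted in TestSparkHiveDataSet. Valid modes are append, error, errorifexists,
# upsert and overwrite, and 'upsert' additionally requires table_pk. Database
# and table names are hypothetical.
from kedro.extras.datasets.spark import SparkHiveDataSet

upsert_ds = SparkHiveDataSet(
    database="default_1",
    table="people",
    write_mode="upsert",
    table_pk=["name"],  # mandatory for upsert; columns must exist in the table
)
# A delta-backed Hive table, as covered by test_save_delta_format:
delta_hive_ds = SparkHiveDataSet(
    database="default_1", table="delta_table", save_args={"format": "delta"}
)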
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. from unittest import mock import pytest from kedro.extras.datasets.spark import SparkJDBCDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError @pytest.fixture @@ -64,19 +37,19 @@ def spark_jdbc_args_save_load(spark_jdbc_args): def test_missing_url(): error_message = ( - "`url` argument cannot be empty. Please provide a JDBC" - " URL of the form ``jdbc:subprotocol:subname``." + "'url' argument cannot be empty. Please provide a JDBC" + " URL of the form 'jdbc:subprotocol:subname'." ) - with pytest.raises(DataSetError, match=error_message): + with pytest.raises(DatasetError, match=error_message): SparkJDBCDataSet(url=None, table="dummy_table") def test_missing_table(): error_message = ( - "`table` argument cannot be empty. Please provide" + "'table' argument cannot be empty. Please provide" " the name of the table to load or save data to." ) - with pytest.raises(DataSetError, match=error_message): + with pytest.raises(DatasetError, match=error_message): SparkJDBCDataSet(url="dummy_url", table=None) @@ -109,8 +82,8 @@ def test_save_args(spark_jdbc_args_save_load): def test_except_bad_credentials(spark_jdbc_args_credentials_with_none_password): - pattern = r"Credential property `password` cannot be None(.+)" - with pytest.raises(DataSetError, match=pattern): + pattern = r"Credential property 'password' cannot be None(.+)" + with pytest.raises(DatasetError, match=pattern): mock_save(spark_jdbc_args_credentials_with_none_password) diff --git a/tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py b/tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py index 55e261d5ba..69c5c46149 100644 --- a/tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py +++ b/tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py @@ -1,32 +1,4 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. # pylint: disable=import-outside-toplevel - from pathlib import PurePosixPath import numpy as np @@ -36,7 +8,7 @@ from gcsfs import GCSFileSystem from s3fs import S3FileSystem -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version @@ -122,6 +94,24 @@ def dummy_tf_base_model(dummy_x_train, dummy_y_train, tf): return model +@pytest.fixture +def dummy_tf_base_model_new(dummy_x_train, dummy_y_train, tf): + # dummy 2 layer model + inputs = tf.keras.Input(shape=(2, 1)) + x = tf.keras.layers.Dense(1)(inputs) + x = tf.keras.layers.Dense(1)(x) + outputs = tf.keras.layers.Dense(1)(x) + + model = tf.keras.Model(inputs=inputs, outputs=outputs, name="2_layer_dummy") + model.compile("rmsprop", "mse") + model.fit(dummy_x_train, dummy_y_train, batch_size=64, epochs=1) + # from https://www.tensorflow.org/guide/keras/save_and_serialize + # Reset metrics before saving so that loaded model has same state, + # since metric states are not preserved by Model.save_weights + model.reset_metrics() + return model + + @pytest.fixture def dummy_tf_subclassed_model(dummy_x_train, dummy_y_train, tf): """Demonstrate that own class models cannot be saved @@ -165,7 +155,7 @@ def test_load_missing_model(self, tf_model_dataset): pattern = ( r"Failed while loading data from data set TensorFlowModelDataset\(.*\)" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): tf_model_dataset.load() def test_exists(self, tf_model_dataset, dummy_tf_base_model): @@ -222,7 +212,7 @@ def test_unused_subclass_model_hdf5_save_format( r"saving to the Tensorflow SavedModel format \(by setting save_format=\"tf\"\) " r"or using `save_weights`." 
) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): hdf5_data_set.save(dummy_tf_subclassed_model) @pytest.mark.parametrize( @@ -271,9 +261,22 @@ def test_fs_args(self, fs_args, mocker, tensorflow_model_dataset): def test_exists_with_exception(self, tf_model_dataset, mocker): """Test `exists` method invocation when `get_filepath_str` raises an exception.""" - mocker.patch("kedro.io.core.get_filepath_str", side_effct=DataSetError) + mocker.patch("kedro.io.core.get_filepath_str", side_effect=DatasetError) assert not tf_model_dataset.exists() + def test_save_and_overwrite_existing_model( + self, tf_model_dataset, dummy_tf_base_model, dummy_tf_base_model_new + ): + """Test models are correctly overwritten.""" + tf_model_dataset.save(dummy_tf_base_model) + + tf_model_dataset.save(dummy_tf_base_model_new) + + reloaded = tf_model_dataset.load() + + assert len(dummy_tf_base_model.layers) != len(reloaded.layers) + assert len(dummy_tf_base_model_new.layers) == len(reloaded.layers) + class TestTensorFlowModelDatasetVersioned: """Test suite with versioning argument passed into TensorFlowModelDataset creator""" @@ -338,10 +341,10 @@ def test_prevent_overwrite(self, dummy_tf_base_model, versioned_tf_model_dataset corresponding file for a given save version already exists.""" versioned_tf_model_dataset.save(dummy_tf_base_model) pattern = ( - r"Save path \`.+\` for TensorFlowModelDataset\(.+\) must " + r"Save path \'.+\' for TensorFlowModelDataset\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_tf_model_dataset.save(dummy_tf_base_model) @pytest.mark.parametrize( @@ -359,16 +362,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - rf"Save version `{save_version}` did not match load version `{load_version}` " + rf"Save version '{save_version}' did not match load version '{load_version}' " rf"for TensorFlowModelDataset\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_tf_model_dataset.save(dummy_tf_base_model) def test_http_filesystem_no_versioning(self, tensorflow_model_dataset): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols."
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): tensorflow_model_dataset( filepath="https://example.com/file.tf", version=Version(None, None) ) @@ -382,7 +385,7 @@ def test_exists(self, versioned_tf_model_dataset, dummy_tf_base_model): def test_no_versions(self, versioned_tf_model_dataset): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for TensorFlowModelDataset\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_tf_model_dataset.load() def test_version_str_repr(self, tf_model_dataset, versioned_tf_model_dataset): @@ -413,3 +416,26 @@ def test_versioning_existing_dataset( assert tf_model_dataset._filepath == versioned_tf_model_dataset._filepath versioned_tf_model_dataset.save(dummy_tf_base_model) assert versioned_tf_model_dataset.exists() + + def test_save_and_load_with_device( + self, + dummy_tf_base_model, + dummy_x_test, + filepath, + tensorflow_model_dataset, + load_version, + save_version, + ): + """Test versioned TensorflowModelDataset can load models using an explicit tf_device""" + hdf5_dataset = tensorflow_model_dataset( + filepath=filepath, + load_args={"tf_device": "/CPU:0"}, + version=Version(load_version, save_version), + ) + + predictions = dummy_tf_base_model.predict(dummy_x_test) + hdf5_dataset.save(dummy_tf_base_model) + + reloaded = hdf5_dataset.load() + new_predictions = reloaded.predict(dummy_x_test) + np.testing.assert_allclose(predictions, new_predictions, rtol=1e-6, atol=1e-6) diff --git a/tests/extras/datasets/text/test_text_dataset.py b/tests/extras/datasets/text/test_text_dataset.py index 3697260c35..1cb866988d 100644 --- a/tests/extras/datasets/text/test_text_dataset.py +++ b/tests/extras/datasets/text/test_text_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- from pathlib import Path, PurePosixPath import pytest @@ -35,7 +7,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.text import TextDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version STRING = "Write to text file." @@ -86,7 +58,7 @@ def test_open_extra_args(self, txt_data_set, fs_args): def test_load_missing_file(self, txt_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set TextDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): txt_data_set.load() @pytest.mark.parametrize( @@ -146,7 +118,7 @@ def test_save_and_load(self, versioned_txt_data_set): def test_no_versions(self, versioned_txt_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for TextDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_txt_data_set.load() def test_exists(self, versioned_txt_data_set): @@ -160,10 +132,10 @@ def test_prevent_overwrite(self, versioned_txt_data_set): corresponding text file for a given save version already exists.""" versioned_txt_data_set.save(STRING) pattern = ( - r"Save path \`.+\` for TextDataSet\(.+\) must " + r"Save path \'.+\' for TextDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_txt_data_set.save(STRING) @pytest.mark.parametrize( @@ -178,16 +150,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for TextDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for TextDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_txt_data_set.save(STRING) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." 
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): TextDataSet( filepath="https://example.com/file.txt", version=Version(None, None) ) @@ -206,7 +178,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_txt_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_txt_data_set.save(STRING) # Remove non-versioned dataset and try again diff --git a/tests/extras/extensions/__init__.py b/tests/extras/datasets/tracking/__init__.py similarity index 100% rename from tests/extras/extensions/__init__.py rename to tests/extras/datasets/tracking/__init__.py diff --git a/tests/extras/datasets/tracking/test_json_dataset.py b/tests/extras/datasets/tracking/test_json_dataset.py new file mode 100644 index 0000000000..9e0c046558 --- /dev/null +++ b/tests/extras/datasets/tracking/test_json_dataset.py @@ -0,0 +1,185 @@ +import json +from pathlib import Path, PurePosixPath + +import pytest +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.tracking import JSONDataSet +from kedro.io import DatasetError +from kedro.io.core import PROTOCOL_DELIMITER, Version + + +@pytest.fixture +def filepath_json(tmp_path): + return (tmp_path / "test.json").as_posix() + + +@pytest.fixture +def json_dataset(filepath_json, save_args, fs_args): + return JSONDataSet(filepath=filepath_json, save_args=save_args, fs_args=fs_args) + + +@pytest.fixture +def explicit_versioned_json_dataset(filepath_json, load_version, save_version): + return JSONDataSet( + filepath=filepath_json, version=Version(load_version, save_version) + ) + + +@pytest.fixture +def dummy_data(): + return {"col1": 1, "col2": 2, "col3": "mystring"} + + +class TestJSONDataSet: + def test_save(self, filepath_json, dummy_data, tmp_path, save_version): + """Test saving and reloading the data set.""" + json_dataset = JSONDataSet( + filepath=filepath_json, version=Version(None, save_version) + ) + json_dataset.save(dummy_data) + + actual_filepath = Path(json_dataset._filepath.as_posix()) + test_filepath = tmp_path / "locally_saved.json" + + test_filepath.parent.mkdir(parents=True, exist_ok=True) + with open(test_filepath, "w", encoding="utf-8") as file: + json.dump(dummy_data, file) + + with open(test_filepath, encoding="utf-8") as file: + test_data = json.load(file) + + with open( + (actual_filepath / save_version / "test.json"), encoding="utf-8" + ) as actual_file: + actual_data = json.load(actual_file) + + assert actual_data == test_data + assert json_dataset._fs_open_args_load == {} + assert json_dataset._fs_open_args_save == {"mode": "w"} + + def test_load_fail(self, json_dataset, dummy_data): + json_dataset.save(dummy_data) + pattern = r"Loading not supported for 'JSONDataSet'" + with pytest.raises(DatasetError, match=pattern): + json_dataset.load() + + def test_exists(self, json_dataset, dummy_data): + """Test `exists` method invocation for both existing and + nonexistent data set.""" + assert not json_dataset.exists() + json_dataset.save(dummy_data) + assert json_dataset.exists() + + @pytest.mark.parametrize( + "save_args", [{"k1": "v1", "index": "value"}], indirect=True + ) + def test_save_extra_params(self, json_dataset, save_args): + """Test overriding the default save arguments.""" + for key, value in save_args.items(): + assert 
json_dataset._save_args[key] == value + + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], + indirect=True, + ) + def test_open_extra_args(self, json_dataset, fs_args): + assert json_dataset._fs_open_args_load == fs_args["open_args_load"] + assert json_dataset._fs_open_args_save == {"mode": "w"} # default unchanged + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/file.json", S3FileSystem), + ("file:///tmp/test.json", LocalFileSystem), + ("/tmp/test.json", LocalFileSystem), + ("gcs://bucket/file.json", GCSFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + data_set = JSONDataSet(filepath=filepath) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.json" + data_set = JSONDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + def test_not_version_str_repr(self): + """Test that version is not in string representation of the class instance.""" + filepath = "test.json" + ds = JSONDataSet(filepath=filepath) + + assert filepath in str(ds) + assert "version" not in str(ds) + assert "JSONDataSet" in str(ds) + assert "protocol" in str(ds) + # Default save_args + assert "save_args={'indent': 2}" in str(ds) + + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance.""" + filepath = "test.json" + ds_versioned = JSONDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "JSONDataSet" in str(ds_versioned) + assert "protocol" in str(ds_versioned) + # Default save_args + assert "save_args={'indent': 2}" in str(ds_versioned) + + def test_prevent_overwrite(self, explicit_versioned_json_dataset, dummy_data): + """Check the error when attempting to override the data set if the + corresponding json file for a given save version already exists.""" + explicit_versioned_json_dataset.save(dummy_data) + pattern = ( + r"Save path \'.+\' for JSONDataSet\(.+\) must " + r"not exist if versioning is enabled\." + ) + with pytest.raises(DatasetError, match=pattern): + explicit_versioned_json_dataset.save(dummy_data) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, + explicit_versioned_json_dataset, + load_version, + save_version, + dummy_data, + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + f"Save version '{save_version}' did not match " + f"load version '{load_version}' for " + r"JSONDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + explicit_versioned_json_dataset.save(dummy_data) + + def test_http_filesystem_no_versioning(self): + pattern = "Versioning is not supported for HTTP protocols." 
+ + with pytest.raises(DatasetError, match=pattern): + JSONDataSet( + filepath="https://example.com/file.json", version=Version(None, None) + ) diff --git a/tests/extras/datasets/tracking/test_metrics_dataset.py b/tests/extras/datasets/tracking/test_metrics_dataset.py new file mode 100644 index 0000000000..d65b50215d --- /dev/null +++ b/tests/extras/datasets/tracking/test_metrics_dataset.py @@ -0,0 +1,194 @@ +import json +from pathlib import Path, PurePosixPath + +import pytest +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.tracking import MetricsDataSet +from kedro.io import DatasetError +from kedro.io.core import PROTOCOL_DELIMITER, Version + + +@pytest.fixture +def filepath_json(tmp_path): + return (tmp_path / "test.json").as_posix() + + +@pytest.fixture +def metrics_dataset(filepath_json, save_args, fs_args): + return MetricsDataSet(filepath=filepath_json, save_args=save_args, fs_args=fs_args) + + +@pytest.fixture +def explicit_versioned_metrics_dataset(filepath_json, load_version, save_version): + return MetricsDataSet( + filepath=filepath_json, version=Version(load_version, save_version) + ) + + +@pytest.fixture +def dummy_data(): + return {"col1": 1, "col2": 2, "col3": 3} + + +class TestMetricsDataSet: + def test_save_data( + self, + dummy_data, + tmp_path, + filepath_json, + save_version, + ): + """Test saving and reloading the data set.""" + metrics_dataset = MetricsDataSet( + filepath=filepath_json, version=Version(None, save_version) + ) + metrics_dataset.save(dummy_data) + + actual_filepath = Path(metrics_dataset._filepath.as_posix()) + test_filepath = tmp_path / "locally_saved.json" + + test_filepath.parent.mkdir(parents=True, exist_ok=True) + with open(test_filepath, "w", encoding="utf-8") as file: + json.dump(dummy_data, file) + + with open(test_filepath, encoding="utf-8") as file: + test_data = json.load(file) + + with open( + (actual_filepath / save_version / "test.json"), encoding="utf-8" + ) as actual_file: + actual_data = json.load(actual_file) + + assert actual_data == test_data + assert metrics_dataset._fs_open_args_load == {} + assert metrics_dataset._fs_open_args_save == {"mode": "w"} + + def test_load_fail(self, metrics_dataset, dummy_data): + metrics_dataset.save(dummy_data) + pattern = r"Loading not supported for 'MetricsDataSet'" + with pytest.raises(DatasetError, match=pattern): + metrics_dataset.load() + + def test_exists(self, metrics_dataset, dummy_data): + """Test `exists` method invocation for both existing and + nonexistent data set.""" + assert not metrics_dataset.exists() + metrics_dataset.save(dummy_data) + assert metrics_dataset.exists() + + @pytest.mark.parametrize( + "save_args", [{"k1": "v1", "index": "value"}], indirect=True + ) + def test_save_extra_params(self, metrics_dataset, save_args): + """Test overriding the default save arguments.""" + for key, value in save_args.items(): + assert metrics_dataset._save_args[key] == value + + @pytest.mark.parametrize( + "fs_args", + [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], + indirect=True, + ) + def test_open_extra_args(self, metrics_dataset, fs_args): + assert metrics_dataset._fs_open_args_load == fs_args["open_args_load"] + assert metrics_dataset._fs_open_args_save == {"mode": "w"} # default unchanged + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/file.json", S3FileSystem), + ("file:///tmp/test.json", LocalFileSystem), + ("/tmp/test.json", 
LocalFileSystem), + ("gcs://bucket/file.json", GCSFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + data_set = MetricsDataSet(filepath=filepath) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.json" + data_set = MetricsDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + def test_fail_on_saving_non_numeric_value(self, metrics_dataset): + data = {"col1": 1, "col2": 2, "col3": "hello"} + + pattern = "The MetricsDataSet expects only numeric values." + with pytest.raises(DatasetError, match=pattern): + metrics_dataset.save(data) + + def test_not_version_str_repr(self): + """Test that version is not in string representation of the class instance.""" + filepath = "test.json" + ds = MetricsDataSet(filepath=filepath) + + assert filepath in str(ds) + assert "version" not in str(ds) + assert "MetricsDataSet" in str(ds) + assert "protocol" in str(ds) + # Default save_args + assert "save_args={'indent': 2}" in str(ds) + + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance.""" + filepath = "test.json" + ds_versioned = MetricsDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "MetricsDataSet" in str(ds_versioned) + assert "protocol" in str(ds_versioned) + # Default save_args + assert "save_args={'indent': 2}" in str(ds_versioned) + + def test_prevent_overwrite(self, explicit_versioned_metrics_dataset, dummy_data): + """Check the error when attempting to override the data set if the + corresponding json file for a given save version already exists.""" + explicit_versioned_metrics_dataset.save(dummy_data) + pattern = ( + r"Save path \'.+\' for MetricsDataSet\(.+\) must " + r"not exist if versioning is enabled\." + ) + with pytest.raises(DatasetError, match=pattern): + explicit_versioned_metrics_dataset.save(dummy_data) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, explicit_versioned_metrics_dataset, load_version, save_version, dummy_data + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + f"Save version '{save_version}' did not match " + f"load version '{load_version}' for " + r"MetricsDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + explicit_versioned_metrics_dataset.save(dummy_data) + + def test_http_filesystem_no_versioning(self): + pattern = "Versioning is not supported for HTTP protocols." 
+ + with pytest.raises(DatasetError, match=pattern): + MetricsDataSet( + filepath="https://example.com/file.json", version=Version(None, None) + ) diff --git a/tests/extras/datasets/video/conftest.py b/tests/extras/datasets/video/conftest.py new file mode 100644 index 0000000000..ff084cdb5e --- /dev/null +++ b/tests/extras/datasets/video/conftest.py @@ -0,0 +1,107 @@ +from pathlib import Path + +import pytest +from PIL import Image +from utils import TEST_FPS, TEST_HEIGHT, TEST_WIDTH + +from kedro.extras.datasets.video.video_dataset import ( + FileVideo, + GeneratorVideo, + SequenceVideo, +) + + +@pytest.fixture(scope="module") +def red_frame(): + return Image.new("RGB", (TEST_WIDTH, TEST_HEIGHT), (255, 0, 0)) + + +@pytest.fixture(scope="module") +def green_frame(): + return Image.new("RGB", (TEST_WIDTH, TEST_HEIGHT), (0, 255, 0)) + + +@pytest.fixture(scope="module") +def blue_frame(): + return Image.new("RGB", (TEST_WIDTH, TEST_HEIGHT), (0, 0, 255)) + + +@pytest.fixture(scope="module") +def yellow_frame(): + return Image.new("RGB", (TEST_WIDTH, TEST_HEIGHT), (255, 255, 0)) + + +@pytest.fixture(scope="module") +def purple_frame(): + return Image.new("RGB", (TEST_WIDTH, TEST_HEIGHT), (255, 0, 255)) + + +@pytest.fixture +def color_video(red_frame, green_frame, blue_frame, yellow_frame, purple_frame): + return SequenceVideo( + [red_frame, green_frame, blue_frame, yellow_frame, purple_frame], + fps=TEST_FPS, + ) + + +@pytest.fixture +def color_video_generator( + red_frame, green_frame, blue_frame, yellow_frame, purple_frame +): + sequence = [red_frame, green_frame, blue_frame, yellow_frame, purple_frame] + + def generator(): + yield from sequence + + return GeneratorVideo( + generator(), + length=len(sequence), + fps=TEST_FPS, + ) + + +@pytest.fixture +def filepath_mp4(): + """This is a real video converted to mp4/h264 with the ffmpeg command""" + return str(Path(__file__).parent / "data/video.mp4") + + +@pytest.fixture +def filepath_mkv(): + """This is a real video recorded with an Axis network camera""" + return str(Path(__file__).parent / "data/video.mkv") + + +@pytest.fixture +def filepath_mjpeg(): + """This is a real video recorded with an Axis network camera""" + return str(Path(__file__).parent / "data/video.mjpeg") + + +@pytest.fixture +def filepath_color_mp4(): + """This is a video created with the OpenCV VideoWriter + + it contains 5 frames, each a single color: red, green, blue, yellow, purple + """ + return str(Path(__file__).parent / "data/color_video.mp4") + + +@pytest.fixture +def mp4_object(filepath_mp4): + return FileVideo(filepath_mp4) + + +@pytest.fixture +def mkv_object(filepath_mkv): + return FileVideo(filepath_mkv) + + +@pytest.fixture +def mjpeg_object(filepath_mjpeg): + return FileVideo(filepath_mjpeg) + + +@pytest.fixture +def color_video_object(filepath_color_mp4): + return FileVideo(filepath_color_mp4) diff --git a/tests/extras/datasets/video/data/color_video.mp4 b/tests/extras/datasets/video/data/color_video.mp4 new file mode 100644 index 0000000000..01944b1b78 Binary files /dev/null and b/tests/extras/datasets/video/data/color_video.mp4 differ diff --git a/tests/extras/datasets/video/data/video.mjpeg b/tests/extras/datasets/video/data/video.mjpeg new file mode 100644 index 0000000000..cab90dda94 Binary files /dev/null and b/tests/extras/datasets/video/data/video.mjpeg differ diff --git a/tests/extras/datasets/video/data/video.mkv b/tests/extras/datasets/video/data/video.mkv new file mode 100644 index 0000000000..2710c022ff Binary files /dev/null and
b/tests/extras/datasets/video/data/video.mkv differ diff --git a/tests/extras/datasets/video/data/video.mp4 b/tests/extras/datasets/video/data/video.mp4 new file mode 100644 index 0000000000..4c4b974d92 Binary files /dev/null and b/tests/extras/datasets/video/data/video.mp4 differ diff --git a/tests/extras/datasets/video/test_sliced_video.py b/tests/extras/datasets/video/test_sliced_video.py new file mode 100644 index 0000000000..e2e4975d1a --- /dev/null +++ b/tests/extras/datasets/video/test_sliced_video.py @@ -0,0 +1,56 @@ +import numpy as np +from utils import TEST_HEIGHT, TEST_WIDTH + + +class TestSlicedVideo: + def test_slice_sequence_video_first(self, color_video): + """Test slicing and then indexing a SequenceVideo""" + slice_red_green = color_video[:2] + red = np.array(slice_red_green[0]) + assert red.shape == (TEST_HEIGHT, TEST_WIDTH, 3) + assert np.all(red[:, :, 0] == 255) + assert np.all(red[:, :, 1] == 0) + assert np.all(red[:, :, 2] == 0) + + def test_slice_sequence_video_last_as_index(self, color_video): + """Test slicing and then indexing a SequenceVideo""" + slice_blue_yellow_purple = color_video[2:5] + purple = np.array(slice_blue_yellow_purple[2]) + assert purple.shape == (TEST_HEIGHT, TEST_WIDTH, 3) + assert np.all(purple[:, :, 0] == 255) + assert np.all(purple[:, :, 1] == 0) + assert np.all(purple[:, :, 2] == 255) + + def test_slice_sequence_video_last_as_end(self, color_video): + """Test slicing and then indexing a SequenceVideo""" + slice_blue_yellow_purple = color_video[2:] + purple = np.array(slice_blue_yellow_purple[-1]) + assert purple.shape == (TEST_HEIGHT, TEST_WIDTH, 3) + assert np.all(purple[:, :, 0] == 255) + assert np.all(purple[:, :, 1] == 0) + assert np.all(purple[:, :, 2] == 255) + + def test_slice_sequence_attribute(self, color_video): + """Test that attributes from the base class are reachable from sliced views""" + slice_red_green = color_video[:2] + assert slice_red_green.fps == color_video.fps + + def test_slice_sliced_video(self, color_video): + """Test slicing and then indexing a SlicedVideo""" + slice_green_blue_yellow = color_video[1:4] + slice_green_blue = slice_green_blue_yellow[:-1] + blue = np.array(slice_green_blue[1]) + assert blue.shape == (TEST_HEIGHT, TEST_WIDTH, 3) + assert np.all(blue[:, :, 0] == 0) + assert np.all(blue[:, :, 1] == 0) + assert np.all(blue[:, :, 2] == 255) + + def test_slice_file_video_first(self, mp4_object): + """Test slicing and then indexing a FileVideo""" + sliced_video = mp4_object[:2] + assert np.all(np.array(sliced_video[0]) == np.array(mp4_object[0])) + + def test_slice_file_video_last(self, mp4_object): + """Test slicing and then indexing a FileVideo""" + sliced_video = mp4_object[-2:] + assert np.all(np.array(sliced_video[-1]) == np.array(mp4_object[-1])) diff --git a/tests/extras/datasets/video/test_video_dataset.py b/tests/extras/datasets/video/test_video_dataset.py new file mode 100644 index 0000000000..ceeb13929b --- /dev/null +++ b/tests/extras/datasets/video/test_video_dataset.py @@ -0,0 +1,186 @@ +import boto3 +import pytest +from moto import mock_s3 +from utils import TEST_FPS, assert_videos_equal + +from kedro.extras.datasets.video import VideoDataSet +from kedro.extras.datasets.video.video_dataset import FileVideo, SequenceVideo +from kedro.io import DatasetError + +S3_BUCKET_NAME = "test_bucket" +S3_KEY_PATH = "video" +S3_FULL_PATH = f"s3://{S3_BUCKET_NAME}/{S3_KEY_PATH}/" +AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} + + +@pytest.fixture +def 
tmp_filepath_mp4(tmp_path): + return (tmp_path / "test.mp4").as_posix() + + +@pytest.fixture +def tmp_filepath_avi(tmp_path): + return (tmp_path / "test.mjpeg").as_posix() + + +@pytest.fixture +def empty_dataset_mp4(tmp_filepath_mp4): + return VideoDataSet(filepath=tmp_filepath_mp4) + + +@pytest.fixture +def empty_dataset_avi(tmp_filepath_avi): + return VideoDataSet(filepath=tmp_filepath_avi) + + +@pytest.fixture +def mocked_s3_bucket(): + """Create a bucket for testing using moto.""" + with mock_s3(): + conn = boto3.client( + "s3", + region_name="us-east-1", + aws_access_key_id=AWS_CREDENTIALS["key"], + aws_secret_access_key=AWS_CREDENTIALS["secret"], + ) + conn.create_bucket(Bucket=S3_BUCKET_NAME) + yield conn + + +class TestVideoDataSet: + def test_load_mp4(self, filepath_mp4, mp4_object): + """Loading an mp4 dataset should create a FileVideo""" + ds = VideoDataSet(filepath_mp4) + loaded_video = ds.load() + assert_videos_equal(loaded_video, mp4_object) + + def test_save_and_load_mp4(self, empty_dataset_mp4, mp4_object): + """Test saving and reloading the data set.""" + empty_dataset_mp4.save(mp4_object) + reloaded_video = empty_dataset_mp4.load() + assert_videos_equal(mp4_object, reloaded_video) + assert reloaded_video.fourcc == empty_dataset_mp4._fourcc + + @pytest.mark.skip( + reason="Only one codec is typically installed when testing" + ) + def test_save_with_other_codec(self, tmp_filepath_mp4, mp4_object): + """Test saving the video with another codec than the default.""" + save_fourcc = "xvid" + ds = VideoDataSet(filepath=tmp_filepath_mp4, fourcc=save_fourcc) + ds.save(mp4_object) + reloaded_video = ds.load() + assert reloaded_video.fourcc == save_fourcc + + def test_save_with_derived_codec(self, tmp_filepath_mp4, color_video): + """Test saving video by the codec specified in the video object""" + ds = VideoDataSet(filepath=tmp_filepath_mp4, fourcc=None) + ds.save(color_video) + reloaded_video = ds.load() + assert reloaded_video.fourcc == color_video.fourcc + + def test_saved_fps(self, empty_dataset_mp4, color_video): + """Verify that a saved video has the same framerate as specified in the video object""" + empty_dataset_mp4.save(color_video) + reloaded_video = empty_dataset_mp4.load() + assert reloaded_video.fps == TEST_FPS + + def test_save_sequence_video(self, color_video, empty_dataset_mp4): + """Test save (and load) a SequenceVideo object""" + empty_dataset_mp4.save(color_video) + reloaded_video = empty_dataset_mp4.load() + assert_videos_equal(color_video, reloaded_video) + + def test_save_generator_video( + self, color_video_generator, empty_dataset_mp4, color_video + ): + """Test save (and load) a GeneratorVideo object + + Since the GeneratorVideo is exhausted after saving the video to file, we use + the SequenceVideo (color_video) which has the same frames to compare the + loaded video to.
+ """ + empty_dataset_mp4.save(color_video_generator) + reloaded_video = empty_dataset_mp4.load() + assert_videos_equal(color_video, reloaded_video) + + def test_exists(self, empty_dataset_mp4, mp4_object): + """Test `exists` method invocation for both existing and + nonexistent data set.""" + assert not empty_dataset_mp4.exists() + empty_dataset_mp4.save(mp4_object) + assert empty_dataset_mp4.exists() + + @pytest.mark.skip(reason="Can't deal with videos with missing time info") + def test_convert_video(self, empty_dataset_mp4, mjpeg_object): + """Load a file video in mjpeg format and save in mp4v""" + empty_dataset_mp4.save(mjpeg_object) + reloaded_video = empty_dataset_mp4.load() + assert_videos_equal(mjpeg_object, reloaded_video) + + def test_load_missing_file(self, empty_dataset_mp4): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set VideoDataSet\(.*\)" + with pytest.raises(DatasetError, match=pattern): + empty_dataset_mp4.load() + + def test_save_s3(self, mp4_object, mocked_s3_bucket, tmp_path): + """Test to save a VideoDataSet to S3 storage""" + video_name = "video.mp4" + + dataset = VideoDataSet( + filepath=S3_FULL_PATH + video_name, credentials=AWS_CREDENTIALS + ) + dataset.save(mp4_object) + + tmp_file = tmp_path / video_name + mocked_s3_bucket.download_file( + Bucket=S3_BUCKET_NAME, + Key=S3_KEY_PATH + "/" + video_name, + Filename=str(tmp_file), + ) + reloaded_video = FileVideo(str(tmp_file)) + assert_videos_equal(reloaded_video, mp4_object) + + @pytest.mark.xfail + @pytest.mark.parametrize( + "fourcc, suffix", + [ + ("mp4v", "mp4"), + ("mp4v", "mjpeg"), + ("mp4v", "avi"), + ("avc1", "mp4"), + ("avc1", "mjpeg"), + ("avc1", "avi"), + ("mjpg", "mp4"), + ("mjpg", "mjpeg"), + ("mjpg", "avi"), + ("xvid", "mp4"), + ("xvid", "mjpeg"), + ("xvid", "avi"), + ("x264", "mp4"), + ("x264", "mjpeg"), + ("x264", "avi"), + ("divx", "mp4"), + ("divx", "mjpeg"), + ("divx", "avi"), + ("fmp4", "mp4"), + ("fmp4", "mjpeg"), + ("fmp4", "avi"), + ], + ) + def test_video_codecs(self, fourcc, suffix, color_video): + """Test different codec and container combinations + + Some of these are expected to fail depending on what + codecs are installed on the machine. 
+ """ + video_name = f"video.{suffix}" + video = SequenceVideo(color_video._frames, 25, fourcc) + ds = VideoDataSet(video_name, fourcc=None) + ds.save(video) + # We also need to verify that the correct codec was used + # since OpenCV silently (with a warning in the log) fall backs to + # another codec if one specified is not compatible with the container + reloaded_video = ds.load() + assert reloaded_video.fourcc == fourcc diff --git a/tests/extras/datasets/video/test_video_objects.py b/tests/extras/datasets/video/test_video_objects.py new file mode 100644 index 0000000000..66a284fa60 --- /dev/null +++ b/tests/extras/datasets/video/test_video_objects.py @@ -0,0 +1,170 @@ +import numpy as np +import pytest +from utils import ( + DEFAULT_FOURCC, + MJPEG_FOURCC, + MJPEG_FPS, + MJPEG_LEN, + MJPEG_SIZE, + MKV_FOURCC, + MKV_FPS, + MKV_LEN, + MKV_SIZE, + MP4_FOURCC, + MP4_FPS, + MP4_LEN, + MP4_SIZE, + TEST_FPS, + TEST_HEIGHT, + TEST_NUM_COLOR_FRAMES, + TEST_WIDTH, + assert_images_equal, +) + +from kedro.extras.datasets.video.video_dataset import ( + FileVideo, + GeneratorVideo, + SequenceVideo, +) + + +class TestSequenceVideo: + def test_sequence_video_indexing_first(self, color_video, red_frame): + """Test indexing a SequenceVideo""" + red = np.array(color_video[0]) + assert red.shape == (TEST_HEIGHT, TEST_WIDTH, 3) + assert np.all(red == red_frame) + + def test_sequence_video_indexing_last(self, color_video, purple_frame): + """Test indexing a SequenceVideo""" + purple = np.array(color_video[-1]) + assert purple.shape == (TEST_HEIGHT, TEST_WIDTH, 3) + assert np.all(purple == purple_frame) + + def test_sequence_video_iterable(self, color_video): + """Test iterating a SequenceVideo""" + for i, img in enumerate(map(np.array, color_video)): + assert np.all(img == np.array(color_video[i])) + assert i == TEST_NUM_COLOR_FRAMES - 1 + + def test_sequence_video_fps(self, color_video): + # Test the one set by the fixture + assert color_video.fps == TEST_FPS + + # Test creating with another fps + test_fps_new = 123 + color_video_new = SequenceVideo(color_video._frames, fps=test_fps_new) + assert color_video_new.fps == test_fps_new + + def test_sequence_video_len(self, color_video): + assert len(color_video) == TEST_NUM_COLOR_FRAMES + + def test_sequence_video_size(self, color_video): + assert color_video.size == (TEST_WIDTH, TEST_HEIGHT) + + def test_sequence_video_fourcc_default_value(self, color_video): + assert color_video.fourcc == DEFAULT_FOURCC + + def test_sequence_video_fourcc(self, color_video): + fourcc_new = "mjpg" + assert ( + DEFAULT_FOURCC != fourcc_new + ), "Test does not work if new test value is same as default" + color_video_new = SequenceVideo( + color_video._frames, fps=TEST_FPS, fourcc=fourcc_new + ) + assert color_video_new.fourcc == fourcc_new + + +class TestGeneratorVideo: + def test_generator_video_iterable(self, color_video_generator, color_video): + """Test iterating a GeneratorVideo + + The content of the mock GeneratorVideo should be the same as the SequenceVideo, + the content in the later is tested in other unit tests and can thus be trusted + """ + for i, img in enumerate(map(np.array, color_video_generator)): + assert np.all(img == np.array(color_video[i])) + assert i == TEST_NUM_COLOR_FRAMES - 1 + + def test_generator_video_fps(self, color_video_generator): + # Test the one set by the fixture + assert color_video_generator.fps == TEST_FPS + + # Test creating with another fps + test_fps_new = 123 + color_video_new = GeneratorVideo( + color_video_generator._gen, 
length=TEST_NUM_COLOR_FRAMES, fps=test_fps_new + ) + assert color_video_new.fps == test_fps_new + + def test_generator_video_len(self, color_video_generator): + assert len(color_video_generator) == TEST_NUM_COLOR_FRAMES + + def test_generator_video_size(self, color_video_generator): + assert color_video_generator.size == (TEST_WIDTH, TEST_HEIGHT) + + def test_generator_video_fourcc_default_value(self, color_video_generator): + assert color_video_generator.fourcc == DEFAULT_FOURCC + + def test_generator_video_fourcc(self, color_video_generator): + fourcc_new = "mjpg" + assert ( + DEFAULT_FOURCC != fourcc_new + ), "Test does not work if new test value is same as default" + color_video_new = GeneratorVideo( + color_video_generator._gen, + length=TEST_NUM_COLOR_FRAMES, + fps=TEST_FPS, + fourcc=fourcc_new, + ) + assert color_video_new.fourcc == fourcc_new + + +class TestFileVideo: + @pytest.mark.skip(reason="Can't deal with videos with missing time info") + def test_file_props_mjpeg(self, mjpeg_object): + assert mjpeg_object.fourcc == MJPEG_FOURCC + assert mjpeg_object.fps == MJPEG_FPS + assert mjpeg_object.size == MJPEG_SIZE + assert len(mjpeg_object) == MJPEG_LEN + + def test_file_props_mkv(self, mkv_object): + assert mkv_object.fourcc == MKV_FOURCC + assert mkv_object.fps == MKV_FPS + assert mkv_object.size == MKV_SIZE + assert len(mkv_object) == MKV_LEN + + def test_file_props_mp4(self, mp4_object): + assert mp4_object.fourcc == MP4_FOURCC + assert mp4_object.fps == MP4_FPS + assert mp4_object.size == MP4_SIZE + assert len(mp4_object) == MP4_LEN + + def test_file_index_first(self, color_video_object, red_frame): + assert_images_equal(color_video_object[0], red_frame) + + def test_file_index_last_by_index(self, color_video_object, purple_frame): + assert_images_equal(color_video_object[TEST_NUM_COLOR_FRAMES - 1], purple_frame) + + def test_file_index_last(self, color_video_object, purple_frame): + assert_images_equal(color_video_object[-1], purple_frame) + + def test_file_video_failed_capture(self, mocker): + """Validate good behavior on failed decode + + The best behavior in this case is not obvious: the len property of the + video object specifies more frames than it is actually possible to decode.
We + cannot know this in advance without spending loads of time decoding all frames + in order to count them.""" + mock_cv2 = mocker.patch("kedro.extras.datasets.video.video_dataset.cv2") + mock_cap = mock_cv2.VideoCapture.return_value = mocker.Mock() + mock_cap.get.return_value = 2 # Set the length of the video + ds = FileVideo("/a/b/c") + + mock_cap.read.return_value = True, np.zeros((1, 1)) + assert ds[0] + + mock_cap.read.return_value = False, None + with pytest.raises(IndexError): + ds[1] diff --git a/tests/extras/datasets/video/utils.py b/tests/extras/datasets/video/utils.py new file mode 100644 index 0000000000..6b675aed2f --- /dev/null +++ b/tests/extras/datasets/video/utils.py @@ -0,0 +1,49 @@ +import itertools + +import numpy as np +from PIL import ImageChops + +TEST_WIDTH = 640 # Arbitrary value for testing +TEST_HEIGHT = 480 # Arbitrary value for testing +TEST_FPS = 1 # Arbitrary value for testing + +TEST_NUM_COLOR_FRAMES = ( + 5 # This should be the same as the number of frames in the conftest videos +) +DEFAULT_FOURCC = "mp4v" # The expected default fourcc value + +# This is video data extracted from the video files with the ffmpeg command +MKV_SIZE = (640, 360) +MKV_FPS = 50 +MKV_FOURCC = "h264" +MKV_LEN = 109 # from ffprobe + +MP4_SIZE = (640, 360) +MP4_FPS = 50 +MP4_FOURCC = "avc1" +MP4_LEN = 109 # from ffprobe + +MJPEG_SIZE = (640, 360) +MJPEG_FPS = 25 # From ffprobe, not reported by the ffmpeg command +# I'm not sure that MJPE is the correct fourcc code for +# mjpeg video since I cannot find any official reference to +# that code. This is however what the OpenCV VideoCapture +# reports for the video, so we leave it like this for now. +MJPEG_FOURCC = "mjpe" +MJPEG_LEN = 24 # from ffprobe + + +def assert_images_equal(image_1, image_2): + """Assert that two images are approximately equal, allowing for some + compression artifacts""" + assert image_1.size == image_2.size + diff = np.asarray(ImageChops.difference(image_1, image_2)) + assert np.mean(diff) < 5 + assert np.mean(diff > 50) < 0.01 # Max 1% of pixels + + +def assert_videos_equal(video_1, video_2): + assert len(video_1) == len(video_2) + + for image_1, image_2 in itertools.zip_longest(video_1, video_2): + assert_images_equal(image_1, image_2) diff --git a/tests/extras/datasets/yaml/test_yaml_dataset.py b/tests/extras/datasets/yaml/test_yaml_dataset.py index 6fbb1a9a15..432afaed3d 100644 --- a/tests/extras/datasets/yaml/test_yaml_dataset.py +++ b/tests/extras/datasets/yaml/test_yaml_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks.
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - from pathlib import Path, PurePosixPath import pandas as pd @@ -37,7 +9,7 @@ from s3fs.core import S3FileSystem from kedro.extras.datasets.yaml import YAMLDataSet -from kedro.io import DataSetError +from kedro.io import DatasetError from kedro.io.core import PROTOCOL_DELIMITER, Version @@ -99,7 +71,7 @@ def test_open_extra_args(self, yaml_data_set, fs_args): def test_load_missing_file(self, yaml_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set YAMLDataSet\(.*\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): yaml_data_set.load() @pytest.mark.parametrize( @@ -171,7 +143,7 @@ def test_save_and_load(self, versioned_yaml_data_set, dummy_data): def test_no_versions(self, versioned_yaml_data_set): """Check the error if no versions are available for load.""" pattern = r"Did not find any versions for YAMLDataSet\(.+\)" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_yaml_data_set.load() def test_exists(self, versioned_yaml_data_set, dummy_data): @@ -185,10 +157,10 @@ def test_prevent_overwrite(self, versioned_yaml_data_set, dummy_data): corresponding yaml file for a given save version already exists.""" versioned_yaml_data_set.save(dummy_data) pattern = ( - r"Save path \`.+\` for YAMLDataSet\(.+\) must " + r"Save path \'.+\' for YAMLDataSet\(.+\) must " r"not exist if versioning is enabled\." ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_yaml_data_set.save(dummy_data) @pytest.mark.parametrize( @@ -203,16 +175,16 @@ def test_save_version_warning( """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( - r"Save version `{0}` did not match load version `{1}` " - r"for YAMLDataSet\(.+\)".format(save_version, load_version) + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for YAMLDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_yaml_data_set.save(dummy_data) def test_http_filesystem_no_versioning(self): - pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + pattern = "Versioning is not supported for HTTP protocols." 
- with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): YAMLDataSet( filepath="https://example.com/file.yaml", version=Version(None, None) ) @@ -229,7 +201,7 @@ def test_versioning_existing_dataset( f"(?=.*file with the same name already exists in the directory)" f"(?=.*{versioned_yaml_data_set._filepath.parent.as_posix()})" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): versioned_yaml_data_set.save(dummy_data) # Remove non-versioned dataset and try again diff --git a/tests/extras/decorators/test_memory_profiler.py b/tests/extras/decorators/test_memory_profiler.py deleted file mode 100644 index 04a30fc735..0000000000 --- a/tests/extras/decorators/test_memory_profiler.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import importlib -import logging -from time import sleep - -import pytest - -from kedro.extras.decorators import memory_profiler - - -def sleeping_identity(inp): - sleep(0.1) - return inp - - -def test_mem_profile(caplog): - func = memory_profiler.mem_profile(sleeping_identity) - res = func(1) - - logger_name, severity, message = caplog.record_tuples[0] - assert res == 1 - assert logger_name == "kedro.extras.decorators.memory_profiler" - assert severity == logging.INFO - expected = "Running '{}.{}' consumed".format( - sleeping_identity.__module__, sleeping_identity.__qualname__ - ) - assert expected in message - - -def test_mem_profile_old_versions(caplog, mocker): - mocker.patch( - "kedro.extras.decorators.memory_profiler.memory_usage", - return_value=[[float(0)], 1], - ) - func = memory_profiler.mem_profile(sleeping_identity) - res = func(1) - - logger_name, severity, message = caplog.record_tuples[0] - assert res == 1 - assert logger_name == "kedro.extras.decorators.memory_profiler" - assert severity == logging.INFO - expected = "Running '{}.{}' consumed".format( - sleeping_identity.__module__, sleeping_identity.__qualname__ - ) - assert expected in message - - -def test_import_error(mocker): - mocker.patch.dict("sys.modules", {"memory_profiler": None}) - pattern = ( - r".*`pip install kedro\[profilers\]` to get the required " - "memory profiler dependencies" - ) - with pytest.raises(ImportError, match=pattern): - importlib.reload(memory_profiler) diff --git a/tests/extras/decorators/test_retry_node.py b/tests/extras/decorators/test_retry_node.py deleted file mode 100644 index ae12748085..0000000000 --- a/tests/extras/decorators/test_retry_node.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
-import pytest - -from kedro.extras.decorators.retry_node import retry -from kedro.pipeline import node - - -def test_retry(): - def _bigger(obj): - obj["value"] += 1 - if obj["value"] >= 0: - return True - raise ValueError("Value less than 0") - - decorated = node(_bigger, "in", "out").decorate(retry()) - - with pytest.raises(ValueError, match=r"Value less than 0"): - decorated.run({"in": {"value": -3}}) - - decorated2 = node(_bigger, "in", "out").decorate(retry(n_times=2)) - assert decorated2.run({"in": {"value": -3}}) diff --git a/tests/extras/extensions/test_ipython.py b/tests/extras/extensions/test_ipython.py deleted file mode 100644 index 6f969fde19..0000000000 --- a/tests/extras/extensions/test_ipython.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=import-outside-toplevel,reimported -import pytest - -from kedro.extras.extensions.ipython import ( - init_kedro, - load_ipython_extension, - reload_kedro, -) -from kedro.framework.session.session import _deactivate_session -from kedro.framework.startup import ProjectMetadata - - -@pytest.fixture(autouse=True) -def project_path(mocker, tmp_path): - path = tmp_path - mocker.patch("kedro.extras.extensions.ipython.project_path", path) - - -@pytest.fixture(autouse=True) -def mocked_logging(mocker): - # Disable logging.config.dictConfig in KedroSession._setup_logging as - # it changes logging.config and affects other unit tests - return mocker.patch("logging.config.dictConfig") - - -@pytest.fixture(autouse=True) -def cleanup_session(): - yield - _deactivate_session() - - -class TestInitKedro: - def test_init_kedro(self, tmp_path, caplog): - from kedro.extras.extensions.ipython import project_path - - assert project_path == tmp_path - - kedro_path = tmp_path / "here" - init_kedro(str(kedro_path)) - expected_path = kedro_path.expanduser().resolve() - expected_message = f"Updated path to Kedro project: {expected_path}" - - log_messages = [record.getMessage() for record in caplog.records] - assert expected_message in log_messages - from kedro.extras.extensions.ipython import project_path - - # make sure global variable updated - assert project_path == expected_path - - def test_init_kedro_no_path(self, tmp_path, caplog): - from kedro.extras.extensions.ipython import project_path - - assert project_path == tmp_path - - init_kedro() - expected_message = f"No path argument was provided. Using: {tmp_path}" - - log_messages = [record.getMessage() for record in caplog.records] - assert expected_message in log_messages - from kedro.extras.extensions.ipython import project_path - - # make sure global variable stayed the same - assert project_path == tmp_path - - -class TestLoadKedroObjects: - def test_load_kedro_objects(self, tmp_path, mocker): - fake_metadata = ProjectMetadata( - source_dir=tmp_path / "src", # default - config_file=tmp_path / "pyproject.toml", - package_name="fake_package_name", - project_name="fake_project_name", - project_version="0.1", - project_path=tmp_path, - ) - mocker.patch("kedro.framework.session.session.configure_project") - mocker.patch( - "kedro.framework.startup.bootstrap_project", - return_value=fake_metadata, - ) - mock_line_magic = mocker.MagicMock() - mock_line_magic.__name__ = "abc" - mocker.patch( - "kedro.framework.cli.load_entry_points", return_value=[mock_line_magic] - ) - mock_register_line_magic = mocker.patch( - "kedro.extras.extensions.ipython.register_line_magic" - ) - mock_context = mocker.patch("kedro.framework.session.KedroSession.load_context") - mock_ipython = mocker.patch("kedro.extras.extensions.ipython.get_ipython") - - reload_kedro(tmp_path) - - mock_ipython().push.assert_called_once_with( - variables={ - "context": mock_context(), - "catalog": mock_context().catalog, - "session": mocker.ANY, - } - ) - assert mock_register_line_magic.call_count == 1 - - def test_load_kedro_objects_extra_args(self, tmp_path, mocker): - fake_metadata = ProjectMetadata( - source_dir=tmp_path / "src", # default - config_file=tmp_path / "pyproject.toml", - package_name="fake_package_name", - project_name="fake_project_name", - project_version="0.1", - project_path=tmp_path, - ) - mocker.patch("kedro.framework.session.session.configure_project") - mocker.patch( - "kedro.framework.startup.bootstrap_project", - return_value=fake_metadata, - ) - 
mock_line_magic = mocker.MagicMock() - mock_line_magic.__name__ = "abc" - mocker.patch( - "kedro.framework.cli.load_entry_points", return_value=[mock_line_magic] - ) - mock_register_line_magic = mocker.patch( - "kedro.extras.extensions.ipython.register_line_magic" - ) - mock_session_create = mocker.patch( - "kedro.framework.session.KedroSession.create" - ) - mock_ipython = mocker.patch("kedro.extras.extensions.ipython.get_ipython") - - reload_kedro(tmp_path, env="env1", extra_params={"key": "val"}) - - mock_session_create.assert_called_once_with( - "fake_package_name", tmp_path, env="env1", extra_params={"key": "val"} - ) - mock_ipython().push.assert_called_once_with( - variables={ - "context": mock_session_create().load_context(), - "catalog": mock_session_create().load_context().catalog, - "session": mock_session_create(), - } - ) - assert mock_register_line_magic.call_count == 1 - - def test_load_kedro_objects_not_in_kedro_project(self, tmp_path, mocker): - mocker.patch( - "kedro.framework.startup._get_project_metadata", - side_effect=RuntimeError, - ) - mock_ipython = mocker.patch("kedro.extras.extensions.ipython.get_ipython") - - with pytest.raises(RuntimeError): - reload_kedro(tmp_path) - assert not mock_ipython().called - assert not mock_ipython().push.called - - -class TestLoadIPythonExtension: - @pytest.mark.parametrize( - "error,expected_log_message,level", - [ - ( - ImportError, - "Kedro appears not to be installed in your current environment.", - "ERROR", - ), - ( - RuntimeError, - "Kedro extension was registered. Make sure you pass the project path to " - "`%reload_kedro` or set it using `%init_kedro`.", - "WARNING", - ), - ], - ) - def test_load_extension_not_in_kedro_env_or_project( - self, error, expected_log_message, level, mocker, caplog - ): - mocker.patch( - "kedro.framework.startup._get_project_metadata", - side_effect=error, - ) - mock_ipython = mocker.patch("kedro.extras.extensions.ipython.get_ipython") - - load_ipython_extension(mocker.MagicMock()) - - assert not mock_ipython().called - assert not mock_ipython().push.called - - log_messages = [ - record.getMessage() - for record in caplog.records - if record.levelname == level - ] - assert log_messages == [expected_log_message] diff --git a/tests/extras/logging/test_color_logger.py b/tests/extras/logging/test_color_logger.py deleted file mode 100644 index 1c042f16ad..0000000000 --- a/tests/extras/logging/test_color_logger.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging - -from kedro.extras.logging import ColorHandler - - -def test_color_logger(caplog): - log = logging.getLogger(__name__) - for handler in log.handlers: - log.removeHandler(handler) # pragma: no cover - - log.addHandler(ColorHandler()) - log.info("Test") - - for record in caplog.records: - assert record.levelname == "INFO" - assert "Test" in record.msg diff --git a/tests/extras/transformers/conftest.py b/tests/extras/transformers/conftest.py deleted file mode 100644 index a513de26e8..0000000000 --- a/tests/extras/transformers/conftest.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This file contains the fixtures that are reusable by any tests within -this directory. You don't need to import the fixtures as pytest will -discover them automatically. 
More info here: -https://docs.pytest.org/en/latest/fixture.html -""" - -from typing import Any, Dict - -import pytest - -from kedro.io import AbstractDataSet, DataCatalog - - -class FakeDataSet(AbstractDataSet): - def __init__(self, data): - self.log = [] - self.data = data - - def _load(self) -> Any: - self.log.append(("load", self.data)) - return self.data - - def _save(self, data: Any) -> None: - self.log.append(("save", data)) - self.data = data - - def _describe(self) -> Dict[str, Any]: - return {"data": self.data} - - -@pytest.fixture -def fake_data_set(): - return FakeDataSet(123) - - -@pytest.fixture -def catalog(fake_data_set): - return DataCatalog({"test": fake_data_set}) diff --git a/tests/extras/transformers/test_memory_profiler.py b/tests/extras/transformers/test_memory_profiler.py deleted file mode 100644 index 5bc139ce4f..0000000000 --- a/tests/extras/transformers/test_memory_profiler.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import importlib - -import pytest - -import kedro.extras.transformers.memory_profiler as tf - - -class TestMemoryTransformer: - def test_memory_usage(self, catalog, caplog): - expected_log = "MiB memory at peak time" - catalog.add_transformer(tf.ProfileMemoryTransformer()) - - catalog.save("test", 42) - assert "Saving test consumed" in caplog.text - assert expected_log in caplog.text - caplog.clear() - assert catalog.load("test") == 42 - assert "Loading test consumed" in caplog.text - assert expected_log in caplog.text - - def test_import_error(self, mocker): - mocker.patch.dict("sys.modules", {"memory_profiler": None}) - pattern = ( - r".*`pip install kedro\[profilers\]` to get the required " - "memory profiler dependencies" - ) - with pytest.raises(ImportError, match=pattern): - importlib.reload(tf) diff --git a/tests/extras/transformers/test_time_profiler.py b/tests/extras/transformers/test_time_profiler.py deleted file mode 100644 index 1bcd7bbff7..0000000000 --- a/tests/extras/transformers/test_time_profiler.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -from kedro.extras.transformers import ProfileTimeTransformer - - -class TestTransformers: - def test_timing(self, catalog, caplog): - catalog.add_transformer(ProfileTimeTransformer()) - - catalog.save("test", 42) - assert "Saving test took" in caplog.text - assert catalog.load("test") == 42 - assert "Loading test took" in caplog.text diff --git a/tests/framework/cli/conftest.py b/tests/framework/cli/conftest.py index 5777ec1b9a..b6bba00a23 100644 --- a/tests/framework/cli/conftest.py +++ b/tests/framework/cli/conftest.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """ This file contains the fixtures that are reusable by any tests within this directory. You don't need to import the fixtures as pytest will @@ -47,6 +19,7 @@ from kedro.framework.cli.catalog import catalog_cli from kedro.framework.cli.cli import cli from kedro.framework.cli.jupyter import jupyter_cli +from kedro.framework.cli.micropkg import micropkg_cli from kedro.framework.cli.pipeline import pipeline_cli from kedro.framework.cli.project import project_group from kedro.framework.cli.registry import registry_cli @@ -60,13 +33,13 @@ @fixture def entry_points(mocker): - return mocker.patch("pkg_resources.iter_entry_points") + return mocker.patch("importlib_metadata.entry_points", spec=True) @fixture def entry_point(mocker, entry_points): - ep = mocker.MagicMock() - entry_points.return_value = [ep] + ep = mocker.patch("importlib_metadata.EntryPoint", spec=True) + entry_points.return_value.select.return_value = [ep] return ep @@ -115,6 +88,7 @@ def fake_metadata(fake_root_dir): fake_root_dir / REPO_NAME, kedro_version, fake_root_dir / REPO_NAME / "src", + kedro_version, ) return metadata @@ -131,6 +105,7 @@ def fake_kedro_cli(): catalog_cli, jupyter_cli, pipeline_cli, + micropkg_cli, project_group, registry_cli, ], @@ -142,11 +117,14 @@ def fake_project_cli( fake_repo_path: Path, dummy_config: Path, fake_kedro_cli: click.CommandCollection ): old_settings = settings.as_dict() - starter_path = Path(__file__).parents[3].resolve() + starter_path = Path(__file__).resolve().parents[3] starter_path = starter_path / "features" / "steps" / "test_starter" CliRunner().invoke( fake_kedro_cli, ["new", "-c", str(dummy_config), "--starter", str(starter_path)] ) + # Delete the project logging.yml, which leaves behind info.log and error.log files. + # This leaves logging config as the framework default. + (fake_repo_path / "conf" / "base" / "logging.yml").unlink() # NOTE: Here we load a couple of modules, as they would be imported in # the code and tests. 
@@ -160,18 +138,23 @@ def fake_project_cli( yield fake_kedro_cli # reset side-effects of configure_project - pipelines._clear(PACKAGE_NAME) # this resets pipelines loading state + pipelines.configure() + for key, value in old_settings.items(): settings.set(key, value) sys.path = old_path - del sys.modules[PACKAGE_NAME] + + # configure_project does imports that add PACKAGE_NAME.pipelines, + # PACKAGE_NAME.settings to sys.modules. These need to be removed. + # Ideally we would reset sys.modules to exactly what it was before + # running anything, but removal of distutils.build.commands from + # sys.modules mysteriously makes some tests for `kedro micropkg package` + # fail on Windows, Python 3.7 and 3.8. + for module in list(sys.modules.keys()): + if module.startswith(PACKAGE_NAME): + del sys.modules[module] @fixture def chdir_to_dummy_project(fake_repo_path, monkeypatch): monkeypatch.chdir(str(fake_repo_path)) - - -@fixture -def patch_log(mocker): - mocker.patch("logging.config.dictConfig") diff --git a/tests/framework/cli/hooks/test_manager.py b/tests/framework/cli/hooks/test_manager.py index e66b23b378..9c60f9951f 100644 --- a/tests/framework/cli/hooks/test_manager.py +++ b/tests/framework/cli/hooks/test_manager.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
import pytest from kedro.framework.cli.hooks.manager import CLIHooksManager diff --git a/tests/extras/logging/__init__.py b/tests/framework/cli/micropkg/__init__.py similarity index 100% rename from tests/extras/logging/__init__.py rename to tests/framework/cli/micropkg/__init__.py diff --git a/tests/framework/cli/micropkg/conftest.py b/tests/framework/cli/micropkg/conftest.py new file mode 100644 index 0000000000..ff8348b755 --- /dev/null +++ b/tests/framework/cli/micropkg/conftest.py @@ -0,0 +1,79 @@ +import shutil + +import pytest + +from kedro.framework.project import settings + + +@pytest.fixture(autouse=True) +def cleanup_micropackages(fake_repo_path, fake_package_path): + packages = {p.name for p in fake_package_path.iterdir() if p.is_dir()} + + yield + + created_packages = { + p.name + for p in fake_package_path.iterdir() + if p.is_dir() and p.name != "__pycache__" + } + created_packages -= packages + + for micropackage in created_packages: + shutil.rmtree(str(fake_package_path / micropackage)) + + confs = fake_repo_path / settings.CONF_SOURCE + for each in confs.rglob(f"*{micropackage}*"): + if each.is_file(): + each.unlink() + + tests = fake_repo_path / "src" / "tests" / micropackage + if tests.is_dir(): + shutil.rmtree(str(tests)) + + +@pytest.fixture(autouse=True) +def cleanup_pipelines(fake_repo_path, fake_package_path): + pipes_path = fake_package_path / "pipelines" + old_pipelines = {p.name for p in pipes_path.iterdir() if p.is_dir()} + requirements_txt = fake_repo_path / "src" / "requirements.txt" + requirements = requirements_txt.read_text() + yield + + # remove created pipeline files after the test + created_pipelines = { + p.name for p in pipes_path.iterdir() if p.is_dir() and p.name != "__pycache__" + } + created_pipelines -= old_pipelines + + for pipeline in created_pipelines: + shutil.rmtree(str(pipes_path / pipeline)) + + confs = fake_repo_path / settings.CONF_SOURCE + for each in confs.rglob(f"*{pipeline}*"): # clean all pipeline config files + if each.is_file(): + each.unlink() + + tests = fake_repo_path / "src" / "tests" / "pipelines" / pipeline + if tests.is_dir(): + shutil.rmtree(str(tests)) + + # reset requirements.txt + requirements_txt.write_text(requirements) + + +@pytest.fixture +def cleanup_dist(fake_repo_path): + yield + dist_dir = fake_repo_path / "dist" + if dist_dir.exists(): + shutil.rmtree(str(dist_dir)) + + +@pytest.fixture +def cleanup_pyproject_toml(fake_repo_path): + pyproject_toml = fake_repo_path / "pyproject.toml" + existing_toml = pyproject_toml.read_text() + + yield + + pyproject_toml.write_text(existing_toml) diff --git a/tests/framework/cli/micropkg/test_micropkg_package.py b/tests/framework/cli/micropkg/test_micropkg_package.py new file mode 100644 index 0000000000..4c3daf7abe --- /dev/null +++ b/tests/framework/cli/micropkg/test_micropkg_package.py @@ -0,0 +1,581 @@ +import tarfile +import textwrap +from pathlib import Path + +import pytest +import toml +from click.testing import CliRunner + +from kedro.framework.cli.micropkg import _get_sdist_name + +PIPELINE_NAME = "my_pipeline" + +LETTER_ERROR = "It must contain only letters, digits, and/or underscores." +FIRST_CHAR_ERROR = "It must start with a letter or underscore." +TOO_SHORT_ERROR = "It must be at least 2 characters long." 
+ + +@pytest.mark.usefixtures("chdir_to_dummy_project", "cleanup_dist") +class TestMicropkgPackageCommand: + def assert_sdist_contents_correct( + self, sdist_location, package_name=PIPELINE_NAME, version="0.1" + ): + sdist_name = _get_sdist_name(name=package_name, version=version) + sdist_file = sdist_location / sdist_name + assert sdist_file.is_file() + assert len(list(sdist_location.iterdir())) == 1 + + with tarfile.open(sdist_file, "r") as tar: + sdist_contents = set(tar.getnames()) + + expected_files = { + f"{package_name}-{version}/{package_name}/__init__.py", + f"{package_name}-{version}/{package_name}/README.md", + f"{package_name}-{version}/{package_name}/nodes.py", + f"{package_name}-{version}/{package_name}/pipeline.py", + f"{package_name}-{version}/{package_name}/config/parameters/{package_name}.yml", + f"{package_name}-{version}/tests/__init__.py", + f"{package_name}-{version}/tests/test_pipeline.py", + } + assert expected_files <= sdist_contents + + @pytest.mark.parametrize( + "options,package_name,success_message", + [ + ([], PIPELINE_NAME, f"'dummy_package.pipelines.{PIPELINE_NAME}' packaged!"), + ( + ["--alias", "alternative"], + "alternative", + f"'dummy_package.pipelines.{PIPELINE_NAME}' packaged as 'alternative'!", + ), + ], + ) + def test_package_micropkg( + self, + fake_repo_path, + fake_project_cli, + options, + package_name, + success_message, + fake_metadata, + ): + result = CliRunner().invoke( + fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata + ) + assert result.exit_code == 0 + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", f"pipelines.{PIPELINE_NAME}"] + options, + obj=fake_metadata, + ) + + assert result.exit_code == 0 + assert success_message in result.output + + sdist_location = fake_repo_path / "dist" + assert f"Location: {sdist_location}" in result.output + + self.assert_sdist_contents_correct( + sdist_location=sdist_location, package_name=package_name, version="0.1" + ) + + def test_micropkg_package_same_name_as_package_name( + self, fake_metadata, fake_project_cli, fake_repo_path + ): + """Create modular pipeline with the same name as the + package name, then package as is. The command should run + and the resulting sdist should have all expected contents. + """ + pipeline_name = fake_metadata.package_name + result = CliRunner().invoke( + fake_project_cli, ["pipeline", "create", pipeline_name], obj=fake_metadata + ) + assert result.exit_code == 0 + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", f"pipelines.{pipeline_name}"], + obj=fake_metadata, + ) + sdist_location = fake_repo_path / "dist" + + assert result.exit_code == 0 + assert f"Location: {sdist_location}" in result.output + self.assert_sdist_contents_correct( + sdist_location=sdist_location, package_name=pipeline_name + ) + + def test_micropkg_package_same_name_as_package_name_alias( + self, fake_metadata, fake_project_cli, fake_repo_path + ): + """Create modular pipeline, then package under alias + the same name as the package name. The command should run + and the resulting sdist should have all expected contents. 
+ """ + alias = fake_metadata.package_name + result = CliRunner().invoke( + fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata + ) + assert result.exit_code == 0 + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", f"pipelines.{PIPELINE_NAME}", "--alias", alias], + obj=fake_metadata, + ) + sdist_location = fake_repo_path / "dist" + + assert result.exit_code == 0 + assert f"Location: {sdist_location}" in result.output + self.assert_sdist_contents_correct( + sdist_location=sdist_location, package_name=alias + ) + + @pytest.mark.parametrize("existing_dir", [True, False]) + def test_micropkg_package_to_destination( + self, fake_project_cli, existing_dir, tmp_path, fake_metadata + ): + destination = (tmp_path / "in" / "here").resolve() + if existing_dir: + destination.mkdir(parents=True) + + result = CliRunner().invoke( + fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata + ) + assert result.exit_code == 0 + result = CliRunner().invoke( + fake_project_cli, + [ + "micropkg", + "package", + f"pipelines.{PIPELINE_NAME}", + "--destination", + str(destination), + ], + obj=fake_metadata, + ) + + assert result.exit_code == 0 + success_message = ( + f"'dummy_package.pipelines.{PIPELINE_NAME}' packaged! " + f"Location: {destination}" + ) + assert success_message in result.output + + self.assert_sdist_contents_correct(sdist_location=destination) + + def test_micropkg_package_overwrites_sdist( + self, fake_project_cli, tmp_path, fake_metadata + ): + destination = (tmp_path / "in" / "here").resolve() + destination.mkdir(parents=True) + sdist_file = destination / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + sdist_file.touch() + + result = CliRunner().invoke( + fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata + ) + assert result.exit_code == 0 + result = CliRunner().invoke( + fake_project_cli, + [ + "micropkg", + "package", + f"pipelines.{PIPELINE_NAME}", + "--destination", + str(destination), + ], + obj=fake_metadata, + ) + assert result.exit_code == 0 + + warning_message = f"Package file {sdist_file} will be overwritten!" + success_message = ( + f"'dummy_package.pipelines.{PIPELINE_NAME}' packaged! 
" + f"Location: {destination}" + ) + assert warning_message in result.output + assert success_message in result.output + + self.assert_sdist_contents_correct(sdist_location=destination) + + @pytest.mark.parametrize( + "bad_alias,error_message", + [ + ("bad name", LETTER_ERROR), + ("bad%name", LETTER_ERROR), + ("1bad", FIRST_CHAR_ERROR), + ("a", TOO_SHORT_ERROR), + ], + ) + def test_package_micropkg_bad_alias( + self, fake_project_cli, bad_alias, error_message + ): + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", f"pipelines.{PIPELINE_NAME}", "--alias", bad_alias], + ) + assert result.exit_code + assert error_message in result.output + + def test_package_micropkg_invalid_module_path(self, fake_project_cli): + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "package", f"pipelines/{PIPELINE_NAME}"] + ) + error_message = ( + "The micro-package location you provided is not a valid Python module path" + ) + + assert result.exit_code + assert error_message in result.output + + def test_package_micropkg_no_config( + self, fake_repo_path, fake_project_cli, fake_metadata + ): + version = "0.1" + result = CliRunner().invoke( + fake_project_cli, + ["pipeline", "create", PIPELINE_NAME, "--skip-config"], + obj=fake_metadata, + ) + assert result.exit_code == 0 + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", f"pipelines.{PIPELINE_NAME}"], + obj=fake_metadata, + ) + + assert result.exit_code == 0 + assert f"'dummy_package.pipelines.{PIPELINE_NAME}' packaged!" in result.output + + sdist_location = fake_repo_path / "dist" + assert f"Location: {sdist_location}" in result.output + + # the sdist contents are slightly different (config shouldn't be included), + # which is why we can't call self.assert_sdist_contents_correct here + sdist_file = sdist_location / _get_sdist_name( + name=PIPELINE_NAME, version=version + ) + assert sdist_file.is_file() + assert len(list((fake_repo_path / "dist").iterdir())) == 1 + + with tarfile.open(sdist_file, "r") as tar: + sdist_contents = set(tar.getnames()) + + expected_files = { + f"{PIPELINE_NAME}-{version}/{PIPELINE_NAME}/__init__.py", + f"{PIPELINE_NAME}-{version}/{PIPELINE_NAME}/README.md", + f"{PIPELINE_NAME}-{version}/{PIPELINE_NAME}/nodes.py", + f"{PIPELINE_NAME}-{version}/{PIPELINE_NAME}/pipeline.py", + f"{PIPELINE_NAME}-{version}/tests/__init__.py", + f"{PIPELINE_NAME}-{version}/tests/test_pipeline.py", + } + assert expected_files <= sdist_contents + assert f"{PIPELINE_NAME}/config/parameters.yml" not in sdist_contents + + def test_package_non_existing_micropkg_dir( + self, fake_package_path, fake_project_cli, fake_metadata + ): + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", "pipelines.non_existing"], + obj=fake_metadata, + ) + assert result.exit_code == 1 + pipeline_dir = fake_package_path / "pipelines" / "non_existing" + error_message = f"Error: Directory '{pipeline_dir}' doesn't exist." + assert error_message in result.output + + def test_package_empty_micropkg_dir( + self, fake_project_cli, fake_package_path, fake_metadata + ): + pipeline_dir = fake_package_path / "pipelines" / "empty_dir" + pipeline_dir.mkdir() + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", "pipelines.empty_dir"], + obj=fake_metadata, + ) + assert result.exit_code == 1 + error_message = f"Error: '{pipeline_dir}' is an empty directory." 
+        assert error_message in result.output
+
+    def test_package_modular_pipeline_with_nested_parameters(
+        self, fake_repo_path, fake_project_cli, fake_metadata
+    ):
+        """
+        The setup for the test is as follows:
+
+        Create two modular pipelines, to verify that only the parameter file with matching pipeline
+        name will be packaged.
+
+        Add a directory with a parameter file to verify that if a project has parameters structured
+        like below, that the ones inside a directory with the pipeline name are packaged as well
+        when calling `kedro micropkg package` for a specific pipeline.
+
+        parameters
+            └── retail
+                └── params1.yml
+        """
+        CliRunner().invoke(
+            fake_project_cli, ["pipeline", "create", "retail"], obj=fake_metadata
+        )
+        CliRunner().invoke(
+            fake_project_cli,
+            ["pipeline", "create", "retail_banking"],
+            obj=fake_metadata,
+        )
+        nested_param_path = Path(
+            fake_repo_path / "conf" / "base" / "parameters" / "retail"
+        )
+        nested_param_path.mkdir(parents=True, exist_ok=True)
+        (nested_param_path / "params1.yml").touch()
+
+        result = CliRunner().invoke(
+            fake_project_cli,
+            ["micropkg", "package", "pipelines.retail"],
+            obj=fake_metadata,
+        )
+
+        assert result.exit_code == 0
+        assert "'dummy_package.pipelines.retail' packaged!" in result.output
+
+        sdist_location = fake_repo_path / "dist"
+        assert f"Location: {sdist_location}" in result.output
+
+        sdist_name = _get_sdist_name(name="retail", version="0.1")
+        sdist_file = sdist_location / sdist_name
+        assert sdist_file.is_file()
+        assert len(list(sdist_location.iterdir())) == 1
+
+        with tarfile.open(sdist_file, "r") as tar:
+            sdist_contents = set(tar.getnames())
+        assert (
+            "retail-0.1/retail/config/parameters/retail/params1.yml" in sdist_contents
+        )
+        assert "retail-0.1/retail/config/parameters/retail.yml" in sdist_contents
+        assert (
+            "retail-0.1/retail/config/parameters/retail_banking.yml"
+            not in sdist_contents
+        )
+
+    def test_package_pipeline_with_deep_nested_parameters(
+        self, fake_repo_path, fake_project_cli, fake_metadata
+    ):
+        CliRunner().invoke(
+            fake_project_cli, ["pipeline", "create", "retail"], obj=fake_metadata
+        )
+        deep_nested_param_path = Path(
+            fake_repo_path / "conf" / "base" / "parameters" / "deep" / "retail"
+        )
+        deep_nested_param_path.mkdir(parents=True, exist_ok=True)
+        (deep_nested_param_path / "params1.yml").touch()
+
+        deep_nested_param_path2 = Path(
+            fake_repo_path / "conf" / "base" / "parameters" / "retail" / "deep"
+        )
+        deep_nested_param_path2.mkdir(parents=True, exist_ok=True)
+        (deep_nested_param_path2 / "params1.yml").touch()
+
+        deep_nested_param_path3 = Path(
+            fake_repo_path / "conf" / "base" / "parameters" / "deep"
+        )
+        deep_nested_param_path3.mkdir(parents=True, exist_ok=True)
+        (deep_nested_param_path3 / "retail.yml").touch()
+
+        super_deep_nested_param_path = Path(
+            fake_repo_path
+            / "conf"
+            / "base"
+            / "parameters"
+            / "a"
+            / "b"
+            / "c"
+            / "d"
+            / "retail"
+        )
+        super_deep_nested_param_path.mkdir(parents=True, exist_ok=True)
+        (super_deep_nested_param_path / "params3.yml").touch()
+        result = CliRunner().invoke(
+            fake_project_cli,
+            ["micropkg", "package", "pipelines.retail"],
+            obj=fake_metadata,
+        )
+
+        assert result.exit_code == 0
+        assert "'dummy_package.pipelines.retail' packaged!"
in result.output + + sdist_location = fake_repo_path / "dist" + assert f"Location: {sdist_location}" in result.output + + sdist_name = _get_sdist_name(name="retail", version="0.1") + sdist_file = sdist_location / sdist_name + assert sdist_file.is_file() + assert len(list(sdist_location.iterdir())) == 1 + + with tarfile.open(sdist_file, "r") as tar: + sdist_contents = set(tar.getnames()) + assert ( + "retail-0.1/retail/config/parameters/deep/retail/params1.yml" + in sdist_contents + ) + assert ( + "retail-0.1/retail/config/parameters/retail/deep/params1.yml" + in sdist_contents + ) + assert "retail-0.1/retail/config/parameters/retail.yml" in sdist_contents + assert "retail-0.1/retail/config/parameters/deep/retail.yml" in sdist_contents + assert ( + "retail-0.1/retail/config/parameters/a/b/c/d/retail/params3.yml" + in sdist_contents + ) + + def test_micropkg_package_default( + self, fake_repo_path, fake_package_path, fake_project_cli, fake_metadata + ): + _pipeline_name = "data_engineering" + + pipelines_dir = fake_package_path / "pipelines" / _pipeline_name + assert pipelines_dir.is_dir() + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", f"pipelines.{_pipeline_name}"], + obj=fake_metadata, + ) + assert result.exit_code == 0 + + # test for actual version + sdist_location = fake_repo_path / "dist" + sdist_name = _get_sdist_name(name=_pipeline_name, version="0.1") + sdist_file = sdist_location / sdist_name + + assert sdist_file.is_file() + assert len(list(sdist_location.iterdir())) == 1 + + def test_micropkg_package_nested_module( + self, fake_project_cli, fake_metadata, fake_repo_path, fake_package_path + ): + CliRunner().invoke( + fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata + ) + + nested_utils = fake_package_path / "pipelines" / PIPELINE_NAME / "utils" + nested_utils.mkdir(parents=True) + (nested_utils / "__init__.py").touch() + (nested_utils / "useful.py").touch() + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", f"pipelines.{PIPELINE_NAME}.utils"], + obj=fake_metadata, + ) + assert result.exit_code == 0 + + sdist_location = fake_repo_path / "dist" + sdist_name = _get_sdist_name(name="utils", version="0.1") + sdist_file = sdist_location / sdist_name + + assert sdist_file.is_file() + assert len(list(sdist_location.iterdir())) == 1 + + with tarfile.open(sdist_file, "r") as tar: + sdist_contents = set(tar.getnames()) + expected_files = { + "utils-0.1/utils/__init__.py", + "utils-0.1/utils/useful.py", + } + assert expected_files <= sdist_contents + assert f"{PIPELINE_NAME}/pipeline.py" not in sdist_contents + + +@pytest.mark.usefixtures( + "chdir_to_dummy_project", "cleanup_dist", "cleanup_pyproject_toml" +) +class TestMicropkgPackageFromManifest: + def test_micropkg_package_all( # pylint: disable=too-many-locals + self, fake_repo_path, fake_project_cli, fake_metadata, tmp_path, mocker + ): + # pylint: disable=import-outside-toplevel + from kedro.framework.cli import micropkg + + spy = mocker.spy(micropkg, "_package_micropkg") + pyproject_toml = fake_repo_path / "pyproject.toml" + other_dest = tmp_path / "here" + other_dest.mkdir() + project_toml_str = textwrap.dedent( + f""" + [tool.kedro.micropkg.package] + "pipelines.first" = {{destination = "{other_dest.as_posix()}"}} + "pipelines.second" = {{alias = "ds", env = "local"}} + "pipelines.third" = {{}} + """ + ) + with pyproject_toml.open(mode="a") as file: + file.write(project_toml_str) + + for name in ("first", "second", "third"): + CliRunner().invoke( + 
fake_project_cli, ["pipeline", "create", name], obj=fake_metadata + ) + + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "package", "--all"], obj=fake_metadata + ) + + assert result.exit_code == 0 + assert "Micro-packages packaged!" in result.output + assert spy.call_count == 3 + + build_config = toml.loads(project_toml_str) + package_manifest = build_config["tool"]["kedro"]["micropkg"]["package"] + for pipeline_name, packaging_specs in package_manifest.items(): + expected_call = mocker.call(pipeline_name, fake_metadata, **packaging_specs) + assert expected_call in spy.call_args_list + + def test_micropkg_package_all_empty_toml( + self, fake_repo_path, fake_project_cli, fake_metadata, mocker + ): + # pylint: disable=import-outside-toplevel + from kedro.framework.cli import micropkg + + spy = mocker.spy(micropkg, "_package_micropkg") + pyproject_toml = fake_repo_path / "pyproject.toml" + with pyproject_toml.open(mode="a") as file: + file.write("\n[tool.kedro.micropkg.package]\n") + + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "package", "--all"], obj=fake_metadata + ) + + assert result.exit_code == 0 + expected_message = ( + "Nothing to package. Please update the 'pyproject.toml' " + "package manifest section." + ) + assert expected_message in result.output + assert not spy.called + + def test_invalid_toml(self, fake_repo_path, fake_project_cli, fake_metadata): + pyproject_toml = fake_repo_path / "pyproject.toml" + with pyproject_toml.open(mode="a") as file: + file.write("what/toml?") + + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "package", "--all"], obj=fake_metadata + ) + + assert result.exit_code + assert isinstance(result.exception, toml.TomlDecodeError) + + def test_micropkg_package_no_arg_provided(self, fake_project_cli, fake_metadata): + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "package"], obj=fake_metadata + ) + assert result.exit_code + expected_message = ( + "Please specify a micro-package name or add '--all' to package all micro-packages in " + "the 'pyproject.toml' package manifest section." 
+ ) + assert expected_message in result.output diff --git a/tests/framework/cli/micropkg/test_micropkg_pull.py b/tests/framework/cli/micropkg/test_micropkg_pull.py new file mode 100644 index 0000000000..9cbad00a90 --- /dev/null +++ b/tests/framework/cli/micropkg/test_micropkg_pull.py @@ -0,0 +1,953 @@ +import filecmp +import shutil +import tarfile +import textwrap +from pathlib import Path +from unittest.mock import Mock + +import pytest +import toml +import yaml +from click import ClickException +from click.testing import CliRunner + +from kedro.framework.cli.micropkg import _get_sdist_name, safe_extract +from kedro.framework.project import settings + +PIPELINE_NAME = "my_pipeline" + + +def call_pipeline_create(cli, metadata, pipeline_name=PIPELINE_NAME): + result = CliRunner().invoke( + cli, ["pipeline", "create", pipeline_name], obj=metadata + ) + assert result.exit_code == 0 + + +def call_micropkg_package( + cli, metadata, alias=None, destination=None, pipeline_name=PIPELINE_NAME +): + options = ["--alias", alias] if alias else [] + options += ["--destination", str(destination)] if destination else [] + result = CliRunner().invoke( + cli, + ["micropkg", "package", f"pipelines.{pipeline_name}", *options], + obj=metadata, + ) + assert result.exit_code == 0, result.output + + +def call_pipeline_delete(cli, metadata, pipeline_name=PIPELINE_NAME): + result = CliRunner().invoke( + cli, ["pipeline", "delete", "-y", pipeline_name], obj=metadata + ) + assert result.exit_code == 0 + + +@pytest.mark.usefixtures("chdir_to_dummy_project", "cleanup_dist") +class TestMicropkgPullCommand: + def assert_package_files_exist(self, source_path): + assert {f.name for f in source_path.iterdir()} == { + "__init__.py", + "nodes.py", + "pipeline.py", + "README.md", + } + + @pytest.mark.parametrize("env", [None, "local"]) + @pytest.mark.parametrize( + "alias, destination", + [ + (None, None), + ("aliased", None), + ("aliased", "pipelines"), + (None, "pipelines"), + ], + ) + def test_pull_local_sdist( + self, + fake_project_cli, + fake_repo_path, + fake_package_path, + env, + alias, + destination, + fake_metadata, + ): + """Test for pulling a valid sdist file locally.""" + # pylint: disable=too-many-locals + call_pipeline_create(fake_project_cli, fake_metadata) + call_micropkg_package(fake_project_cli, fake_metadata) + call_pipeline_delete(fake_project_cli, fake_metadata) + + source_path = fake_package_path / "pipelines" / PIPELINE_NAME + config_path = ( + fake_repo_path / settings.CONF_SOURCE / "base" / "pipelines" / PIPELINE_NAME + ) + test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME + # Make sure the files actually deleted before pulling from the sdist file. 
+ assert not source_path.exists() + assert not test_path.exists() + assert not config_path.exists() + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + assert sdist_file.is_file() + + options = ["-e", env] if env else [] + options += ["--alias", alias] if alias else [] + options += ["--destination", destination] if destination else [] + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "pull", str(sdist_file), *options], + obj=fake_metadata, + ) + assert result.exit_code == 0, result.output + assert "pulled and unpacked" in result.output + + pipeline_name = alias or PIPELINE_NAME + destination = destination or Path() + source_dest = fake_package_path / destination / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / destination / pipeline_name + config_env = env or "base" + params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / config_env + / "parameters" + / f"{pipeline_name}.yml" + ) + + self.assert_package_files_exist(source_dest) + assert params_config.is_file() + actual_test_files = {f.name for f in test_dest.iterdir()} + expected_test_files = {"__init__.py", "test_pipeline.py"} + assert actual_test_files == expected_test_files + + @pytest.mark.parametrize("env", [None, "local"]) + @pytest.mark.parametrize( + "alias, destination", + [ + (None, None), + ("aliased", None), + ("aliased", "pipelines"), + (None, "pipelines"), + ], + ) + def test_pull_local_sdist_compare( + self, + fake_project_cli, + fake_repo_path, + fake_package_path, + env, + alias, + destination, + fake_metadata, + ): + """Test for pulling a valid sdist file locally, unpack it + into another location and check that unpacked files + are identical to the ones in the original modular pipeline. + """ + # pylint: disable=too-many-locals + pipeline_name = "another_pipeline" + call_pipeline_create(fake_project_cli, fake_metadata) + call_micropkg_package(fake_project_cli, fake_metadata, alias=pipeline_name) + + source_path = fake_package_path / "pipelines" / PIPELINE_NAME + test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME + source_params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / "base" + / "parameters" + / f"{PIPELINE_NAME}.yml" + ) + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=pipeline_name, version="0.1") + ) + assert sdist_file.is_file() + + options = ["-e", env] if env else [] + options += ["--alias", alias] if alias else [] + options += ["--destination", destination] if destination else [] + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "pull", str(sdist_file), *options], + obj=fake_metadata, + ) + assert result.exit_code == 0, result.output + assert "pulled and unpacked" in result.output + + pipeline_name = alias or pipeline_name + destination = destination or Path() + source_dest = fake_package_path / destination / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / destination / pipeline_name + config_env = env or "base" + dest_params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / config_env + / "parameters" + / f"{pipeline_name}.yml" + ) + + assert not filecmp.dircmp(source_path, source_dest).diff_files + assert not filecmp.dircmp(test_path, test_dest).diff_files + assert source_params_config.read_bytes() == dest_params_config.read_bytes() + + def test_micropkg_pull_same_alias_package_name( + self, + fake_project_cli, + fake_repo_path, + fake_package_path, + fake_metadata, + ): + call_pipeline_create(fake_project_cli, fake_metadata) 
+ call_micropkg_package(fake_project_cli, fake_metadata) + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + + pipeline_name = PIPELINE_NAME + destination = "tools" + + result = CliRunner().invoke( + fake_project_cli, + [ + "micropkg", + "pull", + str(sdist_file), + "--destination", + destination, + "--alias", + pipeline_name, + ], + obj=fake_metadata, + ) + assert result.exit_code == 0, result.stderr + assert "pulled and unpacked" in result.output + + source_dest = fake_package_path / destination / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / destination / pipeline_name + config_env = "base" + params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / config_env + / "parameters" + / f"{pipeline_name}.yml" + ) + + self.assert_package_files_exist(source_dest) + assert params_config.is_file() + actual_test_files = {f.name for f in test_dest.iterdir()} + expected_test_files = {"__init__.py", "test_pipeline.py"} + assert actual_test_files == expected_test_files + + def test_micropkg_pull_nested_destination( + self, + fake_project_cli, + fake_repo_path, + fake_package_path, + fake_metadata, + ): + call_pipeline_create(fake_project_cli, fake_metadata) + call_micropkg_package(fake_project_cli, fake_metadata) + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + + pipeline_name = PIPELINE_NAME + destination = "pipelines/nested" + + result = CliRunner().invoke( + fake_project_cli, + [ + "micropkg", + "pull", + str(sdist_file), + "--destination", + destination, + "--alias", + pipeline_name, + ], + obj=fake_metadata, + ) + assert result.exit_code == 0, result.stderr + assert "pulled and unpacked" in result.output + + source_dest = fake_package_path / destination / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / destination / pipeline_name + config_env = "base" + params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / config_env + / "parameters" + / f"{pipeline_name}.yml" + ) + + self.assert_package_files_exist(source_dest) + assert params_config.is_file() + actual_test_files = {f.name for f in test_dest.iterdir()} + expected_test_files = {"__init__.py", "test_pipeline.py"} + assert actual_test_files == expected_test_files + + def test_micropkg_alias_refactors_imports( # pylint: disable=too-many-locals + self, fake_project_cli, fake_package_path, fake_repo_path, fake_metadata + ): + call_pipeline_create(fake_project_cli, fake_metadata) + pipeline_file = fake_package_path / "pipelines" / PIPELINE_NAME / "pipeline.py" + import_stmt = ( + f"import {fake_metadata.package_name}.pipelines.{PIPELINE_NAME}.nodes" + ) + with pipeline_file.open("a") as f: + f.write(import_stmt) + + package_alias = "alpha" + pull_alias = "beta" + pull_destination = "pipelines/lib" + + call_micropkg_package( + cli=fake_project_cli, metadata=fake_metadata, alias=package_alias + ) + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=package_alias, version="0.1") + ) + CliRunner().invoke( + fake_project_cli, ["micropkg", "pull", str(sdist_file)], obj=fake_metadata + ) + CliRunner().invoke( + fake_project_cli, + [ + "micropkg", + "pull", + str(sdist_file), + "--alias", + pull_alias, + "--destination", + pull_destination, + ], + obj=fake_metadata, + ) + pull = f"pipelines.lib.{pull_alias}" + for alias in (package_alias, pull): + alias_path = Path(*alias.split(".")) + path = fake_package_path / alias_path / "pipeline.py" + file_content = path.read_text() + expected_stmt = 
f"import {fake_metadata.package_name}.{alias}.nodes" + assert expected_stmt in file_content + + def test_micropkg_pull_from_aliased_pipeline_conflicting_name( + self, fake_project_cli, fake_package_path, fake_repo_path, fake_metadata + ): + package_name = fake_metadata.package_name + call_pipeline_create(fake_project_cli, fake_metadata) + pipeline_file = fake_package_path / "pipelines" / PIPELINE_NAME / "pipeline.py" + import_stmt = f"import {package_name}.pipelines.{PIPELINE_NAME}.nodes" + with pipeline_file.open("a") as f: + f.write(import_stmt) + + call_micropkg_package( + cli=fake_project_cli, metadata=fake_metadata, alias=package_name + ) + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=package_name, version="0.1") + ) + assert sdist_file.is_file() + + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "pull", str(sdist_file)], obj=fake_metadata + ) + assert result.exit_code == 0, result.output + + path = fake_package_path / package_name / "pipeline.py" + file_content = path.read_text() + expected_stmt = f"import {package_name}.{package_name}.nodes" + assert expected_stmt in file_content + + def test_micropkg_pull_as_aliased_pipeline_conflicting_name( + self, fake_project_cli, fake_package_path, fake_repo_path, fake_metadata + ): + package_name = fake_metadata.package_name + call_pipeline_create(fake_project_cli, fake_metadata) + pipeline_file = fake_package_path / "pipelines" / PIPELINE_NAME / "pipeline.py" + import_stmt = f"import {package_name}.pipelines.{PIPELINE_NAME}.nodes" + with pipeline_file.open("a") as f: + f.write(import_stmt) + + call_micropkg_package(cli=fake_project_cli, metadata=fake_metadata) + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + assert sdist_file.is_file() + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "pull", str(sdist_file), "--alias", package_name], + obj=fake_metadata, + ) + assert result.exit_code == 0, result.output + path = fake_package_path / package_name / "pipeline.py" + file_content = path.read_text() + expected_stmt = f"import {package_name}.{package_name}.nodes" + assert expected_stmt in file_content + + def test_pull_sdist_fs_args( + self, fake_project_cli, fake_repo_path, mocker, tmp_path, fake_metadata + ): + """Test for pulling a sdist file with custom fs_args specified.""" + call_pipeline_create(fake_project_cli, fake_metadata) + call_micropkg_package(fake_project_cli, fake_metadata) + call_pipeline_delete(fake_project_cli, fake_metadata) + + fs_args_config = tmp_path / "fs_args_config.yml" + with fs_args_config.open(mode="w") as f: + yaml.dump({"fs_arg_1": 1, "fs_arg_2": {"fs_arg_2_nested_1": 2}}, f) + mocked_filesystem = mocker.patch("fsspec.filesystem") + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + + options = ["--fs-args", str(fs_args_config)] + CliRunner().invoke( + fake_project_cli, ["micropkg", "pull", str(sdist_file), *options] + ) + + mocked_filesystem.assert_called_once_with( + "file", fs_arg_1=1, fs_arg_2={"fs_arg_2_nested_1": 2} + ) + + @pytest.mark.parametrize("env", [None, "local"]) + @pytest.mark.parametrize("alias", [None, "alias_path"]) + def test_pull_tests_missing( + self, + fake_project_cli, + fake_repo_path, + fake_package_path, + env, + alias, + fake_metadata, + ): + """Test for pulling a valid sdist file locally, + but `tests` directory is missing from the sdist file. 
+ """ + # pylint: disable=too-many-locals + call_pipeline_create(fake_project_cli, fake_metadata) + test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME + shutil.rmtree(test_path) + assert not test_path.exists() + call_micropkg_package(fake_project_cli, fake_metadata) + call_pipeline_delete(fake_project_cli, fake_metadata) + + source_path = fake_package_path / "pipelines" / PIPELINE_NAME + source_params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / "base" + / "parameters" + / f"{PIPELINE_NAME}.yml" + ) + # Make sure the files actually deleted before pulling from the sdist file. + assert not source_path.exists() + assert not source_params_config.exists() + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + assert sdist_file.is_file() + + options = ["-e", env] if env else [] + options += ["--alias", alias] if alias else [] + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "pull", str(sdist_file), *options], + obj=fake_metadata, + ) + assert result.exit_code == 0 + + pipeline_name = alias or PIPELINE_NAME + source_dest = fake_package_path / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / pipeline_name + config_env = env or "base" + params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / config_env + / "parameters" + / f"{pipeline_name}.yml" + ) + + self.assert_package_files_exist(source_dest) + assert params_config.is_file() + assert not test_dest.exists() + + @pytest.mark.parametrize("env", [None, "local"]) + @pytest.mark.parametrize("alias", [None, "alias_path"]) + def test_pull_config_missing( + self, + fake_project_cli, + fake_repo_path, + fake_package_path, + env, + alias, + fake_metadata, + ): + """ + Test for pulling a valid sdist file locally, but `config` directory is missing + from the sdist file. + """ + # pylint: disable=too-many-locals + call_pipeline_create(fake_project_cli, fake_metadata) + source_params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / "base" + / "parameters" + / f"{PIPELINE_NAME}.yml" + ) + source_params_config.unlink() + call_micropkg_package(fake_project_cli, fake_metadata) + call_pipeline_delete(fake_project_cli, fake_metadata) + + source_path = fake_package_path / "pipelines" / PIPELINE_NAME + test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME + # Make sure the files actually deleted before pulling from the sdist file. 
+ assert not source_path.exists() + assert not test_path.exists() + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + assert sdist_file.is_file() + + options = ["-e", env] if env else [] + options += ["--alias", alias] if alias else [] + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "pull", str(sdist_file), *options], + obj=fake_metadata, + ) + assert result.exit_code == 0 + + pipeline_name = alias or PIPELINE_NAME + source_dest = fake_package_path / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / pipeline_name + config_env = env or "base" + dest_params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / config_env + / "parameters" + / f"{pipeline_name}.yml" + ) + + self.assert_package_files_exist(source_dest) + assert not dest_params_config.exists() + actual_test_files = {f.name for f in test_dest.iterdir()} + expected_test_files = {"__init__.py", "test_pipeline.py"} + assert actual_test_files == expected_test_files + + @pytest.mark.parametrize("env", [None, "local"]) + @pytest.mark.parametrize("alias", [None, "alias_path"]) + def test_pull_from_pypi( + self, + fake_project_cli, + fake_repo_path, + mocker, + tmp_path, + fake_package_path, + env, + alias, + fake_metadata, + ): + """ + Test for pulling a valid sdist file from pypi. + """ + # pylint: disable=too-many-locals + call_pipeline_create(fake_project_cli, fake_metadata) + # We mock the `pip download` call, and manually create a package sdist file + # to simulate the pypi scenario instead + call_micropkg_package(fake_project_cli, fake_metadata, destination=tmp_path) + version = "0.1" + sdist_file = tmp_path / _get_sdist_name(name=PIPELINE_NAME, version=version) + assert sdist_file.is_file() + call_pipeline_delete(fake_project_cli, fake_metadata) + + source_path = fake_package_path / "pipelines" / PIPELINE_NAME + test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME + source_params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / "base" + / "parameters" + / f"{PIPELINE_NAME}.yml" + ) + # Make sure the files actually deleted before pulling from pypi. 
+ assert not source_path.exists() + assert not test_path.exists() + assert not source_params_config.exists() + + python_call_mock = mocker.patch("kedro.framework.cli.micropkg.python_call") + mocker.patch( + "kedro.framework.cli.micropkg.tempfile.TemporaryDirectory", + return_value=tmp_path, + ) + + # Mock needed to avoid an error when build.util.project_wheel_metadata + # calls tempfile.TemporaryDirectory, which is mocked + class _FakeWheelMetadata: + def get_all(self, name, failobj=None): # pylint: disable=unused-argument + return [] + + mocker.patch( + "kedro.framework.cli.micropkg.project_wheel_metadata", + return_value=_FakeWheelMetadata(), + ) + + options = ["-e", env] if env else [] + options += ["--alias", alias] if alias else [] + + package_name = "my-pipeline" + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "pull", package_name, *options], + obj=fake_metadata, + ) + assert result.exit_code == 0 + assert "pulled and unpacked" in result.output + + python_call_mock.assert_called_once_with( + "pip", + [ + "download", + "--no-deps", + "--no-binary", + ":all:", + "--dest", + str(tmp_path), + package_name, + ], + ) + + pipeline_name = alias or PIPELINE_NAME + source_dest = fake_package_path / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / pipeline_name + config_env = env or "base" + dest_params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / config_env + / "parameters" + / f"{pipeline_name}.yml" + ) + + self.assert_package_files_exist(source_dest) + assert dest_params_config.is_file() + actual_test_files = {f.name for f in test_dest.iterdir()} + expected_test_files = {"__init__.py", "test_pipeline.py"} + assert actual_test_files == expected_test_files + + def test_invalid_pull_from_pypi( + self, fake_project_cli, mocker, tmp_path, fake_metadata + ): + """ + Test for pulling package from pypi, and it cannot be found. + """ + + pypi_error_message = ( + "ERROR: Could not find a version that satisfies the requirement" + ) + python_call_mock = mocker.patch( + "kedro.framework.cli.micropkg.python_call", + side_effect=ClickException(pypi_error_message), + ) + mocker.patch( + "kedro.framework.cli.micropkg.tempfile.TemporaryDirectory", + return_value=tmp_path, + ) + + invalid_pypi_name = "non_existent" + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "pull", invalid_pypi_name], obj=fake_metadata + ) + assert result.exit_code + + python_call_mock.assert_called_once_with( + "pip", + [ + "download", + "--no-deps", + "--no-binary", + ":all:", + "--dest", + str(tmp_path), + invalid_pypi_name, + ], + ) + + assert pypi_error_message in result.stdout + + def test_pull_from_pypi_more_than_one_sdist_file( + self, fake_project_cli, mocker, tmp_path, fake_metadata + ): + """ + Test for pulling a sdist file with `pip download`, but there are more than one sdist + file to unzip. 
+ """ + # We mock the `pip download` call, and manually create a package sdist file + # to simulate the pypi scenario instead + call_pipeline_create(fake_project_cli, fake_metadata) + call_micropkg_package(fake_project_cli, fake_metadata, destination=tmp_path) + call_micropkg_package( + fake_project_cli, fake_metadata, alias="another", destination=tmp_path + ) + mocker.patch("kedro.framework.cli.micropkg.python_call") + mocker.patch( + "kedro.framework.cli.micropkg.tempfile.TemporaryDirectory", + return_value=tmp_path, + ) + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "pull", PIPELINE_NAME], obj=fake_metadata + ) + + assert result.exit_code + assert "Error: More than 1 or no sdist files found:" in result.output + + def test_pull_unsupported_protocol_by_fsspec( + self, fake_project_cli, fake_metadata, tmp_path, mocker + ): + protocol = "unsupported" + exception_message = f"Protocol not known: {protocol}" + error_message = "Error: More than 1 or no sdist files found:" + package_path = f"{protocol}://{PIPELINE_NAME}" + + python_call_mock = mocker.patch("kedro.framework.cli.micropkg.python_call") + filesystem_mock = mocker.patch( + "fsspec.filesystem", side_effect=ValueError(exception_message) + ) + mocker.patch( + "kedro.framework.cli.micropkg.tempfile.TemporaryDirectory", + return_value=tmp_path, + ) + + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "pull", package_path], obj=fake_metadata + ) + + assert result.exit_code + filesystem_mock.assert_called_once_with(protocol) + python_call_mock.assert_called_once_with( + "pip", + [ + "download", + "--no-deps", + "--no-binary", + ":all:", + "--dest", + str(tmp_path), + package_path, + ], + ) + assert exception_message in result.output + assert "Trying to use 'pip download'..." in result.output + assert error_message in result.output + + def test_micropkg_pull_invalid_sdist( + self, fake_project_cli, fake_repo_path, fake_metadata, tmp_path + ): + """ + Test for pulling an invalid sdist file locally with more than one package. + """ + error_message = ( + "Invalid sdist was extracted: exactly one directory was expected" + ) + + call_pipeline_create(fake_project_cli, fake_metadata) + call_micropkg_package(fake_project_cli, fake_metadata) + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + assert sdist_file.is_file() + + with tarfile.open(sdist_file, "r:gz") as tar: + tar.extractall(tmp_path) + + # Create extra project + extra_project = tmp_path / f"{PIPELINE_NAME}-0.1_extra" + extra_project.mkdir() + (extra_project / "README.md").touch() + + # Recreate sdist + sdist_file.unlink() + with tarfile.open(sdist_file, "w:gz") as tar: + # Adapted from https://stackoverflow.com/a/65820259/554319 + for fn in tmp_path.iterdir(): + tar.add(fn, arcname=fn.relative_to(tmp_path)) + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "pull", str(sdist_file)], + obj=fake_metadata, + ) + assert result.exit_code == 1 + assert error_message in result.stdout + + def test_micropkg_pull_invalid_package_contents( + self, fake_project_cli, fake_repo_path, fake_metadata, tmp_path + ): + """ + Test for pulling an invalid sdist file locally with more than one package. 
+ """ + error_message = "Invalid package contents: exactly one package was expected" + + call_pipeline_create(fake_project_cli, fake_metadata) + call_micropkg_package(fake_project_cli, fake_metadata) + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + assert sdist_file.is_file() + + with tarfile.open(sdist_file, "r:gz") as tar: + tar.extractall(tmp_path) + + # Create extra package + extra_package = tmp_path / f"{PIPELINE_NAME}-0.1" / f"{PIPELINE_NAME}_extra" + extra_package.mkdir() + (extra_package / "__init__.py").touch() + + # Recreate sdist + sdist_file.unlink() + with tarfile.open(sdist_file, "w:gz") as tar: + # Adapted from https://stackoverflow.com/a/65820259/554319 + for fn in tmp_path.iterdir(): + tar.add(fn, arcname=fn.relative_to(tmp_path)) + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "pull", str(sdist_file)], + obj=fake_metadata, + ) + assert result.exit_code == 1 + assert error_message in result.stdout + + @pytest.mark.parametrize( + "tar_members,path_name", + [ + (["../tarmember", "tarmember"], "destination"), + (["tarmember", "../tarmember"], "destination"), + ], + ) + def test_path_traversal( + self, + tar_members, + path_name, + ): + """Test for checking path traversal attempt in tar file""" + tar = Mock() + tar.getmembers.return_value = [ + tarfile.TarInfo(name=tar_name) for tar_name in tar_members + ] + path = Path(path_name) + with pytest.raises(Exception, match="Failed to safely extract tar file."): + safe_extract(tar, path) + + +@pytest.mark.usefixtures( + "chdir_to_dummy_project", "cleanup_dist", "cleanup_pyproject_toml" +) +class TestMicropkgPullFromManifest: + def test_micropkg_pull_all( # pylint: disable=too-many-locals + self, fake_repo_path, fake_project_cli, fake_metadata, mocker + ): + # pylint: disable=import-outside-toplevel, line-too-long + from kedro.framework.cli import micropkg + + spy = mocker.spy(micropkg, "_pull_package") + pyproject_toml = fake_repo_path / "pyproject.toml" + sdist_file = str(fake_repo_path / "dist" / _get_sdist_name("{}", "0.1")) + project_toml_str = textwrap.dedent( + f""" + [tool.kedro.micropkg.pull] + "{sdist_file.format("first")}" = {{alias = "dp", destination = "pipelines"}} + "{sdist_file.format("second")}" = {{alias = "ds", destination = "pipelines", env = "local"}} + "{sdist_file.format("third")}" = {{}} + """ + ) + + with pyproject_toml.open(mode="a") as file: + file.write(project_toml_str) + + for name in ("first", "second", "third"): + call_pipeline_create(fake_project_cli, fake_metadata, pipeline_name=name) + call_micropkg_package(fake_project_cli, fake_metadata, pipeline_name=name) + call_pipeline_delete(fake_project_cli, fake_metadata, pipeline_name=name) + + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "pull", "--all"], obj=fake_metadata + ) + + assert result.exit_code == 0 + assert "Micro-packages pulled and unpacked!" 
in result.output + assert spy.call_count == 3 + + build_config = toml.loads(project_toml_str) + pull_manifest = build_config["tool"]["kedro"]["micropkg"]["pull"] + for sdist_file, pull_specs in pull_manifest.items(): + expected_call = mocker.call(sdist_file, fake_metadata, **pull_specs) + assert expected_call in spy.call_args_list + + def test_micropkg_pull_all_empty_toml( + self, fake_repo_path, fake_project_cli, fake_metadata, mocker + ): + # pylint: disable=import-outside-toplevel + from kedro.framework.cli import micropkg + + spy = mocker.spy(micropkg, "_pull_package") + pyproject_toml = fake_repo_path / "pyproject.toml" + with pyproject_toml.open(mode="a") as file: + file.write("\n[tool.kedro.micropkg.pull]\n") + + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "pull", "--all"], obj=fake_metadata + ) + + assert result.exit_code == 0 + expected_message = ( + "Nothing to pull. Please update the 'pyproject.toml' package " + "manifest section." + ) + assert expected_message in result.output + assert not spy.called + + def test_invalid_toml(self, fake_repo_path, fake_project_cli, fake_metadata): + pyproject_toml = fake_repo_path / "pyproject.toml" + with pyproject_toml.open(mode="a") as file: + file.write("what/toml?") + + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "pull", "--all"], obj=fake_metadata + ) + + assert result.exit_code + assert isinstance(result.exception, toml.TomlDecodeError) + + def test_micropkg_pull_no_arg_provided(self, fake_project_cli, fake_metadata): + result = CliRunner().invoke( + fake_project_cli, ["micropkg", "pull"], obj=fake_metadata + ) + assert result.exit_code + expected_message = ( + "Please specify a package path or add '--all' to pull all micro-packages in the" + " 'pyproject.toml' package manifest section." + ) + assert expected_message in result.output diff --git a/tests/framework/cli/micropkg/test_micropkg_requirements.py b/tests/framework/cli/micropkg/test_micropkg_requirements.py new file mode 100644 index 0000000000..b0070a1bee --- /dev/null +++ b/tests/framework/cli/micropkg/test_micropkg_requirements.py @@ -0,0 +1,271 @@ +import pytest +from click.testing import CliRunner + +from kedro.framework.cli.micropkg import _get_sdist_name, _safe_parse_requirements + +PIPELINE_NAME = "my_pipeline" + +# Inspired by test cases given in https://www.python.org/dev/peps/pep-0508/. +# These are all valid requirement specifications that can be used in both +# requirements.txt and in METADATA Requires-Dist. +SIMPLE_REQUIREMENTS = """A +A.B-C_D +aa +name +name<=1 +name>=3 +name>=3,<2 +name==1.2.3 +name!=1.2.3 # inline comment +# whole line comment +name@http://foo.com +name [fred,bar] @ http://foo.com ; python_version=='2.7' +name[quux, strange];python_version<'2.7' and platform_version=='2' +name; os_name=='a' or os_name=='b' +requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7" +pip @ https://github.com/pypa/pip/archive/1.3.1.zip#sha1=da9234ees +""" + +# These requirements can be used in requirements.txt but not in METADATA Requires-Dist. +# They cannot be parsed by packaging. 
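+# For example, packaging's Requirement("-r other_requirements.txt") raises
+# InvalidRequirement, so helpers such as _safe_parse_requirements are expected
+# to skip (rather than fail on) these lines when parsing requirements files.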
+COMPLEX_REQUIREMENTS = """--extra-index-url https://this.wont.work +-r other_requirements.txt +./path/to/package.whl +http://some.website.com/package.whl +""" + + +@pytest.mark.usefixtures("chdir_to_dummy_project", "cleanup_dist") +class TestMicropkgRequirements: + """Many of these tests follow the pattern: + - create a pipeline with some sort of requirements.txt + - package the pipeline/micro-package + - delete the pipeline and pull in the packaged one + - assert the project's modified requirements.txt is as expected + """ + + def call_pipeline_create(self, cli, metadata): + result = CliRunner().invoke( + cli, ["pipeline", "create", PIPELINE_NAME], obj=metadata + ) + assert result.exit_code == 0 + + def call_micropkg_package(self, cli, metadata): + result = CliRunner().invoke( + cli, + ["micropkg", "package", f"pipelines.{PIPELINE_NAME}"], + obj=metadata, + ) + assert result.exit_code == 0 + + def call_pipeline_delete(self, cli, metadata): + result = CliRunner().invoke( + cli, ["pipeline", "delete", "-y", PIPELINE_NAME], obj=metadata + ) + assert result.exit_code == 0 + + def call_micropkg_pull(self, cli, metadata, repo_path): + sdist_file = ( + repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + assert sdist_file.is_file() + + result = CliRunner().invoke( + cli, + ["micropkg", "pull", str(sdist_file)], + obj=metadata, + ) + assert result.exit_code == 0 + + def test_existing_complex_project_requirements_txt( + self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path + ): + """Pipeline requirements.txt and project requirements.txt.""" + project_requirements_txt = fake_repo_path / "src" / "requirements.txt" + with open(project_requirements_txt, "a", encoding="utf-8") as file: + file.write(COMPLEX_REQUIREMENTS) + existing_requirements = _safe_parse_requirements( + project_requirements_txt.read_text() + ) + + self.call_pipeline_create(fake_project_cli, fake_metadata) + pipeline_requirements_txt = ( + fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" + ) + pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS) + + self.call_micropkg_package(fake_project_cli, fake_metadata) + self.call_pipeline_delete(fake_project_cli, fake_metadata) + self.call_micropkg_pull(fake_project_cli, fake_metadata, fake_repo_path) + + packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS) + pulled_requirements = _safe_parse_requirements( + project_requirements_txt.read_text() + ) + # The project requirements.txt afterwards should be the requirements that already existed in + # project requirements.txt + those pulled in from pipeline requirements.txt. + # Unparseable COMPLEX_REQUIREMENTS should still be there. 
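+ # NOTE: the `|` below relies on _safe_parse_requirements returning a set of
+ # parsed requirements, so the comparison is a plain set-union check.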
+ assert pulled_requirements == existing_requirements | packaged_requirements + assert COMPLEX_REQUIREMENTS in project_requirements_txt.read_text() + + def test_existing_project_requirements_txt( + self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path + ): + """Pipeline requirements.txt and project requirements.txt.""" + project_requirements_txt = fake_repo_path / "src" / "requirements.txt" + existing_requirements = _safe_parse_requirements( + project_requirements_txt.read_text() + ) + + self.call_pipeline_create(fake_project_cli, fake_metadata) + pipeline_requirements_txt = ( + fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" + ) + pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS) + + self.call_micropkg_package(fake_project_cli, fake_metadata) + self.call_pipeline_delete(fake_project_cli, fake_metadata) + self.call_micropkg_pull(fake_project_cli, fake_metadata, fake_repo_path) + + packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS) + pulled_requirements = _safe_parse_requirements( + project_requirements_txt.read_text() + ) + # Project requirements.txt afterwards should be the requirements that already existed in + # project requirements.txt + those pulled in from pipeline requirements.txt. + assert pulled_requirements == existing_requirements | packaged_requirements + + def test_missing_project_requirements_txt( + self, + fake_project_cli, + fake_metadata, + fake_package_path, + fake_repo_path, + ): + """Pipeline requirements.txt without requirements.txt at + project level.""" + + # Remove project requirements.txt + project_requirements_txt = fake_repo_path / "src" / "requirements.txt" + project_requirements_txt.unlink() + + self.call_pipeline_create(fake_project_cli, fake_metadata) + pipeline_requirements_txt = ( + fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" + ) + + pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS) + packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS) + + self.call_micropkg_package(fake_project_cli, fake_metadata) + self.call_pipeline_delete(fake_project_cli, fake_metadata) + self.call_micropkg_pull(fake_project_cli, fake_metadata, fake_repo_path) + + assert project_requirements_txt.exists() + pulled_requirements = _safe_parse_requirements( + project_requirements_txt.read_text() + ) + assert packaged_requirements == pulled_requirements + + def test_no_requirements( + self, + fake_project_cli, + fake_metadata, + fake_repo_path, + ): + """No pipeline requirements.txt, and also no requirements.txt + at project level.""" + # Remove project requirements.txt + project_requirements_txt = fake_repo_path / "src" / "requirements.txt" + project_requirements_txt.unlink() + + self.call_pipeline_create(fake_project_cli, fake_metadata) + self.call_micropkg_package(fake_project_cli, fake_metadata) + self.call_pipeline_delete(fake_project_cli, fake_metadata) + self.call_micropkg_pull(fake_project_cli, fake_metadata, fake_repo_path) + + assert not project_requirements_txt.exists() + + def test_all_requirements_already_covered( + self, fake_project_cli, fake_metadata, fake_repo_path, fake_package_path + ): + """All requirements from pipeline requirements.txt already exist at project + level requirements.txt.""" + self.call_pipeline_create(fake_project_cli, fake_metadata) + pipeline_requirements_txt = ( + fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" + ) + project_requirements_txt = fake_repo_path / "src" / "requirements.txt" + 
pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS) + project_requirements_txt.write_text(SIMPLE_REQUIREMENTS) + + self.call_micropkg_package(fake_project_cli, fake_metadata) + self.call_pipeline_delete(fake_project_cli, fake_metadata) + self.call_micropkg_pull(fake_project_cli, fake_metadata, fake_repo_path) + + # Pipeline requirements.txt expected to be copied into project requirements.txt without any + # addition + assert project_requirements_txt.read_text() == SIMPLE_REQUIREMENTS + + def test_no_pipeline_requirements_txt( + self, fake_project_cli, fake_metadata, fake_repo_path + ): + """No pipeline requirements.txt and no project requirements.txt does not + create project requirements.txt.""" + + # Remove project requirements.txt + project_requirements_txt = fake_repo_path / "src" / "requirements.txt" + project_requirements_txt.unlink() + + self.call_pipeline_create(fake_project_cli, fake_metadata) + self.call_micropkg_package(fake_project_cli, fake_metadata) + self.call_pipeline_delete(fake_project_cli, fake_metadata) + self.call_micropkg_pull(fake_project_cli, fake_metadata, fake_repo_path) + + assert not project_requirements_txt.exists() + + def test_empty_pipeline_requirements_txt( + self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path + ): + """Empty pipeline requirements.txt and no project requirements.txt does not + create project requirements.txt.""" + + # Remove project requirements.txt + project_requirements_txt = fake_repo_path / "src" / "requirements.txt" + project_requirements_txt.unlink() + + self.call_pipeline_create(fake_project_cli, fake_metadata) + pipeline_requirements_txt = ( + fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" + ) + pipeline_requirements_txt.touch() + self.call_micropkg_package(fake_project_cli, fake_metadata) + self.call_pipeline_delete(fake_project_cli, fake_metadata) + self.call_micropkg_pull(fake_project_cli, fake_metadata, fake_repo_path) + + assert not project_requirements_txt.exists() + + @pytest.mark.parametrize("requirement", COMPLEX_REQUIREMENTS.splitlines()) + def test_complex_requirements( + self, requirement, fake_project_cli, fake_metadata, fake_package_path + ): + """Options that are valid in requirements.txt but cannot be packaged using + setup.py.""" + self.call_pipeline_create(fake_project_cli, fake_metadata) + pipeline_requirements_txt = ( + fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" + ) + pipeline_requirements_txt.write_text(requirement) + + result = CliRunner().invoke( + fake_project_cli, + ["micropkg", "package", f"pipelines.{PIPELINE_NAME}"], + obj=fake_metadata, + ) + assert result.exit_code == 1 + assert ( + "InvalidRequirement: Expected package name at the start of dependency specifier" + in result.output + or "InvalidRequirement: Expected end or semicolon" in result.output + or "InvalidRequirement: Parse error" in result.output + ) diff --git a/tests/framework/cli/pipeline/conftest.py b/tests/framework/cli/pipeline/conftest.py index eb5637d35b..f934ab6939 100644 --- a/tests/framework/cli/pipeline/conftest.py +++ b/tests/framework/cli/pipeline/conftest.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. import shutil import pytest @@ -33,10 +6,29 @@ @pytest.fixture(autouse=True) -def mocked_logging(mocker): - # Disable logging.config.dictConfig in KedroSession._setup_logging as - # it changes logging.config and affects other unit tests - return mocker.patch("logging.config.dictConfig") +def cleanup_micropackages(fake_repo_path, fake_package_path): + packages = {p.name for p in fake_package_path.iterdir() if p.is_dir()} + + yield + + created_packages = { + p.name + for p in fake_package_path.iterdir() + if p.is_dir() and p.name != "__pycache__" + } + created_packages -= packages + + for micropackage in created_packages: + shutil.rmtree(str(fake_package_path / micropackage)) + + confs = fake_repo_path / settings.CONF_SOURCE + for each in confs.rglob(f"*{micropackage}*"): + if each.is_file(): + each.unlink() + + tests = fake_repo_path / "src" / "tests" / micropackage + if tests.is_dir(): + shutil.rmtree(str(tests)) @pytest.fixture(autouse=True) @@ -61,23 +53,16 @@ def cleanup_pipelines(fake_repo_path, fake_package_path): if each.is_file(): each.unlink() - dirs_to_delete = ( - dirpath - for pattern in ("parameters", "catalog") - for dirpath in confs.rglob(pattern) - if dirpath.is_dir() and not any(dirpath.iterdir()) - ) - for dirpath in dirs_to_delete: - dirpath.rmdir() + for pattern in ("parameter", "catalog"): + for dirpath in confs.rglob(pattern): + if dirpath.is_dir() and not any(dirpath.iterdir()): + dirpath.rmdir() tests = fake_repo_path / "src" / "tests" / "pipelines" / pipeline if tests.is_dir(): shutil.rmtree(str(tests)) - # remove requirements.in and reset requirements.txt - requirements_in = fake_repo_path / "src" / "requirements.in" - if requirements_in.exists(): - requirements_in.unlink() + # reset requirements.txt requirements_txt.write_text(requirements) @@ -87,3 +72,13 @@ def cleanup_dist(fake_repo_path): dist_dir = fake_repo_path / "dist" if dist_dir.exists(): shutil.rmtree(str(dist_dir)) + + +@pytest.fixture +def cleanup_pyproject_toml(fake_repo_path): + pyproject_toml = fake_repo_path / "pyproject.toml" + existing_toml = pyproject_toml.read_text() + + yield + + pyproject_toml.write_text(existing_toml) diff --git a/tests/framework/cli/pipeline/test_pipeline.py b/tests/framework/cli/pipeline/test_pipeline.py index e8d1904feb..4bdd965526 100644 
--- a/tests/framework/cli/pipeline/test_pipeline.py +++ b/tests/framework/cli/pipeline/test_pipeline.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. import os import shutil from pathlib import Path @@ -68,16 +41,11 @@ def make_pipelines(request, fake_repo_path, fake_package_path, mocker): TOO_SHORT_ERROR = "It must be at least 2 characters long." -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures("chdir_to_dummy_project") class TestPipelineCreateCommand: @pytest.mark.parametrize("env", [None, "local"]) def test_create_pipeline( # pylint: disable=too-many-locals - self, - fake_repo_path, - fake_project_cli, - fake_metadata, - env, - fake_package_path, + self, fake_repo_path, fake_project_cli, fake_metadata, env, fake_package_path ): """Test creation of a pipeline""" pipelines_dir = fake_package_path / "pipelines" @@ -90,15 +58,11 @@ def test_create_pipeline( # pylint: disable=too-many-locals result = CliRunner().invoke(fake_project_cli, cmd, obj=fake_metadata) assert result.exit_code == 0 - assert ( - f"To be able to run the pipeline `{PIPELINE_NAME}`, you will need " - f"to add it to `register_pipelines()`" in result.output - ) # pipeline - assert f"Creating the pipeline `{PIPELINE_NAME}`: OK" in result.output - assert f"Location: `{pipelines_dir / PIPELINE_NAME}`" in result.output - assert f"Pipeline `{PIPELINE_NAME}` was successfully created." in result.output + assert f"Creating the pipeline '{PIPELINE_NAME}': OK" in result.output + assert f"Location: '{pipelines_dir / PIPELINE_NAME}'" in result.output + assert f"Pipeline '{PIPELINE_NAME}' was successfully created." in result.output # config conf_env = env or "base" @@ -124,12 +88,8 @@ def test_create_pipeline_skip_config( result = CliRunner().invoke(fake_project_cli, cmd, obj=fake_metadata) assert result.exit_code == 0 - assert ( - f"To be able to run the pipeline `{PIPELINE_NAME}`, you will need " - f"to add it to `register_pipelines()`" in result.output - ) - assert f"Creating the pipeline `{PIPELINE_NAME}`: OK" in result.output - assert f"Pipeline `{PIPELINE_NAME}` was successfully created." 
in result.output + assert f"Creating the pipeline '{PIPELINE_NAME}': OK" in result.output + assert f"Pipeline '{PIPELINE_NAME}' was successfully created." in result.output conf_dirs = list((fake_repo_path / settings.CONF_SOURCE).rglob(PIPELINE_NAME)) assert conf_dirs == [] # no configs created for the pipeline @@ -205,8 +165,8 @@ def test_skip_copy(self, fake_repo_path, fake_project_cli, fake_metadata): result = CliRunner().invoke(fake_project_cli, cmd, obj=fake_metadata) assert result.exit_code == 0 - assert "__init__.py`: SKIPPED" in result.output - assert f"parameters{os.sep}{PIPELINE_NAME}.yml`: SKIPPED" in result.output + assert "__init__.py': SKIPPED" in result.output + assert f"parameters{os.sep}{PIPELINE_NAME}.yml': SKIPPED" in result.output assert result.output.count("SKIPPED") == 2 # only 2 files skipped def test_failed_copy( @@ -272,7 +232,7 @@ def test_duplicate_pipeline_name( second = CliRunner().invoke(fake_project_cli, cmd, obj=fake_metadata) assert second.exit_code - assert f"Creating the pipeline `{PIPELINE_NAME}`: FAILED" in second.output + assert f"Creating the pipeline '{PIPELINE_NAME}': FAILED" in second.output assert "directory already exists" in second.output def test_bad_env(self, fake_project_cli, fake_metadata): @@ -281,10 +241,10 @@ def test_bad_env(self, fake_project_cli, fake_metadata): cmd = ["pipeline", "create", "-e", env, PIPELINE_NAME] result = CliRunner().invoke(fake_project_cli, cmd, obj=fake_metadata) assert result.exit_code - assert f"Unable to locate environment `{env}`" in result.output + assert f"Unable to locate environment '{env}'" in result.output -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "make_pipelines") +@pytest.mark.usefixtures("chdir_to_dummy_project", "make_pipelines") class TestPipelineDeleteCommand: @pytest.mark.parametrize( "make_pipelines,env,expected_conf", @@ -317,14 +277,14 @@ def test_delete_pipeline( / f"{PIPELINE_NAME}.yml" ) - assert f"Deleting `{source_path}`: OK" in result.output - assert f"Deleting `{tests_path}`: OK" in result.output - assert f"Deleting `{params_path}`: OK" in result.output + assert f"Deleting '{source_path}': OK" in result.output + assert f"Deleting '{tests_path}': OK" in result.output + assert f"Deleting '{params_path}': OK" in result.output - assert f"Pipeline `{PIPELINE_NAME}` was successfully deleted." in result.output + assert f"Pipeline '{PIPELINE_NAME}' was successfully deleted." in result.output assert ( - f"If you added the pipeline `{PIPELINE_NAME}` to `register_pipelines()` in " - f"`{fake_package_path / 'pipeline_registry.py'}`, you will need to remove it." + f"If you added the pipeline '{PIPELINE_NAME}' to 'register_pipelines()' in " + f"""'{fake_package_path / "pipeline_registry.py"}', you will need to remove it.""" ) in result.output assert not source_path.exists() @@ -353,14 +313,14 @@ def test_delete_pipeline_skip( / f"{PIPELINE_NAME}.yml" ) - assert f"Deleting `{source_path}`" not in result.output - assert f"Deleting `{tests_path}`: OK" in result.output - assert f"Deleting `{params_path}`: OK" in result.output + assert f"Deleting '{source_path}'" not in result.output + assert f"Deleting '{tests_path}': OK" in result.output + assert f"Deleting '{params_path}': OK" in result.output - assert f"Pipeline `{PIPELINE_NAME}` was successfully deleted." in result.output + assert f"Pipeline '{PIPELINE_NAME}' was successfully deleted." 
in result.output assert ( - f"If you added the pipeline `{PIPELINE_NAME}` to `register_pipelines()` in " - f"`{fake_package_path / 'pipeline_registry.py'}`, you will need to remove it." + f"If you added the pipeline '{PIPELINE_NAME}' to 'register_pipelines()' in " + f"""'{fake_package_path / "pipeline_registry.py"}', you will need to remove it.""" ) in result.output assert not source_path.exists() @@ -383,7 +343,7 @@ def test_delete_pipeline_fail( ) assert result.exit_code, result.output - assert f"Deleting `{source_path}`: FAILED" in result.output + assert f"Deleting '{source_path}': FAILED" in result.output @pytest.mark.parametrize( "bad_name,error_message", @@ -399,9 +359,7 @@ def test_bad_pipeline_name( ): """Test error message when bad pipeline name was provided.""" result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "delete", "-y", bad_name], - obj=fake_metadata, + fake_project_cli, ["pipeline", "delete", "-y", bad_name], obj=fake_metadata ) assert result.exit_code assert error_message in result.output @@ -413,7 +371,7 @@ def test_pipeline_not_found(self, fake_project_cli, fake_metadata): obj=fake_metadata, ) assert result.exit_code - assert "Pipeline `non_existent` not found." in result.output + assert "Pipeline 'non_existent' not found." in result.output def test_bad_env(self, fake_project_cli, fake_metadata): """Test error when provided conf environment does not exist.""" @@ -423,7 +381,7 @@ def test_bad_env(self, fake_project_cli, fake_metadata): obj=fake_metadata, ) assert result.exit_code - assert "Unable to locate environment `invalid_env`" in result.output + assert "Unable to locate environment 'invalid_env'" in result.output @pytest.mark.parametrize("input_", ["n", "N", "random"]) def test_pipeline_delete_confirmation( @@ -453,7 +411,7 @@ def test_pipeline_delete_confirmation( assert str(params_path) in result.output assert ( - f"Are you sure you want to delete pipeline `{PIPELINE_NAME}`" + f"Are you sure you want to delete pipeline '{PIPELINE_NAME}'" in result.output ) assert "Deletion aborted!" in result.output @@ -494,7 +452,7 @@ def test_pipeline_delete_confirmation_skip( assert str(params_path) in result.output assert ( - f"Are you sure you want to delete pipeline `{PIPELINE_NAME}`" + f"Are you sure you want to delete pipeline '{PIPELINE_NAME}'" in result.output ) assert "Deletion aborted!" 
in result.output @@ -514,7 +472,7 @@ def source(self, tmp_path) -> Path: source_dir.mkdir() (source_dir / "existing").mkdir() (source_dir / "existing" / "source_file").touch() - (source_dir / "existing" / "common").write_text("source") + (source_dir / "existing" / "common").write_text("source", encoding="utf-8") (source_dir / "new").mkdir() (source_dir / "new" / "source_file").touch() return source_dir @@ -525,7 +483,7 @@ def test_sync_target_exists(self, source, tmp_path): target.mkdir() (target / "existing").mkdir() (target / "existing" / "target_file").touch() - (target / "existing" / "common").write_text("target") + (target / "existing" / "common").write_text("target", encoding="utf-8") _sync_dirs(source, target) @@ -535,7 +493,7 @@ def test_sync_target_exists(self, source, tmp_path): assert (source / "new" / "source_file").is_file() assert (target / "existing" / "source_file").is_file() - assert (target / "existing" / "common").read_text() == "target" + assert (target / "existing" / "common").read_text(encoding="utf-8") == "target" assert (target / "existing" / "target_file").exists() assert (target / "new" / "source_file").is_file() @@ -551,6 +509,6 @@ def test_sync_no_target(self, source, tmp_path): assert (source / "new" / "source_file").is_file() assert (target / "existing" / "source_file").is_file() - assert (target / "existing" / "common").read_text() == "source" + assert (target / "existing" / "common").read_text(encoding="utf-8") == "source" assert not (target / "existing" / "target_file").exists() assert (target / "new" / "source_file").is_file() diff --git a/tests/framework/cli/pipeline/test_pipeline_package.py b/tests/framework/cli/pipeline/test_pipeline_package.py deleted file mode 100644 index c3c640bfa2..0000000000 --- a/tests/framework/cli/pipeline/test_pipeline_package.py +++ /dev/null @@ -1,412 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
-import textwrap -from pathlib import Path -from zipfile import ZipFile - -import pytest -import toml -from click.testing import CliRunner - -from kedro.framework.cli.pipeline import _get_wheel_name - -PIPELINE_NAME = "my_pipeline" - -LETTER_ERROR = "It must contain only letters, digits, and/or underscores." -FIRST_CHAR_ERROR = "It must start with a letter or underscore." -TOO_SHORT_ERROR = "It must be at least 2 characters long." - - -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "cleanup_dist") -class TestPipelinePackageCommand: - def assert_wheel_contents_correct( - self, wheel_location, package_name=PIPELINE_NAME, version="0.1" - ): - wheel_name = _get_wheel_name(name=package_name, version=version) - wheel_file = wheel_location / wheel_name - assert wheel_file.is_file() - assert len(list(wheel_location.iterdir())) == 1 - - # pylint: disable=consider-using-with - wheel_contents = set(ZipFile(str(wheel_file)).namelist()) - expected_files = { - f"{package_name}/__init__.py", - f"{package_name}/README.md", - f"{package_name}/nodes.py", - f"{package_name}/pipeline.py", - f"{package_name}/config/parameters/{package_name}.yml", - "tests/__init__.py", - "tests/test_pipeline.py", - } - assert expected_files <= wheel_contents - - @pytest.mark.parametrize( - "options,package_name,success_message", - [ - ([], PIPELINE_NAME, f"Pipeline `{PIPELINE_NAME}` packaged!"), - ( - ["--alias", "alternative"], - "alternative", - f"Pipeline `{PIPELINE_NAME}` packaged as `alternative`!", - ), - ], - ) - def test_package_pipeline( - self, - fake_repo_path, - fake_project_cli, - options, - package_name, - success_message, - fake_metadata, - ): - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata - ) - assert result.exit_code == 0 - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "package", PIPELINE_NAME] + options, - obj=fake_metadata, - ) - - assert result.exit_code == 0 - assert success_message in result.output - - wheel_location = fake_repo_path / "dist" - assert f"Location: {wheel_location}" in result.output - - self.assert_wheel_contents_correct( - wheel_location=wheel_location, package_name=package_name, version="0.1" - ) - - @pytest.mark.parametrize("existing_dir", [True, False]) - def test_pipeline_package_to_destination( - self, fake_project_cli, existing_dir, tmp_path, fake_metadata - ): - destination = (tmp_path / "in" / "here").resolve() - if existing_dir: - destination.mkdir(parents=True) - - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata - ) - assert result.exit_code == 0 - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "package", PIPELINE_NAME, "--destination", str(destination)], - obj=fake_metadata, - ) - - assert result.exit_code == 0 - success_message = ( - f"Pipeline `{PIPELINE_NAME}` packaged! 
Location: {destination}" - ) - assert success_message in result.output - - self.assert_wheel_contents_correct(wheel_location=destination) - - def test_pipeline_package_overwrites_wheel( - self, fake_project_cli, tmp_path, fake_metadata - ): - destination = (tmp_path / "in" / "here").resolve() - destination.mkdir(parents=True) - wheel_file = destination / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - wheel_file.touch() - - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata - ) - assert result.exit_code == 0 - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "package", PIPELINE_NAME, "--destination", str(destination)], - obj=fake_metadata, - ) - assert result.exit_code == 0 - - warning_message = f"Package file {wheel_file} will be overwritten!" - success_message = ( - f"Pipeline `{PIPELINE_NAME}` packaged! Location: {destination}" - ) - assert warning_message in result.output - assert success_message in result.output - - self.assert_wheel_contents_correct(wheel_location=destination) - - @pytest.mark.parametrize( - "bad_alias,error_message", - [ - ("bad name", LETTER_ERROR), - ("bad%name", LETTER_ERROR), - ("1bad", FIRST_CHAR_ERROR), - ("a", TOO_SHORT_ERROR), - ], - ) - def test_package_pipeline_bad_alias( - self, fake_project_cli, bad_alias, error_message - ): - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "package", PIPELINE_NAME, "--alias", bad_alias], - ) - assert result.exit_code - assert error_message in result.output - - def test_package_pipeline_no_config( - self, fake_repo_path, fake_project_cli, fake_metadata - ): - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "create", PIPELINE_NAME, "--skip-config"], - obj=fake_metadata, - ) - assert result.exit_code == 0 - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", PIPELINE_NAME], obj=fake_metadata - ) - - assert result.exit_code == 0 - assert f"Pipeline `{PIPELINE_NAME}` packaged!" in result.output - - wheel_location = fake_repo_path / "dist" - assert f"Location: {wheel_location}" in result.output - - # the wheel contents are slightly different (config shouldn't be included), - # which is why we can't call self.assert_wheel_contents_correct here - wheel_file = wheel_location / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - assert wheel_file.is_file() - assert len(list((fake_repo_path / "dist").iterdir())) == 1 - - # pylint: disable=consider-using-with - wheel_contents = set(ZipFile(str(wheel_file)).namelist()) - expected_files = { - f"{PIPELINE_NAME}/__init__.py", - f"{PIPELINE_NAME}/README.md", - f"{PIPELINE_NAME}/nodes.py", - f"{PIPELINE_NAME}/pipeline.py", - "tests/__init__.py", - "tests/test_pipeline.py", - } - assert expected_files <= wheel_contents - assert f"{PIPELINE_NAME}/config/parameters.yml" not in wheel_contents - - def test_package_non_existing_pipeline_dir( - self, fake_package_path, fake_project_cli, fake_metadata - ): - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "non_existing"], obj=fake_metadata - ) - assert result.exit_code == 1 - pipeline_dir = fake_package_path / "pipelines" / "non_existing" - error_message = f"Error: Directory '{pipeline_dir}' doesn't exist." 
- assert error_message in result.output - - def test_package_empty_pipeline_dir( - self, fake_project_cli, fake_package_path, fake_metadata - ): - pipeline_dir = fake_package_path / "pipelines" / "empty_dir" - pipeline_dir.mkdir() - - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "empty_dir"], obj=fake_metadata - ) - assert result.exit_code == 1 - error_message = f"Error: '{pipeline_dir}' is an empty directory." - assert error_message in result.output - - def test_package_modular_pipeline_with_nested_parameters( - self, fake_repo_path, fake_project_cli, fake_metadata - ): - """ - The setup for the test is as follows: - - Create two modular pipelines, to verify that only the parameter file with matching pipeline - name will be packaged. - - Add a directory with a parameter file to verify that if a project has parameters structured - like below, that the ones inside a directory with the pipeline name are packaged as well - when calling `kedro pipeline package` for a specific pipeline. - - parameters - └── retail - └── params1.ym - """ - CliRunner().invoke( - fake_project_cli, ["pipeline", "create", "retail"], obj=fake_metadata - ) - CliRunner().invoke( - fake_project_cli, - ["pipeline", "create", "retail_banking"], - obj=fake_metadata, - ) - nested_param_path = Path( - fake_repo_path / "conf" / "base" / "parameters" / "retail" - ) - nested_param_path.mkdir(parents=True, exist_ok=True) - (nested_param_path / "params1.yml").touch() - - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "retail"], obj=fake_metadata - ) - - assert result.exit_code == 0 - assert "Pipeline `retail` packaged!" in result.output - - wheel_location = fake_repo_path / "dist" - assert f"Location: {wheel_location}" in result.output - - wheel_name = _get_wheel_name(name="retail", version="0.1") - wheel_file = wheel_location / wheel_name - assert wheel_file.is_file() - assert len(list(wheel_location.iterdir())) == 1 - - # pylint: disable=consider-using-with - wheel_contents = set(ZipFile(str(wheel_file)).namelist()) - assert "retail/config/parameters/retail/params1.yml" in wheel_contents - assert "retail/config/parameters/retail.yml" in wheel_contents - assert "retail/config/parameters/retail_banking.yml" not in wheel_contents - - def test_pipeline_package_default( - self, fake_repo_path, fake_package_path, fake_project_cli, fake_metadata - ): - _pipeline_name = "data_engineering" - - pipelines_dir = fake_package_path / "pipelines" / _pipeline_name - assert pipelines_dir.is_dir() - - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", _pipeline_name], obj=fake_metadata - ) - assert result.exit_code == 0 - - # test for actual version - wheel_location = fake_repo_path / "dist" - wheel_name = _get_wheel_name(name=_pipeline_name, version="0.1") - wheel_file = wheel_location / wheel_name - - assert wheel_file.is_file() - assert len(list(wheel_location.iterdir())) == 1 - - -@pytest.fixture -def cleanup_pyproject_toml(fake_repo_path): - pyproject_toml = fake_repo_path / "pyproject.toml" - existing_toml = pyproject_toml.read_text() - - yield - - pyproject_toml.write_text(existing_toml) - - -@pytest.mark.usefixtures( - "chdir_to_dummy_project", "patch_log", "cleanup_dist", "cleanup_pyproject_toml" -) -class TestPipelinePackageFromManifest: - def test_pipeline_package_all( # pylint: disable=too-many-locals - self, fake_repo_path, fake_project_cli, fake_metadata, tmp_path, mocker - ): - # pylint: disable=import-outside-toplevel - from kedro.framework.cli import 
pipeline - - spy = mocker.spy(pipeline, "_package_pipeline") - pyproject_toml = fake_repo_path / "pyproject.toml" - other_dest = tmp_path / "here" - other_dest.mkdir() - project_toml_str = textwrap.dedent( - f""" - [tool.kedro.pipeline.package] - first = {{destination = "{other_dest.as_posix()}"}} - second = {{alias = "ds", env = "local"}} - third = {{}} - """ - ) - with pyproject_toml.open(mode="a") as file: - file.write(project_toml_str) - - for name in ("first", "second", "third"): - CliRunner().invoke( - fake_project_cli, ["pipeline", "create", name], obj=fake_metadata - ) - - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "--all"], obj=fake_metadata - ) - - assert result.exit_code == 0 - assert "Pipelines packaged!" in result.output - assert spy.call_count == 3 - - build_config = toml.loads(project_toml_str) - package_manifest = build_config["tool"]["kedro"]["pipeline"]["package"] - for pipeline_name, packaging_specs in package_manifest.items(): - expected_call = mocker.call(pipeline_name, fake_metadata, **packaging_specs) - assert expected_call in spy.call_args_list - - def test_pipeline_package_all_empty_toml( - self, fake_repo_path, fake_project_cli, fake_metadata, mocker - ): - # pylint: disable=import-outside-toplevel - from kedro.framework.cli import pipeline - - spy = mocker.spy(pipeline, "_package_pipeline") - pyproject_toml = fake_repo_path / "pyproject.toml" - with pyproject_toml.open(mode="a") as file: - file.write("\n[tool.kedro.pipeline.package]\n") - - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "--all"], obj=fake_metadata - ) - - assert result.exit_code == 0 - expected_message = "Nothing to package. Please update your `pyproject.toml`." - assert expected_message in result.output - assert not spy.called - - def test_invalid_toml(self, fake_repo_path, fake_project_cli, fake_metadata): - pyproject_toml = fake_repo_path / "pyproject.toml" - with pyproject_toml.open(mode="a") as file: - file.write("what/toml?") - - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "--all"], obj=fake_metadata - ) - - assert result.exit_code - assert isinstance(result.exception, toml.TomlDecodeError) - - def test_pipeline_package_no_arg_provided(self, fake_project_cli, fake_metadata): - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package"], obj=fake_metadata - ) - assert result.exit_code - expected_message = ( - "Please specify a pipeline name or add " - "'--all' to package all pipelines in `pyproject.toml`." - ) - assert expected_message in result.output diff --git a/tests/framework/cli/pipeline/test_pipeline_pull.py b/tests/framework/cli/pipeline/test_pipeline_pull.py deleted file mode 100644 index 10c300bb68..0000000000 --- a/tests/framework/cli/pipeline/test_pipeline_pull.py +++ /dev/null @@ -1,549 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -import filecmp -import shutil - -import pytest -import yaml -from click import ClickException -from click.testing import CliRunner - -from kedro.framework.cli.pipeline import _get_wheel_name -from kedro.framework.project import settings - -PIPELINE_NAME = "my_pipeline" - - -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "cleanup_dist") -class TestPipelinePullCommand: - def call_pipeline_create(self, cli, metadata): - result = CliRunner().invoke( - cli, ["pipeline", "create", PIPELINE_NAME], obj=metadata - ) - assert result.exit_code == 0 - - def call_pipeline_package(self, cli, metadata, alias=None, destination=None): - options = ["--alias", alias] if alias else [] - options += ["--destination", str(destination)] if destination else [] - result = CliRunner().invoke( - cli, - ["pipeline", "package", PIPELINE_NAME, *options], - obj=metadata, - ) - assert result.exit_code == 0 - - def call_pipeline_delete(self, cli, metadata): - result = CliRunner().invoke( - cli, ["pipeline", "delete", "-y", PIPELINE_NAME], obj=metadata - ) - assert result.exit_code == 0 - - def assert_package_files_exist(self, source_path): - assert {f.name for f in source_path.iterdir()} == { - "__init__.py", - "nodes.py", - "pipeline.py", - "README.md", - } - - @pytest.mark.parametrize("env", [None, "local"]) - @pytest.mark.parametrize("alias", [None, "alias_path"]) - def test_pull_local_whl( - self, - fake_project_cli, - fake_repo_path, - fake_package_path, - env, - alias, - fake_metadata, - ): - """ - Test for pulling a valid wheel file locally. - """ - # pylint: disable=too-many-locals - self.call_pipeline_create(fake_project_cli, fake_metadata) - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - - source_path = fake_package_path / "pipelines" / PIPELINE_NAME - config_path = ( - fake_repo_path / settings.CONF_SOURCE / "base" / "pipelines" / PIPELINE_NAME - ) - test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME - # Make sure the files actually deleted before pulling from the wheel file. 
- assert not source_path.exists() - assert not test_path.exists() - assert not config_path.exists() - - wheel_file = ( - fake_repo_path / "dist" / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - ) - assert wheel_file.is_file() - - options = ["-e", env] if env else [] - options += ["--alias", alias] if alias else [] - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "pull", str(wheel_file), *options], - obj=fake_metadata, - ) - assert result.exit_code == 0 - - pipeline_name = alias or PIPELINE_NAME - source_dest = fake_package_path / "pipelines" / pipeline_name - test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name - config_env = env or "base" - params_config = ( - fake_repo_path - / settings.CONF_SOURCE - / config_env - / "parameters" - / f"{pipeline_name}.yml" - ) - - self.assert_package_files_exist(source_dest) - assert params_config.is_file() - actual_test_files = {f.name for f in test_dest.iterdir()} - expected_test_files = {"__init__.py", "test_pipeline.py"} - assert actual_test_files == expected_test_files - - @pytest.mark.parametrize("env", [None, "local"]) - @pytest.mark.parametrize("alias", [None, "alias_path"]) - def test_pull_local_whl_compare( - self, - fake_project_cli, - fake_repo_path, - fake_package_path, - env, - alias, - fake_metadata, - ): - """ - Test for pulling a valid wheel file locally, unpack it into another location and - check that unpacked files are identical to the ones in the original modular pipeline. - """ - # pylint: disable=too-many-locals - pipeline_name = "another_pipeline" - self.call_pipeline_create(fake_project_cli, fake_metadata) - self.call_pipeline_package(fake_project_cli, fake_metadata, pipeline_name) - - source_path = fake_package_path / "pipelines" / PIPELINE_NAME - test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME - source_params_config = ( - fake_repo_path - / settings.CONF_SOURCE - / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" - ) - - wheel_file = ( - fake_repo_path / "dist" / _get_wheel_name(name=pipeline_name, version="0.1") - ) - assert wheel_file.is_file() - - options = ["-e", env] if env else [] - options += ["--alias", alias] if alias else [] - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "pull", str(wheel_file), *options], - obj=fake_metadata, - ) - assert result.exit_code == 0 - - pipeline_name = alias or pipeline_name - source_dest = fake_package_path / "pipelines" / pipeline_name - test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name - config_env = env or "base" - dest_params_config = ( - fake_repo_path - / settings.CONF_SOURCE - / config_env - / "parameters" - / f"{pipeline_name}.yml" - ) - - assert not filecmp.dircmp(source_path, source_dest).diff_files - assert not filecmp.dircmp(test_path, test_dest).diff_files - assert source_params_config.read_bytes() == dest_params_config.read_bytes() - - def test_pull_whl_fs_args( - self, fake_project_cli, fake_repo_path, mocker, tmp_path, fake_metadata - ): - """ - Test for pulling a wheel file with custom fs_args specified. 
- """ - self.call_pipeline_create(fake_project_cli, fake_metadata) - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - - fs_args_config = tmp_path / "fs_args_config.yml" - with fs_args_config.open(mode="w") as f: - yaml.dump({"fs_arg_1": 1, "fs_arg_2": {"fs_arg_2_nested_1": 2}}, f) - mocked_filesystem = mocker.patch("fsspec.filesystem") - - wheel_file = ( - fake_repo_path / "dist" / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - ) - - options = ["--fs-args", str(fs_args_config)] - CliRunner().invoke( - fake_project_cli, ["pipeline", "pull", str(wheel_file), *options] - ) - - mocked_filesystem.assert_called_once_with( - "file", fs_arg_1=1, fs_arg_2=dict(fs_arg_2_nested_1=2) - ) - - def test_pull_two_dist_info( - self, fake_project_cli, fake_repo_path, mocker, tmp_path, fake_metadata - ): - """ - Test for pulling a wheel file with more than one dist-info directory. - """ - self.call_pipeline_create(fake_project_cli, fake_metadata) - self.call_pipeline_package(fake_project_cli, fake_metadata) - wheel_file = ( - fake_repo_path / "dist" / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - ) - assert wheel_file.is_file() - - (tmp_path / "dummy.dist-info").mkdir() - - mocker.patch( - "kedro.framework.cli.pipeline.tempfile.TemporaryDirectory", - return_value=tmp_path, - ) - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "pull", str(wheel_file)], - obj=fake_metadata, - ) - assert result.exit_code - assert "Error: More than 1 or no dist-info files found" in result.output - - @pytest.mark.parametrize("env", [None, "local"]) - @pytest.mark.parametrize("alias", [None, "alias_path"]) - def test_pull_tests_missing( - self, - fake_project_cli, - fake_repo_path, - fake_package_path, - env, - alias, - fake_metadata, - ): - """ - Test for pulling a valid wheel file locally, but `tests` directory is missing - from the wheel file. - """ - # pylint: disable=too-many-locals - self.call_pipeline_create(fake_project_cli, fake_metadata) - test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME - shutil.rmtree(test_path) - assert not test_path.exists() - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - - source_path = fake_package_path / "pipelines" / PIPELINE_NAME - source_params_config = ( - fake_repo_path - / settings.CONF_SOURCE - / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" - ) - # Make sure the files actually deleted before pulling from the wheel file. 
- assert not source_path.exists() - assert not source_params_config.exists() - - wheel_file = ( - fake_repo_path / "dist" / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - ) - assert wheel_file.is_file() - - options = ["-e", env] if env else [] - options += ["--alias", alias] if alias else [] - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "pull", str(wheel_file), *options], - obj=fake_metadata, - ) - assert result.exit_code == 0 - - pipeline_name = alias or PIPELINE_NAME - source_dest = fake_package_path / "pipelines" / pipeline_name - test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name - config_env = env or "base" - params_config = ( - fake_repo_path - / settings.CONF_SOURCE - / config_env - / "parameters" - / f"{pipeline_name}.yml" - ) - - self.assert_package_files_exist(source_dest) - assert params_config.is_file() - assert not test_dest.exists() - - @pytest.mark.parametrize("env", [None, "local"]) - @pytest.mark.parametrize("alias", [None, "alias_path"]) - def test_pull_config_missing( - self, - fake_project_cli, - fake_repo_path, - fake_package_path, - env, - alias, - fake_metadata, - ): - """ - Test for pulling a valid wheel file locally, but `config` directory is missing - from the wheel file. - """ - # pylint: disable=too-many-locals - self.call_pipeline_create(fake_project_cli, fake_metadata) - source_params_config = ( - fake_repo_path - / settings.CONF_SOURCE - / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" - ) - source_params_config.unlink() - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - - source_path = fake_package_path / "pipelines" / PIPELINE_NAME - test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME - # Make sure the files actually deleted before pulling from the wheel file. - assert not source_path.exists() - assert not test_path.exists() - - wheel_file = ( - fake_repo_path / "dist" / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - ) - assert wheel_file.is_file() - - options = ["-e", env] if env else [] - options += ["--alias", alias] if alias else [] - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "pull", str(wheel_file), *options], - obj=fake_metadata, - ) - assert result.exit_code == 0 - - pipeline_name = alias or PIPELINE_NAME - source_dest = fake_package_path / "pipelines" / pipeline_name - test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name - config_env = env or "base" - dest_params_config = ( - fake_repo_path - / settings.CONF_SOURCE - / config_env - / "parameters" - / f"{pipeline_name}.yml" - ) - - self.assert_package_files_exist(source_dest) - assert not dest_params_config.exists() - actual_test_files = {f.name for f in test_dest.iterdir()} - expected_test_files = {"__init__.py", "test_pipeline.py"} - assert actual_test_files == expected_test_files - - @pytest.mark.parametrize("env", [None, "local"]) - @pytest.mark.parametrize("alias", [None, "alias_path"]) - def test_pull_from_pypi( - self, - fake_project_cli, - fake_repo_path, - mocker, - tmp_path, - fake_package_path, - env, - alias, - fake_metadata, - ): - """ - Test for pulling a valid wheel file from pypi. 
- """ - # pylint: disable=too-many-locals - self.call_pipeline_create(fake_project_cli, fake_metadata) - # We mock the `pip download` call, and manually create a package wheel file - # to simulate the pypi scenario instead - self.call_pipeline_package( - fake_project_cli, fake_metadata, destination=tmp_path - ) - wheel_file = tmp_path / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - assert wheel_file.is_file() - self.call_pipeline_delete(fake_project_cli, fake_metadata) - - source_path = fake_package_path / "pipelines" / PIPELINE_NAME - test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME - source_params_config = ( - fake_repo_path - / settings.CONF_SOURCE - / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" - ) - # Make sure the files actually deleted before pulling from pypi. - assert not source_path.exists() - assert not test_path.exists() - assert not source_params_config.exists() - - python_call_mock = mocker.patch("kedro.framework.cli.pipeline.python_call") - mocker.patch( - "kedro.framework.cli.pipeline.tempfile.TemporaryDirectory", - return_value=tmp_path, - ) - - options = ["-e", env] if env else [] - options += ["--alias", alias] if alias else [] - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "pull", PIPELINE_NAME, *options], - obj=fake_metadata, - ) - assert result.exit_code == 0 - - python_call_mock.assert_called_once_with( - "pip", - ["download", "--no-deps", "--dest", str(tmp_path), PIPELINE_NAME], - ) - - pipeline_name = alias or PIPELINE_NAME - source_dest = fake_package_path / "pipelines" / pipeline_name - test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name - config_env = env or "base" - dest_params_config = ( - fake_repo_path - / settings.CONF_SOURCE - / config_env - / "parameters" - / f"{pipeline_name}.yml" - ) - - self.assert_package_files_exist(source_dest) - assert dest_params_config.is_file() - actual_test_files = {f.name for f in test_dest.iterdir()} - expected_test_files = {"__init__.py", "test_pipeline.py"} - assert actual_test_files == expected_test_files - - def test_invalid_pull_from_pypi( - self, fake_project_cli, mocker, tmp_path, fake_metadata - ): - """ - Test for pulling package from pypi, and it cannot be found. - """ - - pypi_error_message = ( - "ERROR: Could not find a version that satisfies the requirement" - ) - python_call_mock = mocker.patch( - "kedro.framework.cli.pipeline.python_call", - side_effect=ClickException(pypi_error_message), - ) - mocker.patch( - "kedro.framework.cli.pipeline.tempfile.TemporaryDirectory", - return_value=tmp_path, - ) - - invalid_pypi_name = "non_existent" - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "pull", invalid_pypi_name], - obj=fake_metadata, - ) - assert result.exit_code - - python_call_mock.assert_called_once_with( - "pip", ["download", "--no-deps", "--dest", str(tmp_path), invalid_pypi_name] - ) - - assert pypi_error_message in result.stdout - - def test_pull_from_pypi_more_than_one_wheel_file( - self, fake_project_cli, mocker, tmp_path, fake_metadata - ): - """ - Test for pulling a wheel file with `pip download`, but there are more than one wheel - file to unzip. 
- """ - # We mock the `pip download` call, and manually create a package wheel file - # to simulate the pypi scenario instead - self.call_pipeline_create(fake_project_cli, fake_metadata) - self.call_pipeline_package( - fake_project_cli, fake_metadata, destination=tmp_path - ) - self.call_pipeline_package( - fake_project_cli, fake_metadata, alias="another", destination=tmp_path - ) - mocker.patch("kedro.framework.cli.pipeline.python_call") - mocker.patch( - "kedro.framework.cli.pipeline.tempfile.TemporaryDirectory", - return_value=tmp_path, - ) - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "pull", PIPELINE_NAME], obj=fake_metadata - ) - - assert result.exit_code - assert "Error: More than 1 or no wheel files found:" in result.output - - def test_pull_unsupported_protocol_by_fsspec( - self, fake_project_cli, fake_metadata, tmp_path, mocker - ): - protocol = "unsupported" - exception_message = f"Protocol not known: {protocol}" - error_message = "Error: More than 1 or no wheel files found:" - package_path = f"{protocol}://{PIPELINE_NAME}" - - python_call_mock = mocker.patch("kedro.framework.cli.pipeline.python_call") - filesystem_mock = mocker.patch( - "fsspec.filesystem", side_effect=ValueError(exception_message) - ) - mocker.patch( - "kedro.framework.cli.pipeline.tempfile.TemporaryDirectory", - return_value=tmp_path, - ) - - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "pull", package_path], obj=fake_metadata - ) - - assert result.exit_code - filesystem_mock.assert_called_once_with(protocol) - python_call_mock.assert_called_once_with( - "pip", ["download", "--no-deps", "--dest", str(tmp_path), package_path] - ) - assert exception_message in result.output - assert "Trying to use 'pip download'..." in result.output - assert error_message in result.output diff --git a/tests/framework/cli/pipeline/test_pipeline_requirements.py b/tests/framework/cli/pipeline/test_pipeline_requirements.py deleted file mode 100644 index f986612ab2..0000000000 --- a/tests/framework/cli/pipeline/test_pipeline_requirements.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
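The deleted pull tests above all follow the same round trip: create a modular pipeline, package it into a wheel, delete it from the project, then pull the wheel back in (optionally with `--alias`, `-e/--env` or `--fs-args`). A rough sketch of that flow driven from the command line, assuming a Kedro project of this era; the wheel filename is indicative only, since the tests compute it with `_get_wheel_name`:

    import subprocess

    # create -> package -> delete -> pull, mirroring call_pipeline_create,
    # call_pipeline_package, call_pipeline_delete and the pull assertions above.
    for cmd in (
        ["kedro", "pipeline", "create", "my_pipeline"],
        ["kedro", "pipeline", "package", "my_pipeline"],  # writes a wheel under dist/
        ["kedro", "pipeline", "delete", "-y", "my_pipeline"],
        ["kedro", "pipeline", "pull", "dist/my_pipeline-0.1-py3-none-any.whl"],
    ):
        subprocess.run(cmd, check=True)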
-import pkg_resources -import pytest -from click.testing import CliRunner - -from kedro.framework.cli.pipeline import _get_wheel_name - -PIPELINE_NAME = "my_pipeline" - -# Inspired by test cases given in https://www.python.org/dev/peps/pep-0508/. -# These are all valid requirement specifications that can be used in both -# requirements.txt and in METADATA Requires-Dist. -VALID_REQUIREMENTS = """A -A.B-C_D -aa -name -name<=1 -name>=3 -name>=3,<2 -name==1.2.3 -name!=1.2.3 # inline comment -# whole line comment -name@http://foo.com -name [fred,bar] @ http://foo.com ; python_version=='2.7' -name[quux, strange];python_version<'2.7' and platform_version=='2' -name; os_name=='a' or os_name=='b' -requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7" -pip @ https://github.com/pypa/pip/archive/1.3.1.zip#sha1=da9234ees -""" - - -@pytest.mark.usefixtures("chdir_to_dummy_project", "cleanup_dist") -class TestPipelineRequirements: - """Many of these tests follow the pattern: - - create a pipeline with some sort of requirements.txt - - package the pipeline - - delete the pipeline and pull in the packaged one - - assert the project's modified requirements.in is as expected - """ - - def call_pipeline_create(self, cli, metadata): - result = CliRunner().invoke( - cli, ["pipeline", "create", PIPELINE_NAME], obj=metadata - ) - assert result.exit_code == 0 - - def call_pipeline_package(self, cli, metadata): - result = CliRunner().invoke( - cli, - ["pipeline", "package", PIPELINE_NAME], - obj=metadata, - ) - assert result.exit_code == 0 - - def call_pipeline_delete(self, cli, metadata): - result = CliRunner().invoke( - cli, ["pipeline", "delete", "-y", PIPELINE_NAME], obj=metadata - ) - assert result.exit_code == 0 - - def call_pipeline_pull(self, cli, metadata, repo_path): - wheel_file = ( - repo_path / "dist" / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - ) - assert wheel_file.is_file() - - result = CliRunner().invoke( - cli, - ["pipeline", "pull", str(wheel_file)], - obj=metadata, - ) - assert result.exit_code == 0 - - def test_existing_project_requirements_txt( - self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path - ): - """Pipeline requirements.txt and project requirements.txt, but no project - requirements.in.""" - self.call_pipeline_create(fake_project_cli, fake_metadata) - pipeline_requirements_txt = ( - fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" - ) - pipeline_requirements_txt.write_text(VALID_REQUIREMENTS) - - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - - packaged_requirements = pkg_resources.parse_requirements(VALID_REQUIREMENTS) - project_requirements_in = fake_repo_path / "src" / "requirements.in" - pulled_requirements = pkg_resources.parse_requirements( - project_requirements_in.read_text() - ) - # Packaged requirements expected to be a subset of pulled requirements due to - # default project level requirements.txt (e.g. 
black, flake8), which should be - # preserved - assert set(packaged_requirements) <= set(pulled_requirements) - - def test_existing_project_requirements_in( - self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path - ): - """Pipeline requirements.txt and a pre-existing project requirements.in.""" - project_requirements_in = fake_repo_path / "src" / "requirements.in" - initial_dependency = "some_package==0.1.0" - project_requirements_in.write_text(initial_dependency) - self.call_pipeline_create(fake_project_cli, fake_metadata) - pipeline_requirements_txt = ( - fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" - ) - pipeline_requirements_txt.write_text(VALID_REQUIREMENTS) - - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - - packaged_requirements = pkg_resources.parse_requirements(VALID_REQUIREMENTS) - existing_requirements = pkg_resources.parse_requirements(initial_dependency) - pulled_requirements = pkg_resources.parse_requirements( - project_requirements_in.read_text() - ) - # Requirements after pulling a pipeline expected to be the union of - # requirements packaged and requirements already existing at project level - assert set(pulled_requirements) == set(packaged_requirements) | set( - existing_requirements - ) - - def test_missing_project_requirements_in_and_txt( - self, - fake_project_cli, - fake_metadata, - fake_package_path, - fake_repo_path, - ): - """Pipeline requirements.txt without requirements.in or requirements.txt at - project level.""" - # Remove project requirements.txt - project_requirements_txt = fake_repo_path / "src" / "requirements.txt" - project_requirements_txt.unlink() - - self.call_pipeline_create(fake_project_cli, fake_metadata) - pipeline_requirements_txt = ( - fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" - ) - - pipeline_requirements_txt.write_text(VALID_REQUIREMENTS) - packaged_requirements = pkg_resources.parse_requirements(VALID_REQUIREMENTS) - - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - - project_requirements_in = fake_repo_path / "src" / "requirements.in" - - assert not project_requirements_txt.exists() - assert project_requirements_in.exists() - pulled_requirements = pkg_resources.parse_requirements( - project_requirements_in.read_text() - ) - assert set(packaged_requirements) == set(pulled_requirements) - - def test_no_requirements( - self, - fake_project_cli, - fake_metadata, - fake_repo_path, - ): - """No pipeline requirements.txt, and also no requirements.in or requirements.txt - at project level.""" - # Remove project requirements.txt - project_requirements_txt = fake_repo_path / "src" / "requirements.txt" - project_requirements_txt.unlink() - - self.call_pipeline_create(fake_project_cli, fake_metadata) - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - - project_requirements_in = fake_repo_path / "src" / "requirements.in" - project_requirements_txt = fake_repo_path / "src" / "requirements.txt" - assert not project_requirements_txt.exists() - assert not project_requirements_in.exists() - - def test_all_requirements_already_covered( - self, 
fake_project_cli, fake_metadata, fake_repo_path, fake_package_path - ): - """All requirements from pipeline requirements.txt already exist at project - level requirements.txt.""" - self.call_pipeline_create(fake_project_cli, fake_metadata) - pipeline_requirements_txt = ( - fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" - ) - project_requirements_txt = fake_repo_path / "src" / "requirements.txt" - pipeline_requirements_txt.write_text(VALID_REQUIREMENTS) - project_requirements_txt.write_text(VALID_REQUIREMENTS) - - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - - # requirements.txt expected to be copied into requirements.in without any - # addition - project_requirements_in = fake_repo_path / "src" / "requirements.in" - assert project_requirements_in.exists() - assert project_requirements_in.read_text() == VALID_REQUIREMENTS - - def test_no_pipeline_requirements_txt( - self, fake_project_cli, fake_metadata, fake_repo_path - ): - """No pipeline requirements.txt and no project requirements.in does not - create project requirements.in.""" - self.call_pipeline_create(fake_project_cli, fake_metadata) - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - - project_requirements_in = fake_repo_path / "src" / "requirements.in" - assert not project_requirements_in.exists() - - def test_empty_pipeline_requirements_txt( - self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path - ): - """Empty pipeline requirements.txt and no project requirements.in does not - create project requirements.in.""" - self.call_pipeline_create(fake_project_cli, fake_metadata) - pipeline_requirements_txt = ( - fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" - ) - pipeline_requirements_txt.touch() - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - - project_requirements_in = fake_repo_path / "src" / "requirements.in" - assert not project_requirements_in.exists() - - @pytest.mark.parametrize( - "requirement", - [ - "--extra-index-url https://this.wont.work", - "-r other_requirements.txt", - "./path/to/package.whl", - "http://some.website.com/package.whl", - ], - ) - def test_invalid_requirements( - self, requirement, fake_project_cli, fake_metadata, fake_package_path - ): - """Options that are valid in requirements.txt but cannot be packaged using - setup.py.""" - self.call_pipeline_create(fake_project_cli, fake_metadata) - pipeline_requirements_txt = ( - fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" - ) - pipeline_requirements_txt.write_text(requirement) - - result = CliRunner().invoke( - fake_project_cli, - ["pipeline", "package", PIPELINE_NAME], - obj=fake_metadata, - ) - assert result.exit_code == 1 - assert "InvalidRequirement: Parse error" in result.output diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index 63bd66c684..9296f02465 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# 
you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - import shutil import pytest @@ -33,8 +5,9 @@ from click.testing import CliRunner from kedro.extras.datasets.pandas import CSVDataSet -from kedro.io import DataCatalog, MemoryDataSet -from kedro.pipeline import Pipeline, node +from kedro.io import DataCatalog, MemoryDataset +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline @pytest.fixture @@ -45,19 +18,15 @@ def fake_load_context(mocker): ) -@pytest.fixture(autouse=True) -def mocked_logging(mocker): - # Disable logging.config.dictConfig in KedroSession._setup_logging as - # it changes logging.config and affects other unit tests - return mocker.patch("logging.config.dictConfig") - - PIPELINE_NAME = "pipeline" @pytest.fixture def mock_pipelines(mocker): - dummy_pipelines = {PIPELINE_NAME: Pipeline([]), "second": Pipeline([])} + dummy_pipelines = { + PIPELINE_NAME: modular_pipeline([]), + "second": modular_pipeline([]), + } return mocker.patch("kedro.framework.cli.catalog.pipelines", dummy_pipelines) @@ -74,8 +43,8 @@ def test_list_all_pipelines(self, fake_project_cli, fake_metadata, mocker): assert not result.exit_code expected_dict = { - "DataSets in 'pipeline' pipeline": {}, - "DataSets in 'second' pipeline": {}, + "Datasets in 'pipeline' pipeline": {}, + "Datasets in 'second' pipeline": {}, } yaml_dump_mock.assert_called_once_with(expected_dict) @@ -89,7 +58,7 @@ def test_list_specific_pipelines(self, fake_project_cli, fake_metadata, mocker): ) assert not result.exit_code - expected_dict = {f"DataSets in '{PIPELINE_NAME}' pipeline": {}} + expected_dict = {f"Datasets in '{PIPELINE_NAME}' pipeline": {}} yaml_dump_mock.assert_called_once_with(expected_dict) def test_not_found_pipeline(self, fake_project_cli, fake_metadata): @@ -101,7 +70,7 @@ def test_not_found_pipeline(self, fake_project_cli, fake_metadata): assert result.exit_code expected_output = ( - "Error: `fake` pipeline not found! Existing pipelines: pipeline, second" + "Error: 'fake' pipeline not found! 
Existing pipelines: pipeline, second" ) assert expected_output in result.output @@ -112,9 +81,9 @@ def test_no_param_datasets_in_respose( mocked_context = fake_load_context.return_value catalog_data_sets = { "iris_data": CSVDataSet("test.csv"), - "intermediate": MemoryDataSet(), - "parameters": MemoryDataSet(), - "params:data_ratio": MemoryDataSet(), + "intermediate": MemoryDataset(), + "parameters": MemoryDataset(), + "params:data_ratio": MemoryDataset(), "not_used": CSVDataSet("test2.csv"), } @@ -134,15 +103,15 @@ def test_no_param_datasets_in_respose( assert not result.exit_code # 'parameters' and 'params:data_ratio' should not appear in the response expected_dict = { - f"DataSets in '{PIPELINE_NAME}' pipeline": { + f"Datasets in '{PIPELINE_NAME}' pipeline": { "Datasets mentioned in pipeline": { "CSVDataSet": ["iris_data"], - "MemoryDataSet": ["intermediate"], + "MemoryDataset": ["intermediate"], }, "Datasets not mentioned in pipeline": {"CSVDataSet": ["not_used"]}, } } - key = f"DataSets in '{PIPELINE_NAME}' pipeline" + key = f"Datasets in '{PIPELINE_NAME}' pipeline" assert yaml_dump_mock.call_count == 1 assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key] @@ -170,14 +139,14 @@ def test_default_dataset( assert not result.exit_code expected_dict = { - f"DataSets in '{PIPELINE_NAME}' pipeline": { + f"Datasets in '{PIPELINE_NAME}' pipeline": { "Datasets mentioned in pipeline": { "CSVDataSet": ["some_dataset"], - "DefaultDataSet": ["intermediate"], + "DefaultDataset": ["intermediate"], } } } - key = f"DataSets in '{PIPELINE_NAME}' pipeline" + key = f"Datasets in '{PIPELINE_NAME}' pipeline" assert yaml_dump_mock.call_count == 1 assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key] @@ -186,9 +155,9 @@ def identity(data): return data # pragma: no cover -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures("chdir_to_dummy_project") class TestCatalogCreateCommand: - PIPELINE_NAME = "de" + PIPELINE_NAME = "data_engineering" @staticmethod @pytest.fixture(params=["base"]) @@ -202,7 +171,7 @@ def catalog_path(request, fake_repo_path): def test_pipeline_argument_is_required(self, fake_project_cli): result = CliRunner().invoke(fake_project_cli, ["catalog", "create"]) assert result.exit_code - expected_output = "Error: Missing option '--pipeline'." + expected_output = "Error: Missing option '--pipeline' / '-p'." assert expected_output in result.output @pytest.mark.usefixtures("fake_load_context") @@ -217,7 +186,7 @@ def test_not_found_pipeline(self, fake_project_cli, fake_metadata, mock_pipeline existing_pipelines = ", ".join(sorted(mock_pipelines.keys())) expected_output = ( - f"Error: `fake` pipeline not found! Existing " + f"Error: 'fake' pipeline not found! 
Existing " f"pipelines: {existing_pipelines}\n" ) assert expected_output in result.output @@ -241,10 +210,10 @@ def test_catalog_is_created_in_base_by_default( assert data_catalog_file.is_file() expected_catalog_config = { - "example_test_x": {"type": "MemoryDataSet"}, - "example_test_y": {"type": "MemoryDataSet"}, - "example_train_x": {"type": "MemoryDataSet"}, - "example_train_y": {"type": "MemoryDataSet"}, + "example_test_x": {"type": "MemoryDataset"}, + "example_test_y": {"type": "MemoryDataset"}, + "example_train_x": {"type": "MemoryDataset"}, + "example_train_y": {"type": "MemoryDataset"}, } catalog_config = yaml.safe_load(data_catalog_file.read_text()) assert catalog_config == expected_catalog_config @@ -281,7 +250,7 @@ def test_no_missing_datasets( } mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets) mocked_context.project_path = fake_repo_path - mock_pipelines[self.PIPELINE_NAME] = Pipeline( + mock_pipelines[self.PIPELINE_NAME] = modular_pipeline( [node(identity, "input_data", "output_data")] ) @@ -322,9 +291,9 @@ def test_missing_datasets_appended( expected_catalog_config = { "example_test_x": catalog_config["example_test_x"], - "example_test_y": {"type": "MemoryDataSet"}, - "example_train_x": {"type": "MemoryDataSet"}, - "example_train_y": {"type": "MemoryDataSet"}, + "example_test_y": {"type": "MemoryDataset"}, + "example_train_x": {"type": "MemoryDataset"}, + "example_train_y": {"type": "MemoryDataset"}, } catalog_config = yaml.safe_load(data_catalog_file.read_text()) assert catalog_config == expected_catalog_config @@ -337,4 +306,4 @@ def test_bad_env(self, fake_project_cli, fake_metadata): result = CliRunner().invoke(fake_project_cli, cmd, obj=fake_metadata) assert result.exit_code - assert "Unable to instantiate Kedro session" in result.output + assert "Unable to instantiate Kedro Catalog" in result.output diff --git a/tests/framework/cli/test_cli.py b/tests/framework/cli/test_cli.py index fbfbe24b58..a21aacc7c3 100644 --- a/tests/framework/cli/test_cli.py +++ b/tests/framework/cli/test_cli.py @@ -1,39 +1,12 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
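The catalog test changes above track two renames in newer Kedro: `MemoryDataSet` becomes `MemoryDataset` (and the CLI output reads "Datasets" rather than "DataSets"), while bare `Pipeline([...])` construction gives way to the modular `pipeline` factory. A small sketch of the updated fixture style, using only imports and calls that appear in the new version of the file:

    from kedro.io import DataCatalog, MemoryDataset
    from kedro.pipeline import node
    from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline


    def identity(data):
        return data


    # Dummy pipeline and catalog in the post-rename style used by the tests.
    dummy_pipeline = modular_pipeline([node(identity, "input_data", "output_data")])
    catalog = DataCatalog(data_sets={"input_data": MemoryDataset()})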
+# pylint: disable=too-many-lines from collections import namedtuple from itertools import cycle -from os.path import join +from os import rename from pathlib import Path import anyconfig import click from click.testing import CliRunner -from mock import patch from pytest import fixture, mark, raises from kedro import __version__ as version @@ -41,6 +14,7 @@ from kedro.framework.cli.catalog import catalog_cli from kedro.framework.cli.cli import KedroCLI, _init_plugins, cli from kedro.framework.cli.jupyter import jupyter_cli +from kedro.framework.cli.micropkg import micropkg_cli from kedro.framework.cli.pipeline import pipeline_cli from kedro.framework.cli.project import project_group from kedro.framework.cli.registry import registry_cli @@ -117,29 +91,19 @@ def test_print_version(self): assert result_abr.exit_code == 0 assert version in result_abr.output - def test_info_contains_qb(self): - """Check that `kedro info` output contains - reference to QuantumBlack.""" - result = CliRunner().invoke(cli, ["info"]) - - assert result.exit_code == 0 - assert "QuantumBlack" in result.output - - def test_info_contains_plugin_versions(self, entry_point, mocker): - get_distribution = mocker.patch("pkg_resources.get_distribution") - get_distribution().version = "1.0.2" - entry_point.module_name = "bob.fred" + def test_info_contains_plugin_versions(self, entry_point): + entry_point.dist.version = "1.0.2" + entry_point.module = "bob.fred" result = CliRunner().invoke(cli, ["info"]) assert result.exit_code == 0 assert ( - "bob: 1.0.2 (entry points:cli_hooks,global,hooks,init,line_magic,project)" + "bob: 1.0.2 (entry points:cli_hooks,global,hooks,init,line_magic,project,starters)" in result.output ) entry_point.load.assert_not_called() - @mark.usefixtures("entry_points") def test_info_no_plugins(self): result = CliRunner().invoke(cli, ["info"]) assert result.exit_code == 0 @@ -156,20 +120,6 @@ def test_help(self): assert result.exit_code == 0 assert "-h, --help Show this message and exit." 
in result.output - @patch("webbrowser.open") - def test_docs(self, patched_browser): - """Check that `kedro docs` opens a correct file in the browser.""" - result = CliRunner().invoke(cli, ["docs"]) - - assert result.exit_code == 0 - for each in ("Opening file", join("html", "index.html")): - assert each in result.output - - assert patched_browser.call_count == 1 - args, _ = patched_browser.call_args - for each in ("file://", join("kedro", "framework", "html", "index.html")): - assert each in args[0] - class TestCommandCollection: def test_found(self): @@ -322,37 +272,46 @@ def test_project_groups(self, entry_points, entry_point): entry_point.load.return_value = "groups" groups = load_entry_points("project") assert groups == ["groups"] - entry_points.assert_called_once_with(group="kedro.project_commands") + entry_points.return_value.select.assert_called_once_with( + group="kedro.project_commands" + ) - def test_project_error_is_caught(self, entry_points, entry_point): + def test_project_error_is_caught(self, entry_points, entry_point, caplog): entry_point.load.side_effect = Exception() - with raises(KedroCliError, match="Loading project commands"): - load_entry_points("project") - - entry_points.assert_called_once_with(group="kedro.project_commands") + entry_point.module = "project" + load_entry_points("project") + assert "Failed to load project commands" in caplog.text + entry_points.return_value.select.assert_called_once_with( + group="kedro.project_commands" + ) def test_global_groups(self, entry_points, entry_point): entry_point.load.return_value = "groups" groups = load_entry_points("global") assert groups == ["groups"] - entry_points.assert_called_once_with(group="kedro.global_commands") + entry_points.return_value.select.assert_called_once_with( + group="kedro.global_commands" + ) - def test_global_error_is_caught(self, entry_points, entry_point): + def test_global_error_is_caught(self, entry_points, entry_point, caplog): entry_point.load.side_effect = Exception() - with raises(KedroCliError, match="Loading global commands from"): - load_entry_points("global") - entry_points.assert_called_once_with(group="kedro.global_commands") + entry_point.module = "global" + load_entry_points("global") + assert "Failed to load global commands" in caplog.text + entry_points.return_value.select.assert_called_once_with( + group="kedro.global_commands" + ) def test_init(self, entry_points, entry_point): _init_plugins() - entry_points.assert_called_once_with(group="kedro.init") + entry_points.return_value.select.assert_called_once_with(group="kedro.init") entry_point.load().assert_called_once_with() def test_init_error_is_caught(self, entry_points, entry_point): - entry_point.load.side_effect = Exception() - with raises(KedroCliError, match="Initializing"): + entry_point.load.return_value.side_effect = Exception() + with raises(Exception): _init_plugins() - entry_points.assert_called_once_with(group="kedro.init") + entry_points.return_value.select.assert_called_once_with(group="kedro.init") class TestKedroCLI: @@ -366,11 +325,13 @@ def test_project_commands_no_clipy(self, mocker, fake_metadata): "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata ) kedro_cli = KedroCLI(fake_metadata.project_path) - assert len(kedro_cli.project_groups) == 5 + print(kedro_cli.project_groups) + assert len(kedro_cli.project_groups) == 6 assert kedro_cli.project_groups == [ catalog_cli, jupyter_cli, pipeline_cli, + micropkg_cli, project_group, registry_cli, ] @@ -403,11 +364,12 @@ def 
test_project_commands_valid_clipy(self, mocker, fake_metadata): "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata ) kedro_cli = KedroCLI(fake_metadata.project_path) - assert len(kedro_cli.project_groups) == 6 + assert len(kedro_cli.project_groups) == 7 assert kedro_cli.project_groups == [ catalog_cli, jupyter_cli, pipeline_cli, + micropkg_cli, project_group, registry_cli, cli, @@ -439,11 +401,12 @@ def test_kedro_cli_with_project(self, mocker, fake_metadata): assert len(kedro_cli.global_groups) == 2 assert kedro_cli.global_groups == [cli, create_cli] - assert len(kedro_cli.project_groups) == 6 + assert len(kedro_cli.project_groups) == 7 assert kedro_cli.project_groups == [ catalog_cli, jupyter_cli, pipeline_cli, + micropkg_cli, project_group, registry_cli, cli, @@ -455,7 +418,7 @@ def test_kedro_cli_with_project(self, mocker, fake_metadata): assert "Project specific commands from Kedro" in result.output -@mark.usefixtures("chdir_to_dummy_project", "patch_log") +@mark.usefixtures("chdir_to_dummy_project") class TestRunCommand: @staticmethod @fixture(params=["run_config.yml", "run_config.json"]) @@ -474,7 +437,7 @@ def fake_run_config(request, fake_root_dir): return config_path @staticmethod - @fixture() + @fixture def fake_run_config_with_params(fake_run_config, request): config = anyconfig.load(fake_run_config) config["run"].update(request.param) @@ -497,80 +460,144 @@ def test_run_successfully( to_outputs=[], load_versions={}, pipeline_name=None, + namespace=None, ) runner = fake_session.run.call_args_list[0][1]["runner"] assert isinstance(runner, SequentialRunner) assert not runner._is_async - def test_run_with_pipeline_filters( - self, fake_project_cli, fake_metadata, fake_session, mocker + @mark.parametrize( + "nodes_input, nodes_expected", + [ + ["splitting_data", ("splitting_data",)], + ["splitting_data,training_model", ("splitting_data", "training_model")], + ["splitting_data, training_model", ("splitting_data", "training_model")], + ], + ) + def test_run_specific_nodes( + self, + fake_project_cli, + fake_metadata, + fake_session, + mocker, + nodes_input, + nodes_expected, ): - from_nodes = ["--from-nodes", "splitting_data"] - to_nodes = ["--to-nodes", "training_model"] - tags = ["--tag", "de"] + nodes_command = "--nodes=" + nodes_input result = CliRunner().invoke( - fake_project_cli, ["run", *from_nodes, *to_nodes, *tags], obj=fake_metadata + fake_project_cli, ["run", nodes_command], obj=fake_metadata ) assert not result.exit_code fake_session.run.assert_called_once_with( - tags=("de",), + tags=(), runner=mocker.ANY, - node_names=(), - from_nodes=from_nodes[1:], - to_nodes=to_nodes[1:], + node_names=nodes_expected, + from_nodes=[], + to_nodes=[], from_inputs=[], to_outputs=[], load_versions={}, pipeline_name=None, + namespace=None, ) runner = fake_session.run.call_args_list[0][1]["runner"] assert isinstance(runner, SequentialRunner) assert not runner._is_async - def test_with_sequential_runner_and_parallel_flag( - self, fake_project_cli, fake_session + @mark.parametrize( + "tags_input, tags_expected", + [ + ["tag1", ("tag1",)], + ["tag1,tag2", ("tag1", "tag2")], + ["tag1, tag2", ("tag1", "tag2")], + ], + ) + def test_run_with_tags( + self, + fake_project_cli, + fake_metadata, + fake_session, + mocker, + tags_input, + tags_expected, ): + tags_command = "--tags=" + tags_input result = CliRunner().invoke( - fake_project_cli, ["run", "--parallel", "--runner=SequentialRunner"] + fake_project_cli, ["run", tags_command], obj=fake_metadata + ) + assert not 
result.exit_code + + fake_session.run.assert_called_once_with( + tags=tags_expected, + runner=mocker.ANY, + node_names=(), + from_nodes=[], + to_nodes=[], + from_inputs=[], + to_outputs=[], + load_versions={}, + pipeline_name=None, + namespace=None, ) - assert result.exit_code - assert "Please use either --parallel or --runner" in result.stdout - fake_session.return_value.run.assert_not_called() + runner = fake_session.run.call_args_list[0][1]["runner"] + assert isinstance(runner, SequentialRunner) + assert not runner._is_async - def test_run_successfully_parallel_via_flag( + def test_run_with_pipeline_filters( self, fake_project_cli, fake_metadata, fake_session, mocker ): + from_nodes = ["--from-nodes", "splitting_data"] + to_nodes = ["--to-nodes", "training_model"] + namespace = ["--namespace", "fake_namespace"] + tags = ["--tags", "de"] result = CliRunner().invoke( - fake_project_cli, ["run", "--parallel"], obj=fake_metadata + fake_project_cli, + ["run", *from_nodes, *to_nodes, *tags, *namespace], + obj=fake_metadata, ) assert not result.exit_code + fake_session.run.assert_called_once_with( - tags=(), + tags=("de",), runner=mocker.ANY, node_names=(), - from_nodes=[], - to_nodes=[], + from_nodes=from_nodes[1:], + to_nodes=to_nodes[1:], from_inputs=[], to_outputs=[], load_versions={}, pipeline_name=None, + namespace="fake_namespace", ) runner = fake_session.run.call_args_list[0][1]["runner"] - assert isinstance(runner, ParallelRunner) + assert isinstance(runner, SequentialRunner) assert not runner._is_async - def test_run_successfully_parallel_via_name( - self, fake_project_cli, fake_metadata, fake_session + def test_run_successfully_parallel( + self, fake_project_cli, fake_metadata, fake_session, mocker ): result = CliRunner().invoke( fake_project_cli, ["run", "--runner=ParallelRunner"], obj=fake_metadata ) assert not result.exit_code + fake_session.run.assert_called_once_with( + tags=(), + runner=mocker.ANY, + node_names=(), + from_nodes=[], + to_nodes=[], + from_inputs=[], + to_outputs=[], + load_versions={}, + pipeline_name=None, + namespace=None, + ) + runner = fake_session.run.call_args_list[0][1]["runner"] assert isinstance(runner, ParallelRunner) assert not runner._is_async @@ -608,6 +635,7 @@ def test_run_with_config( to_outputs=[], load_versions={}, pipeline_name="pipeline1", + namespace=None, ) @mark.parametrize( @@ -651,33 +679,43 @@ def test_run_with_params_in_config( to_outputs=[], load_versions={}, pipeline_name="pipeline1", + namespace=None, ) mock_session_create.assert_called_once_with( - env=mocker.ANY, extra_params=expected + env=mocker.ANY, conf_source=None, extra_params=expected ) @mark.parametrize( "cli_arg,expected_extra_params", [ ("foo:bar", {"foo": "bar"}), + ("foo=bar", {"foo": "bar"}), ( "foo:123.45, bar:1a,baz:678. 
,qux:1e-2,quux:0,quuz:", { "foo": 123.45, "bar": "1a", - "baz": 678, + "baz": 678.0, "qux": 0.01, "quux": 0, - "quuz": "", + "quuz": None, }, ), ("foo:bar,baz:fizz:buzz", {"foo": "bar", "baz": "fizz:buzz"}), + ("foo=fizz:buzz", {"foo": "fizz:buzz"}), + ("foo:fizz=buzz", {"foo": "fizz=buzz"}), ( "foo:bar, baz: https://example.com", {"foo": "bar", "baz": "https://example.com"}, ), + ("foo:bar, foo:fizz buzz", {"foo": "fizz buzz"}), ("foo:bar,baz:fizz buzz", {"foo": "bar", "baz": "fizz buzz"}), - ("foo:bar, foo : fizz buzz ", {"foo": "fizz buzz"}), + ("foo.nested:bar", {"foo": {"nested": "bar"}}), + ("foo.nested=123.45", {"foo": {"nested": 123.45}}), + ( + "foo.nested_1.double_nest:123.45,foo.nested_2:1a", + {"foo": {"nested_1": {"double_nest": 123.45}, "nested_2": "1a"}}, + ), ], ) def test_run_extra_params( @@ -696,7 +734,7 @@ def test_run_extra_params( assert not result.exit_code mock_session_create.assert_called_once_with( - env=mocker.ANY, extra_params=expected_extra_params + env=mocker.ANY, conf_source=None, extra_params=expected_extra_params ) @mark.parametrize("bad_arg", ["bad", "foo:bar,bad"]) @@ -706,7 +744,7 @@ def test_bad_extra_params(self, fake_project_cli, fake_metadata, bad_arg): ) assert result.exit_code assert ( - "Item `bad` must contain a key and a value separated by `:`" + "Item `bad` must contain a key and a value separated by `:` or `=`." in result.stdout ) @@ -741,6 +779,47 @@ def test_reformat_load_versions( to_outputs=[], load_versions={ds: t}, pipeline_name=None, + namespace=None, + ) + + @mark.parametrize( + "lv_input, lv_dict", + [ + [ + "dataset1:time1", + { + "dataset1": "time1", + }, + ], + [ + "dataset1:time1,dataset2:time2", + {"dataset1": "time1", "dataset2": "time2"}, + ], + [ + "dataset1:time1, dataset2:time2", + {"dataset1": "time1", "dataset2": "time2"}, + ], + ], + ) + def test_split_load_versions( + self, fake_project_cli, fake_metadata, fake_session, lv_input, lv_dict, mocker + ): + result = CliRunner().invoke( + fake_project_cli, ["run", "--load-versions", lv_input], obj=fake_metadata + ) + assert not result.exit_code, result.output + + fake_session.run.assert_called_once_with( + tags=(), + runner=mocker.ANY, + node_names=(), + from_nodes=[], + to_nodes=[], + from_inputs=[], + to_outputs=[], + load_versions=lv_dict, + pipeline_name=None, + namespace=None, ) def test_fail_reformat_load_versions(self, fake_project_cli, fake_metadata): @@ -751,8 +830,183 @@ def test_fail_reformat_load_versions(self, fake_project_cli, fake_metadata): assert result.exit_code, result.output expected_output = ( - f"Error: Expected the form of `load_version` to be " - f"`dataset_name:YYYY-MM-DDThh.mm.ss.sssZ`," + f"Error: Expected the form of 'load_version' to be " + f"'dataset_name:YYYY-MM-DDThh.mm.ss.sssZ'," f"found {load_version} instead\n" ) assert expected_output in result.output + + def test_fail_split_load_versions(self, fake_project_cli, fake_metadata): + load_version = "2020-05-12T12.00.00" + result = CliRunner().invoke( + fake_project_cli, + ["run", "--load-versions", load_version], + obj=fake_metadata, + ) + assert result.exit_code, result.output + + expected_output = ( + f"Error: Expected the form of 'load_version' to be " + f"'dataset_name:YYYY-MM-DDThh.mm.ss.sssZ'," + f"found {load_version} instead\n" + ) + assert expected_output in result.output + + @mark.parametrize( + "from_nodes, expected", + [ + (["--from-nodes", "A,B,C"], ["A", "B", "C"]), + ( + ["--from-nodes", "two_inputs([A0,B0]) -> [C1]"], + ["two_inputs([A0,B0]) -> [C1]"], + ), + ( + ["--from-nodes", 
"two_outputs([A0]) -> [B1,C1]"], + ["two_outputs([A0]) -> [B1,C1]"], + ), + ( + ["--from-nodes", "multi_in_out([A0,B0]) -> [C1,D1]"], + ["multi_in_out([A0,B0]) -> [C1,D1]"], + ), + ( + ["--from-nodes", "two_inputs([A0,B0]) -> [C1],X,Y,Z"], + ["two_inputs([A0,B0]) -> [C1]", "X", "Y", "Z"], + ), + ], + ) + def test_safe_split_option_arguments( + self, + fake_project_cli, + fake_metadata, + fake_session, + mocker, + from_nodes, + expected, + ): + CliRunner().invoke(fake_project_cli, ["run", *from_nodes], obj=fake_metadata) + + fake_session.run.assert_called_once_with( + tags=(), + runner=mocker.ANY, + node_names=(), + from_nodes=expected, + to_nodes=[], + from_inputs=[], + to_outputs=[], + load_versions={}, + pipeline_name=None, + namespace=None, + ) + + def test_run_with_alternative_conf_source(self, fake_project_cli, fake_metadata): + # check that Kedro runs successfully with an alternative conf_source + rename("conf", "alternate_conf") + result = CliRunner().invoke( + fake_project_cli, + ["run", "--conf-source", "alternate_conf"], + obj=fake_metadata, + ) + assert result.exit_code == 0 + + def test_run_with_non_existent_conf_source(self, fake_project_cli, fake_metadata): + # check that an error is thrown if target conf_source doesn't exist + result = CliRunner().invoke( + fake_project_cli, + ["run", "--conf-source", "nonexistent_dir"], + obj=fake_metadata, + ) + assert result.exit_code, result.output + expected_output = ( + "Error: Invalid value for '--conf-source': Path 'nonexistent_dir'" + " does not exist." + ) + assert expected_output in result.output + + # the following tests should be deleted in 0.19.0 + + def test_both_node_flags( + self, + fake_project_cli, + fake_metadata, + fake_session, + mocker, + ): + nodes_input = ["splitting_data", "training_model"] + nodes_expected = ("splitting_data", "training_model") + node_command = "--node=" + nodes_input[0] + nodes_command = "--nodes=" + nodes_input[1] + result = CliRunner().invoke( + fake_project_cli, ["run", node_command, nodes_command], obj=fake_metadata + ) + assert not result.exit_code + + fake_session.run.assert_called_once_with( + tags=(), + runner=mocker.ANY, + node_names=nodes_expected, + from_nodes=[], + to_nodes=[], + from_inputs=[], + to_outputs=[], + load_versions={}, + pipeline_name=None, + namespace=None, + ) + + def test_both_tag_flags( + self, + fake_project_cli, + fake_metadata, + fake_session, + mocker, + ): + tags_input = ["tag1", "tag2"] + tags_expected = ("tag1", "tag2") + tag_command = "--tag=" + tags_input[0] + tags_command = "--tags=" + tags_input[1] + result = CliRunner().invoke( + fake_project_cli, ["run", tag_command, tags_command], obj=fake_metadata + ) + assert not result.exit_code + + fake_session.run.assert_called_once_with( + tags=tags_expected, + runner=mocker.ANY, + node_names=(), + from_nodes=[], + to_nodes=[], + from_inputs=[], + to_outputs=[], + load_versions={}, + pipeline_name=None, + namespace=None, + ) + + def test_both_load_version_flags( + self, fake_project_cli, fake_metadata, fake_session, mocker + ): + lv_input = ["dataset1:time1", "dataset2:time2"] + lv_dict = {"dataset1": "time1", "dataset2": "time2"} + + load_version_command = "--load-version=" + lv_input[0] + load_versions_command = "--load-versions=" + lv_input[1] + + result = CliRunner().invoke( + fake_project_cli, + ["run", load_version_command, load_versions_command], + obj=fake_metadata, + ) + assert not result.exit_code, result.output + + fake_session.run.assert_called_once_with( + tags=(), + runner=mocker.ANY, + node_names=(), 
+ from_nodes=[], + to_nodes=[], + from_inputs=[], + to_outputs=[], + load_versions=lv_dict, + pipeline_name=None, + namespace=None, + ) diff --git a/tests/framework/cli/test_cli_hooks.py b/tests/framework/cli/test_cli_hooks.py index a7f870a4a4..0f7866f45f 100644 --- a/tests/framework/cli/test_cli_hooks.py +++ b/tests/framework/cli/test_cli_hooks.py @@ -1,39 +1,13 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. +from __future__ import annotations + import logging from collections import namedtuple -from typing import List import pytest from click.testing import CliRunner from kedro.framework.cli.cli import KedroCLI, cli -from kedro.framework.cli.hooks import cli_hook_impl +from kedro.framework.cli.hooks import cli_hook_impl, get_cli_hook_manager, manager from kedro.framework.startup import ProjectMetadata logger = logging.getLogger(__name__) @@ -43,6 +17,22 @@ ) +@pytest.fixture(autouse=True) +def reset_hook_manager(): + """Due to singleton nature of the `_cli_hook_manager`, the `_cli_hook_manager` + must be reset to `None` so that a new `CLIHookManager` gets created at the point + where `FakeEntryPoint` and `fake_plugin_distribution` exist within the same scope. + Additionally, this prevents `CLIHookManager` to be set from scope outside of this + testing module. 
+ """ + manager._cli_hook_manager = None + yield + hook_manager = get_cli_hook_manager() + plugins = hook_manager.get_plugins() + for plugin in plugins: + hook_manager.unregister(plugin) + + class FakeEntryPoint: name = "fake-plugin" group = "kedro.cli_hooks" @@ -54,12 +44,24 @@ class FakeCLIHooks: def before_command_run( self, project_metadata: ProjectMetadata, - command_args: List[str], + command_args: list[str], ): print( f"Before command `{' '.join(command_args)}` run for project {project_metadata}" ) + @cli_hook_impl + def after_command_run( + self, + project_metadata: ProjectMetadata, + command_args: list[str], + exit_code: int, + ): + print( + f"After command `{' '.join(command_args)}` run for project {project_metadata} " + f"(exit: {exit_code})" + ) + return FakeCLIHooks() @@ -72,7 +74,7 @@ def fake_plugin_distribution(mocker): version="0.1", ) mocker.patch( - "pluggy.manager.importlib_metadata.distributions", + "pluggy._manager.importlib_metadata.distributions", return_value=[fake_distribution], ) return fake_distribution @@ -80,17 +82,21 @@ def fake_plugin_distribution(mocker): class TestKedroCLIHooks: @pytest.mark.parametrize( - "command", - ["-V", "info", "pipeline list", "run --pipeline=test"], + "command, exit_code", + [("-V", 0), ("info", 2), ("pipeline list", 2), ("starter", 0)], ) def test_kedro_cli_should_invoke_cli_hooks_from_plugin( self, caplog, command, + exit_code, mocker, fake_metadata, fake_plugin_distribution, + entry_points, # pylint: disable=unused-argument ): + caplog.set_level(logging.DEBUG, logger="kedro") + Module = namedtuple("Module", ["cli"]) mocker.patch( "kedro.framework.cli.cli.importlib.import_module", @@ -115,3 +121,9 @@ def test_kedro_cli_should_invoke_cli_hooks_from_plugin( f"Before command `{command}` run for project {fake_metadata}" in result.output ) + + # 'pipeline list' and 'info' aren't actually in the click structure and + # return exit code 2 ('invalid usage of some shell built-in command') + assert ( + f"After command `{command}` run for project {fake_metadata} (exit: {exit_code})" + ) in result.output diff --git a/tests/framework/cli/test_jupyter.py b/tests/framework/cli/test_jupyter.py index 3df59f377c..8f363bac3e 100644 --- a/tests/framework/cli/test_jupyter.py +++ b/tests/framework/cli/test_jupyter.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. import json import shutil from pathlib import Path @@ -32,82 +5,14 @@ import pytest from click.testing import CliRunner -from jupyter_client.kernelspec import NATIVE_KERNEL_NAME, KernelSpecManager - -from kedro.framework.cli.jupyter import ( - SingleKernelSpecManager, - _export_nodes, - collect_line_magic, +from jupyter_client.kernelspec import ( + KernelSpecManager, + find_kernel_specs, + get_kernel_spec, ) -from kedro.framework.cli.utils import KedroCliError - - -@pytest.fixture(autouse=True) -def mocked_logging(mocker): - # Disable logging.config.dictConfig in KedroSession._setup_logging as - # it changes logging.config and affects other unit tests - return mocker.patch("logging.config.dictConfig") - - -def test_collect_line_magic(entry_points, entry_point): - entry_point.load.return_value = "line_magic" - line_magics = collect_line_magic() - assert line_magics == ["line_magic"] - entry_points.assert_called_once_with(group="kedro.line_magic") - - -class TestSingleKernelSpecManager: - def test_overridden_values(self): - assert SingleKernelSpecManager.whitelist == [NATIVE_KERNEL_NAME] - - def test_renaming_default_kernel(self, mocker): - """ - Make sure the default kernel display_name is changed. - """ - mocker.patch.object( - KernelSpecManager, - "get_kernel_spec", - return_value=mocker.Mock(display_name="default"), - ) - manager = SingleKernelSpecManager() - manager.default_kernel_name = "New Kernel Name" - new_kernel_spec = manager.get_kernel_spec(NATIVE_KERNEL_NAME) - assert new_kernel_spec.display_name == "New Kernel Name" - - def test_non_default_kernel_untouched(self, mocker): - """ - Make sure the non-default kernel display_name is not changed. - In theory the function will never be called like that, - but let's not make extra assumptions. 
- """ - mocker.patch.object( - KernelSpecManager, - "get_kernel_spec", - return_value=mocker.Mock(display_name="default"), - ) - manager = SingleKernelSpecManager() - manager.default_kernel_name = "New Kernel Name" - new_kernel_spec = manager.get_kernel_spec("another_kernel") - assert new_kernel_spec.display_name == "default" - - -def default_jupyter_options(command, address="127.0.0.1", all_kernels=False): - cmd = [ - command, - "--ip", - address, - "--MappingKernelManager.cull_idle_timeout=30", - "--MappingKernelManager.cull_interval=30", - ] - - if not all_kernels: - cmd += [ - "--NotebookApp.kernel_spec_manager_class=" - "kedro.framework.cli.jupyter.SingleKernelSpecManager", - "--KernelSpecManager.default_kernel_name='CLITestingProject'", - ] - return "jupyter", cmd +from kedro.framework.cli.jupyter import _create_kernel, _export_nodes +from kedro.framework.cli.utils import KedroCliError @pytest.fixture(autouse=True) @@ -116,142 +21,173 @@ def python_call_mock(mocker): @pytest.fixture -def fake_ipython_message(mocker): - return mocker.patch("kedro.framework.cli.jupyter.ipython_message") +def create_kernel_mock(mocker): + return mocker.patch("kedro.framework.cli.jupyter._create_kernel") -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") -class TestJupyterNotebookCommand: - def test_default_kernel( - self, python_call_mock, fake_project_cli, fake_ipython_message, fake_metadata - ): +@pytest.mark.usefixtures( + "chdir_to_dummy_project", "create_kernel_mock", "python_call_mock" +) +class TestJupyterSetupCommand: + def test_happy_path(self, fake_project_cli, fake_metadata, create_kernel_mock): result = CliRunner().invoke( fake_project_cli, - ["jupyter", "notebook", "--ip", "0.0.0.0"], + ["jupyter", "setup"], obj=fake_metadata, ) assert not result.exit_code, result.stdout - fake_ipython_message.assert_called_once_with(False) - python_call_mock.assert_called_once_with( - *default_jupyter_options("notebook", "0.0.0.0") + kernel_name = f"kedro_{fake_metadata.package_name}" + display_name = f"Kedro ({fake_metadata.package_name})" + create_kernel_mock.assert_called_once_with(kernel_name, display_name) + + def test_fail_no_jupyter(self, fake_project_cli, mocker): + mocker.patch.dict("sys.modules", {"notebook": None}) + result = CliRunner().invoke(fake_project_cli, ["jupyter", "notebook"]) + + assert result.exit_code + error = ( + "Module 'notebook' not found. Make sure to install required project " + "dependencies by running the 'pip install -r src/requirements.txt' command first." 
) + assert error in result.output + - def test_all_kernels( - self, python_call_mock, fake_project_cli, fake_ipython_message, fake_metadata +@pytest.mark.usefixtures( + "chdir_to_dummy_project", "create_kernel_mock", "python_call_mock" +) +class TestJupyterNotebookCommand: + def test_happy_path( + self, python_call_mock, fake_project_cli, fake_metadata, create_kernel_mock ): result = CliRunner().invoke( fake_project_cli, - ["jupyter", "notebook", "--all-kernels"], + ["jupyter", "notebook", "--random-arg", "value"], obj=fake_metadata, ) assert not result.exit_code, result.stdout - fake_ipython_message.assert_called_once_with(True) + kernel_name = f"kedro_{fake_metadata.package_name}" + display_name = f"Kedro ({fake_metadata.package_name})" + create_kernel_mock.assert_called_once_with(kernel_name, display_name) python_call_mock.assert_called_once_with( - *default_jupyter_options("notebook", all_kernels=True) - ) - - @pytest.mark.parametrize("help_flag", ["-h", "--help"]) - def test_help( - self, help_flag, fake_project_cli, fake_ipython_message, fake_metadata - ): - result = CliRunner().invoke( - fake_project_cli, ["jupyter", "notebook", help_flag], obj=fake_metadata + "jupyter", + [ + "notebook", + f"--MultiKernelManager.default_kernel_name={kernel_name}", + "--random-arg", + "value", + ], ) - assert not result.exit_code, result.stdout - fake_ipython_message.assert_not_called() - @pytest.mark.parametrize("env_flag", ["--env", "-e"]) - def test_env(self, env_flag, fake_project_cli, python_call_mock, fake_metadata): + @pytest.mark.parametrize("env_flag,env", [("--env", "base"), ("-e", "local")]) + def test_env(self, env_flag, env, fake_project_cli, fake_metadata, mocker): """This tests passing an environment variable to the jupyter subprocess.""" + mock_environ = mocker.patch("os.environ", {}) result = CliRunner().invoke( fake_project_cli, - ["jupyter", "notebook", env_flag, "base"], + ["jupyter", "notebook", env_flag, env], obj=fake_metadata, ) - assert not result.exit_code - - args, kwargs = python_call_mock.call_args - assert args == default_jupyter_options("notebook") - assert "env" in kwargs - assert kwargs["env"]["KEDRO_ENV"] == "base" + assert not result.exit_code, result.stdout + assert mock_environ["KEDRO_ENV"] == env - def test_fail_no_jupyter_core(self, fake_project_cli, mocker): - mocker.patch.dict("sys.modules", {"jupyter_core": None}) + def test_fail_no_jupyter(self, fake_project_cli, mocker): + mocker.patch.dict("sys.modules", {"notebook": None}) result = CliRunner().invoke(fake_project_cli, ["jupyter", "notebook"]) assert result.exit_code error = ( - "Module `jupyter_core` not found. Make sure to install required project " - "dependencies by running the `kedro install` command first." + "Module 'notebook' not found. Make sure to install required project " + "dependencies by running the 'pip install -r src/requirements.txt' command first." 
) assert error in result.output -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures( + "chdir_to_dummy_project", "create_kernel_mock", "python_call_mock" +) class TestJupyterLabCommand: - def test_default_kernel( - self, python_call_mock, fake_project_cli, fake_ipython_message, fake_metadata + def test_happy_path( + self, python_call_mock, fake_project_cli, fake_metadata, create_kernel_mock ): result = CliRunner().invoke( fake_project_cli, - ["jupyter", "lab", "--ip", "0.0.0.0"], + ["jupyter", "lab", "--random-arg", "value"], obj=fake_metadata, ) assert not result.exit_code, result.stdout - fake_ipython_message.assert_called_once_with(False) + kernel_name = f"kedro_{fake_metadata.package_name}" + display_name = f"Kedro ({fake_metadata.package_name})" + create_kernel_mock.assert_called_once_with(kernel_name, display_name) python_call_mock.assert_called_once_with( - *default_jupyter_options("lab", "0.0.0.0") - ) - - def test_all_kernels( - self, python_call_mock, fake_project_cli, fake_ipython_message, fake_metadata - ): - result = CliRunner().invoke( - fake_project_cli, ["jupyter", "lab", "--all-kernels"], obj=fake_metadata - ) - assert not result.exit_code, result.stdout - fake_ipython_message.assert_called_once_with(True) - python_call_mock.assert_called_once_with( - *default_jupyter_options("lab", all_kernels=True) - ) - - @pytest.mark.parametrize("help_flag", ["-h", "--help"]) - def test_help( - self, help_flag, fake_project_cli, fake_ipython_message, fake_metadata - ): - result = CliRunner().invoke( - fake_project_cli, ["jupyter", "lab", help_flag], obj=fake_metadata + "jupyter", + [ + "lab", + f"--MultiKernelManager.default_kernel_name={kernel_name}", + "--random-arg", + "value", + ], ) - assert not result.exit_code, result.stdout - fake_ipython_message.assert_not_called() - @pytest.mark.parametrize("env_flag", ["--env", "-e"]) - def test_env(self, env_flag, fake_project_cli, python_call_mock, fake_metadata): + @pytest.mark.parametrize("env_flag,env", [("--env", "base"), ("-e", "local")]) + def test_env(self, env_flag, env, fake_project_cli, fake_metadata, mocker): """This tests passing an environment variable to the jupyter subprocess.""" + mock_environ = mocker.patch("os.environ", {}) result = CliRunner().invoke( fake_project_cli, - ["jupyter", "lab", env_flag, "base"], + ["jupyter", "lab", env_flag, env], obj=fake_metadata, ) - assert not result.exit_code - - args, kwargs = python_call_mock.call_args - assert args == default_jupyter_options("lab") - assert "env" in kwargs - assert kwargs["env"]["KEDRO_ENV"] == "base" + assert not result.exit_code, result.stdout + assert mock_environ["KEDRO_ENV"] == env - def test_fail_no_jupyter_core(self, fake_project_cli, mocker): - mocker.patch.dict("sys.modules", {"jupyter_core": None}) + def test_fail_no_jupyter(self, fake_project_cli, mocker): + mocker.patch.dict("sys.modules", {"jupyterlab": None}) result = CliRunner().invoke(fake_project_cli, ["jupyter", "lab"]) assert result.exit_code error = ( - "Module `jupyter_core` not found. Make sure to install required project " - "dependencies by running the `kedro install` command first." + "Module 'jupyterlab' not found. Make sure to install required project " + "dependencies by running the 'pip install -r src/requirements.txt' command first." 
) assert error in result.output +@pytest.fixture +def cleanup_kernel(): + yield + if "my_kernel_name" in find_kernel_specs(): + KernelSpecManager().remove_kernel_spec("my_kernel_name") + + +@pytest.mark.usefixtures("cleanup_kernel") +class TestCreateKernel: + def test_create_new_kernel(self): + _create_kernel("my_kernel_name", "My display name") + kernel_spec = get_kernel_spec("my_kernel_name") + assert kernel_spec.display_name == "My display name" + assert kernel_spec.language == "python" + assert kernel_spec.argv[-2:] == ["--ext", "kedro.ipython"] + kernel_files = {file.name for file in Path(kernel_spec.resource_dir).iterdir()} + assert kernel_files == { + "kernel.json", + "logo-32x32.png", + "logo-64x64.png", + "logo-svg.svg", + } + + def test_kernel_install_replaces(self): + _create_kernel("my_kernel_name", "My display name 1") + _create_kernel("my_kernel_name", "My display name 2") + kernel_spec = get_kernel_spec("my_kernel_name") + assert kernel_spec.display_name == "My display name 2" + + def test_error(self, mocker): + mocker.patch("ipykernel.kernelspec.install", side_effect=ValueError) + pattern = "Cannot setup kedro kernel for Jupyter" + with pytest.raises(KedroCliError, match=pattern): + _create_kernel("my_kernel_name", "My display name") + + @pytest.fixture def cleanup_nodes_dir(fake_package_path): yield @@ -260,7 +196,7 @@ def cleanup_nodes_dir(fake_package_path): shutil.rmtree(str(nodes_dir)) -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "cleanup_nodes_dir") +@pytest.mark.usefixtures("chdir_to_dummy_project", "cleanup_nodes_dir") class TestConvertNotebookCommand: @pytest.fixture def fake_export_nodes(self, mocker): @@ -271,7 +207,7 @@ def tmp_file_path(self): with NamedTemporaryFile() as f: yield Path(f.name) - # pylint: disable=too-many-arguments + # noqa: too-many-arguments def test_convert_one_file_overwrite( self, mocker, @@ -363,7 +299,7 @@ def test_convert_without_filepath_and_all_flag( "add '--all' to convert all notebooks.\n" ) assert result.exit_code - assert result.stdout == expected_output + assert expected_output in result.stdout def test_non_unique_notebook_names_error( self, fake_project_cli, mocker, fake_metadata diff --git a/tests/framework/cli/test_project.py b/tests/framework/cli/test_project.py index 21b7be9859..d965113ea8 100644 --- a/tests/framework/cli/test_project.py +++ b/tests/framework/cli/test_project.py @@ -1,33 +1,4 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - # pylint: disable=unused-argument -import subprocess import sys from pathlib import Path @@ -37,13 +8,6 @@ from kedro.framework.cli.project import NO_DEPENDENCY_MESSAGE -@pytest.fixture(autouse=True) -def mocked_logging(mocker): - # Disable logging.config.dictConfig in KedroSession._setup_logging as - # it changes logging.config and affects other unit tests - return mocker.patch("logging.config.dictConfig") - - @pytest.fixture(autouse=True) def call_mock(mocker): return mocker.patch("kedro.framework.cli.project.call") @@ -54,17 +18,12 @@ def python_call_mock(mocker): return mocker.patch("kedro.framework.cli.project.python_call") -@pytest.fixture -def fake_ipython_message(mocker): - return mocker.patch("kedro.framework.cli.project.ipython_message") - - @pytest.fixture def fake_copyfile(mocker): return mocker.patch("shutil.copyfile") -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures("chdir_to_dummy_project") class TestActivateNbstripoutCommand: @staticmethod @pytest.fixture() @@ -99,9 +58,7 @@ def test_install_successfully( call_mock.assert_called_once_with(["nbstripout", "--install"]) fake_git_repo.assert_called_once_with( - ["git", "rev-parse", "--git-dir"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + ["git", "rev-parse", "--git-dir"], capture_output=True ) def test_nbstripout_not_installed( @@ -145,7 +102,7 @@ def test_no_git_executable( assert "Git executable not found. Install Git first." 
in result.stdout -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures("chdir_to_dummy_project") class TestTestCommand: def test_happy_path(self, fake_project_cli, python_call_mock): result = CliRunner().invoke(fake_project_cli, ["test", "--random-arg", "value"]) @@ -169,7 +126,7 @@ def test_pytest_not_installed( python_call_mock.assert_not_called() -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures("chdir_to_dummy_project") class TestLintCommand: @pytest.mark.parametrize("files", [(), ("src",)]) def test_lint( @@ -181,6 +138,7 @@ def test_lint( fake_repo_path, fake_metadata, ): + mocker.patch("kedro.framework.cli.project._check_module_importable") result = CliRunner().invoke( fake_project_cli, ["lint", *files], obj=fake_metadata ) @@ -193,7 +151,7 @@ def test_lint( expected_calls = [ mocker.call("black", expected_files), mocker.call("flake8", expected_files), - mocker.call("isort", ("-rc",) + expected_files), + mocker.call("isort", expected_files), ] assert python_call_mock.call_args_list == expected_calls @@ -217,6 +175,7 @@ def test_lint_check_only( fake_repo_path, fake_metadata, ): + mocker.patch("kedro.framework.cli.project._check_module_importable") result = CliRunner().invoke( fake_project_cli, ["lint", check_flag, *files], obj=fake_metadata ) @@ -229,22 +188,29 @@ def test_lint_check_only( expected_calls = [ mocker.call("black", ("--check",) + expected_files), mocker.call("flake8", expected_files), - mocker.call("isort", ("-c", "-rc") + expected_files), + mocker.call("isort", ("--check",) + expected_files), ] assert python_call_mock.call_args_list == expected_calls - @pytest.mark.parametrize("module_name", ["flake8", "isort"]) + @pytest.mark.parametrize( + "module_name,side_effects", + [("flake8", [ImportError, None, None]), ("isort", [None, ImportError, None])], + ) def test_import_not_installed( self, fake_project_cli, python_call_mock, module_name, + side_effects, mocker, fake_repo_path, fake_metadata, ): - mocker.patch.dict("sys.modules", {module_name: None}) + # pretending we have the other linting dependencies, but not the + mocker.patch( + "kedro.framework.cli.utils.import_module", side_effect=side_effects + ) result = CliRunner().invoke(fake_project_cli, ["lint"], obj=fake_metadata) expected_message = NO_DEPENDENCY_MESSAGE.format( @@ -263,296 +229,45 @@ def test_pythonpath_env_var( assert mocked_environ == {"PYTHONPATH": str(fake_repo_path / "src")} -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "fake_copyfile") -class TestInstallCommand: - def test_install_compile_default( - self, - python_call_mock, - fake_project_cli, - fake_repo_path, - fake_copyfile, - mocker, - fake_metadata, - ): - """Test that the requirements are compiled by default - if requirements.in doesn't exist""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.output - assert "Requirements installed!" 
in result.output - - requirements_in = fake_repo_path / "src" / "requirements.in" - requirements_txt = fake_repo_path / "src" / "requirements.txt" - expected_calls = [ - mocker.call("piptools", ["compile", "-q", str(requirements_in)]), - mocker.call("pip", ["install", "-U", "-r", str(requirements_txt)]), - ] - assert python_call_mock.mock_calls == expected_calls - fake_copyfile.assert_called_once_with( - str(requirements_txt), str(requirements_in) - ) - - def test_install_compile_force( - self, - python_call_mock, - fake_project_cli, - fake_repo_path, - fake_copyfile, - mocker, - fake_metadata, - ): - """Test that the requirements are compiled if requirements.in exists - and --build-reqs CLI option is specified""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - mocker.patch.object(Path, "is_file", return_value=True) - result = CliRunner().invoke( - fake_project_cli, ["install", "--build-reqs"], obj=fake_metadata - ) - assert not result.exit_code, result.output - assert "Requirements installed!" in result.output - - requirements_in = fake_repo_path / "src" / "requirements.in" - requirements_txt = fake_repo_path / "src" / "requirements.txt" - expected_calls = [ - mocker.call("piptools", ["compile", "-q", str(requirements_in)]), - mocker.call("pip", ["install", "-U", "-r", str(requirements_txt)]), - ] - assert python_call_mock.mock_calls == expected_calls - fake_copyfile.assert_not_called() - - def test_install_no_compile_default( - self, - python_call_mock, - fake_project_cli, - fake_repo_path, - fake_copyfile, - mocker, - fake_metadata, - ): - """Test that the requirements aren't compiled by default - if requirements.in exists""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - mocker.patch.object(Path, "is_file", return_value=True) - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.output - assert "Requirements installed!" in result.output - - requirements_txt = fake_repo_path / "src" / "requirements.txt" - python_call_mock.assert_called_once_with( - "pip", ["install", "-U", "-r", str(requirements_txt)] - ) - fake_copyfile.assert_not_called() - - def test_install_no_compile_force( - self, - python_call_mock, - fake_project_cli, - fake_repo_path, - fake_copyfile, - mocker, - fake_metadata, - ): - """Test that the requirements aren't compiled if requirements.in doesn't exist - and --no-build-reqs CLI option is specified""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - result = CliRunner().invoke( - fake_project_cli, ["install", "--no-build-reqs"], obj=fake_metadata - ) - assert not result.exit_code, result.output - assert "Requirements installed!" in result.output - - requirements_txt = fake_repo_path / "src" / "requirements.txt" - python_call_mock.assert_called_once_with( - "pip", ["install", "-U", "-r", str(requirements_txt)] - ) - fake_copyfile.assert_not_called() - - def test_with_env_file( - self, - python_call_mock, - call_mock, - fake_project_cli, - mocker, - fake_repo_path, - fake_copyfile, - fake_metadata, - ): - mocker.patch("kedro.framework.cli.project.os").name = "posix" - # Pretend env file exists: - mocker.patch.object(Path, "is_file", return_value=True) - - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.stdout - assert "Requirements installed!" 
in result.output - - requirements_txt = fake_repo_path / "src" / "requirements.txt" - expected_calls = [ - mocker.call("pip", ["install", "-U", "-r", str(requirements_txt)]) - ] - assert python_call_mock.mock_calls == expected_calls - - call_mock.assert_called_once_with( - [ - "conda", - "env", - "update", - "--file", - str(fake_repo_path / "src/environment.yml"), - "--prune", - ] - ) - fake_copyfile.assert_not_called() - - def test_windows( - self, fake_project_cli, mocker, fake_repo_path, fake_copyfile, fake_metadata - ): - mock_subprocess = mocker.patch("kedro.framework.cli.project.subprocess") - mock_subprocess.Popen.return_value.communicate.return_value = ("", b"") - # pretend we are on Windows - mocker.patch("kedro.framework.cli.project.os").name = "nt" - - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.stdout - assert "Requirements installed!" in result.output - - requirements_in = fake_repo_path / "src" / "requirements.in" - requirements_txt = fake_repo_path / "src" / "requirements.txt" - command = [ - sys.executable, - "-m", - "pip", - "install", - "-U", - "-r", - str(requirements_txt), - ] - mock_subprocess.Popen.assert_called_once_with( - command, - creationflags=mock_subprocess.CREATE_NEW_CONSOLE, - stderr=mock_subprocess.PIPE, - ) - fake_copyfile.assert_called_once_with( - str(requirements_txt), str(requirements_in) - ) - - def test_windows_err( - self, fake_project_cli, mocker, fake_repo_path, fake_copyfile, fake_metadata - ): - mock_subprocess = mocker.patch("kedro.framework.cli.project.subprocess") - mock_subprocess.Popen.return_value.communicate.return_value = ( - "", - b"Error in dependencies", - ) - # pretend we are on Windows - mocker.patch("kedro.framework.cli.project.os").name = "nt" - - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert result.exit_code, result.stdout - assert "Error in dependencies" in result.output - - def test_install_working_with_unimportable_pipelines( - self, - fake_project_cli, - mocker, - fake_metadata, - ): - """Test kedro install works even if pipelines are not importable""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - pipeline_registry = ( - fake_metadata.source_dir - / fake_metadata.package_name - / "pipeline_registry.py" - ) - pipeline_registry.write_text("import this_is_not_a_real_thing") - - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.output - assert "Requirements installed!" 
in result.output - - @pytest.mark.parametrize("os_name", ["posix", "nt"]) - def test_install_missing_requirements_in_and_txt( - self, fake_project_cli, mocker, fake_metadata, os_name - ): - """Test error when neither requirements.txt nor requirements.in exists.""" - mocker.patch("kedro.framework.cli.project.os").name = os_name - mocker.patch.object(Path, "is_file", return_value=False) - result = CliRunner().invoke( - fake_project_cli, ["install", "--build-reqs"], obj=fake_metadata - ) - assert result.exit_code # Error expected - assert isinstance(result.exception, FileNotFoundError) - assert "No project requirements.in or requirements.txt found" in str( - result.exception - ) - - -@pytest.fixture -def os_mock(mocker): - return mocker.patch("kedro.framework.cli.project.os") - - -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "os_mock") +@pytest.mark.usefixtures("chdir_to_dummy_project") class TestIpythonCommand: def test_happy_path( self, call_mock, fake_project_cli, - fake_ipython_message, - os_mock, fake_repo_path, fake_metadata, ): result = CliRunner().invoke( - fake_project_cli, - ["ipython", "--random-arg", "value"], - obj=fake_metadata, + fake_project_cli, ["ipython", "--random-arg", "value"], obj=fake_metadata ) assert not result.exit_code, result.stdout - fake_ipython_message.assert_called_once_with() - call_mock.assert_called_once_with(["ipython", "--random-arg", "value"]) - os_mock.environ.__setitem__.assert_called_once_with( - "IPYTHONDIR", str(fake_repo_path / ".ipython") + call_mock.assert_called_once_with( + [ + "ipython", + "--ext", + "kedro.ipython", + "--random-arg", + "value", + ] ) - @pytest.mark.parametrize("help_flag", ["-h", "--help"]) - def test_help( - self, - help_flag, - call_mock, - fake_project_cli, - fake_ipython_message, - fake_metadata, - ): - result = CliRunner().invoke( - fake_project_cli, ["ipython", help_flag], obj=fake_metadata - ) - assert not result.exit_code, result.stdout - fake_ipython_message.assert_not_called() - call_mock.assert_called_once_with(["ipython", help_flag]) - @pytest.mark.parametrize("env_flag,env", [("--env", "base"), ("-e", "local")]) def test_env( self, env_flag, env, fake_project_cli, - call_mock, - fake_repo_path, - os_mock, mocker, fake_metadata, ): """This tests starting ipython with specific env.""" + mock_environ = mocker.patch("os.environ", {}) result = CliRunner().invoke( fake_project_cli, ["ipython", env_flag, env], obj=fake_metadata ) assert not result.exit_code, result.stdout - - calls = [ - mocker.call("IPYTHONDIR", str(fake_repo_path / ".ipython")), - mocker.call("KEDRO_ENV", env), - ] - os_mock.environ.__setitem__.assert_has_calls(calls) + assert mock_environ["KEDRO_ENV"] == env def test_fail_no_ipython(self, fake_project_cli, mocker): mocker.patch.dict("sys.modules", {"IPython": None}) @@ -560,13 +275,13 @@ def test_fail_no_ipython(self, fake_project_cli, mocker): assert result.exit_code error = ( - "Module `IPython` not found. Make sure to install required project " - "dependencies by running the `kedro install` command first." + "Module 'IPython' not found. Make sure to install required project " + "dependencies by running the 'pip install -r src/requirements.txt' command first." 
) assert error in result.output -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures("chdir_to_dummy_project") class TestPackageCommand: def test_happy_path( self, call_mock, fake_project_cli, mocker, fake_repo_path, fake_metadata @@ -578,32 +293,29 @@ def test_happy_path( mocker.call( [ sys.executable, - "setup.py", - "clean", - "--all", - "bdist_egg", - "--dist-dir", + "-m", + "build", + "--wheel", + "--outdir", "../dist", ], cwd=str(fake_repo_path / "src"), ), mocker.call( [ - sys.executable, - "setup.py", - "clean", - "--all", - "bdist_wheel", - "--dist-dir", - "../dist", + "tar", + "--exclude=local/*.yml", + "-czf", + f"dist/conf-{fake_metadata.package_name}.tar.gz", + f"--directory={fake_metadata.project_path}", + "conf", ], - cwd=str(fake_repo_path / "src"), ), ] ) -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures("chdir_to_dummy_project") class TestBuildDocsCommand: def test_happy_path( self, @@ -658,9 +370,9 @@ def test_open_docs(self, open_flag, fake_project_cli, mocker, fake_metadata): patched_browser.assert_called_once_with(expected_path) -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "fake_copyfile") +@pytest.mark.usefixtures("chdir_to_dummy_project", "fake_copyfile") class TestBuildReqsCommand: - def test_requirements_file_exists( + def test_compile_from_requirements_file( self, python_call_mock, fake_project_cli, @@ -678,11 +390,15 @@ def test_requirements_file_exists( python_call_mock.assert_called_once_with( "piptools", - ["compile", "-q", str(fake_repo_path / "src" / "requirements.in")], + [ + "compile", + str(fake_repo_path / "src" / "requirements.txt"), + "--output-file", + str(fake_repo_path / "src" / "requirements.lock"), + ], ) - fake_copyfile.assert_not_called() - def test_requirements_file_doesnt_exist( + def test_compile_from_input_and_to_output_file( self, python_call_mock, fake_project_cli, @@ -690,18 +406,28 @@ def test_requirements_file_doesnt_exist( fake_copyfile, fake_metadata, ): - # File does not exist: - requirements_in = fake_repo_path / "src" / "requirements.in" - requirements_txt = fake_repo_path / "src" / "requirements.txt" + # File exists: + input_file = fake_repo_path / "src" / "dev-requirements.txt" + with open(input_file, "a", encoding="utf-8") as file: + file.write("") + output_file = fake_repo_path / "src" / "dev-requirements.lock" - result = CliRunner().invoke(fake_project_cli, ["build-reqs"], obj=fake_metadata) + result = CliRunner().invoke( + fake_project_cli, + [ + "build-reqs", + "--input-file", + str(input_file), + "--output-file", + str(output_file), + ], + obj=fake_metadata, + ) assert not result.exit_code, result.stdout assert "Requirements built!" in result.stdout python_call_mock.assert_called_once_with( - "piptools", ["compile", "-q", str(requirements_in)] - ) - fake_copyfile.assert_called_once_with( - str(requirements_txt), str(requirements_in) + "piptools", + ["compile", str(input_file), "--output-file", str(output_file)], ) @pytest.mark.parametrize( @@ -715,7 +441,7 @@ def test_extra_args( extra_args, fake_metadata, ): - requirements_in = fake_repo_path / "src" / "requirements.in" + requirements_txt = fake_repo_path / "src" / "requirements.txt" result = CliRunner().invoke( fake_project_cli, ["build-reqs"] + extra_args, obj=fake_metadata @@ -724,5 +450,24 @@ def test_extra_args( assert not result.exit_code, result.stdout assert "Requirements built!" 
in result.stdout - call_args = ["compile", "-q"] + extra_args + [str(requirements_in)] + call_args = ( + ["compile"] + + extra_args + + [str(requirements_txt)] + + ["--output-file", str(fake_repo_path / "src" / "requirements.lock")] + ) python_call_mock.assert_called_once_with("piptools", call_args) + + @pytest.mark.parametrize("os_name", ["posix", "nt"]) + def test_missing_requirements_txt( + self, fake_project_cli, mocker, fake_metadata, os_name, fake_repo_path + ): + """Test error when the input file requirements.txt doesn't exist.""" + requirements_txt = fake_repo_path / "src" / "requirements.txt" + + mocker.patch("kedro.framework.cli.project.os").name = os_name + mocker.patch.object(Path, "is_file", return_value=False) + result = CliRunner().invoke(fake_project_cli, ["build-reqs"], obj=fake_metadata) + assert result.exit_code # Error expected + assert isinstance(result.exception, FileNotFoundError) + assert f"File '{requirements_txt}' not found" in str(result.exception) diff --git a/tests/framework/cli/test_registry.py b/tests/framework/cli/test_registry.py index 2b1fdda700..3c84efa999 100644 --- a/tests/framework/cli/test_registry.py +++ b/tests/framework/cli/test_registry.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License.
import pytest from click.testing import CliRunner @@ -37,19 +10,19 @@ def yaml_dump_mock(mocker): @pytest.fixture def pipelines_dict(): pipelines = { - "de": ["split_data (split_data)"], - "ds": [ + "data_engineering": ["split_data (split_data)"], + "data_science": [ "train_model (train_model)", "predict (predict)", "report_accuracy (report_accuracy)", ], - "dp": ["data_processing.split_data (split_data)"], + "data_processing": ["data_processing.split_data (split_data)"], } - pipelines["__default__"] = pipelines["de"] + pipelines["ds"] + pipelines["__default__"] = pipelines["data_engineering"] + pipelines["data_science"] return pipelines -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures("chdir_to_dummy_project") def test_list_registered_pipelines( fake_project_cli, fake_metadata, yaml_dump_mock, pipelines_dict ): @@ -61,9 +34,12 @@ def test_list_registered_pipelines( yaml_dump_mock.assert_called_once_with(sorted(pipelines_dict.keys())) -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") +@pytest.mark.usefixtures("chdir_to_dummy_project") class TestRegistryDescribeCommand: - @pytest.mark.parametrize("pipeline_name", ["de", "ds", "dp", "__default__"]) + @pytest.mark.parametrize( + "pipeline_name", + ["data_engineering", "data_science", "data_processing", "__default__"], + ) def test_describe_registered_pipeline( self, fake_project_cli, @@ -89,8 +65,8 @@ def test_registered_pipeline_not_found(self, fake_project_cli, fake_metadata): assert result.exit_code expected_output = ( - "Error: `missing` pipeline not found. Existing pipelines: " - "[__default__, de, dp, ds]\n" + "Error: 'missing' pipeline not found. Existing pipelines: " + "[__default__, data_engineering, data_processing, data_science]\n" ) assert expected_output in result.output diff --git a/tests/framework/cli/test_starters.py b/tests/framework/cli/test_starters.py index c2abc462e9..03e16bc29c 100644 --- a/tests/framework/cli/test_starters.py +++ b/tests/framework/cli/test_starters.py @@ -1,38 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- """This module contains unit test for the cli command 'kedro new' """ +from __future__ import annotations -import json import shutil from pathlib import Path -from typing import Dict import pytest import yaml @@ -40,9 +11,13 @@ from cookiecutter.exceptions import RepositoryCloneFailed from kedro import __version__ as version -from kedro.framework.cli.starters import _STARTER_ALIASES, TEMPLATE_PATH +from kedro.framework.cli.starters import ( + _OFFICIAL_STARTER_SPECS, + TEMPLATE_PATH, + KedroStarterSpec, +) -FILES_IN_TEMPLATE = 36 +FILES_IN_TEMPLATE = 29 @pytest.fixture @@ -63,7 +38,7 @@ def mock_cookiecutter(mocker): return mocker.patch("cookiecutter.main.cookiecutter") -def _write_yaml(filepath: Path, config: Dict): +def _write_yaml(filepath: Path, config: dict): filepath.parent.mkdir(parents=True, exist_ok=True) yaml_str = yaml.dump(config) filepath.write_text(yaml_str) @@ -73,7 +48,7 @@ def _make_cli_prompt_input(project_name="", repo_name="", python_package=""): return "\n".join([project_name, repo_name, python_package]) -# pylint: disable=too-many-arguments +# noqa: too-many-arguments def _assert_template_ok( result, project_name="New Kedro Project", @@ -93,9 +68,11 @@ def _assert_template_ok( assert len(generated_files) == FILES_IN_TEMPLATE assert full_path.exists() assert (full_path / ".gitignore").is_file() - assert project_name in (full_path / "README.md").read_text() - assert "KEDRO" in (full_path / ".gitignore").read_text() - assert kedro_version in (full_path / "src" / "requirements.txt").read_text() + assert project_name in (full_path / "README.md").read_text(encoding="utf-8") + assert "KEDRO" in (full_path / ".gitignore").read_text(encoding="utf-8") + assert kedro_version in (full_path / "src" / "requirements.txt").read_text( + encoding="utf-8" + ) assert (full_path / "src" / python_package / "__init__.py").is_file() @@ -104,15 +81,44 @@ def test_starter_list(fake_kedro_cli): result = CliRunner().invoke(fake_kedro_cli, ["starter", "list"]) assert result.exit_code == 0, result.output - for alias in _STARTER_ALIASES: + for alias in _OFFICIAL_STARTER_SPECS: assert alias in result.output -def test_cookiecutter_json_matches_prompts_yml(): - """Validate the contents of the default config file.""" - cookiecutter_json = json.loads((TEMPLATE_PATH / "cookiecutter.json").read_text()) - prompts_yml = yaml.safe_load((TEMPLATE_PATH / "prompts.yml").read_text()) - assert set(cookiecutter_json) == set(prompts_yml) | {"kedro_version"} +def test_starter_list_with_starter_plugin(fake_kedro_cli, entry_point): + """Check that `kedro starter list` prints out the plugin starters.""" + entry_point.load.return_value = [KedroStarterSpec("valid_starter", "valid_path")] + entry_point.module = "valid_starter_module" + result = CliRunner().invoke(fake_kedro_cli, ["starter", "list"]) + assert result.exit_code == 0, result.output + assert "valid_starter_module" in result.output + + +@pytest.mark.parametrize( + "specs,expected", + [ + ( + [{"alias": "valid_starter", "template_path": "valid_path"}], + "should be a 'KedroStarterSpec'", + ), + ( + [ + KedroStarterSpec("duplicate", "duplicate"), + KedroStarterSpec("duplicate", "duplicate"), + ], + "has been ignored as it is already defined by", + ), + ], +) +def test_starter_list_with_invalid_starter_plugin( + fake_kedro_cli, entry_point, specs, expected +): + """Check that `kedro starter list` prints out the plugin starters.""" + entry_point.load.return_value = specs + entry_point.module = "invalid_starter" + result = CliRunner().invoke(fake_kedro_cli, 
["starter", "list"]) + assert result.exit_code == 0, result.output + assert expected in result.output @pytest.mark.usefixtures("chdir_to_tmp") @@ -139,61 +145,83 @@ def test_custom_project_name(self, fake_kedro_cli): python_package="my_project", ) - def test_custom_repo_name(self, fake_kedro_cli): + def test_custom_project_name_with_hyphen_and_underscore_and_number( + self, fake_kedro_cli + ): result = CliRunner().invoke( fake_kedro_cli, ["new"], - input=_make_cli_prompt_input(repo_name="my-repo"), + input=_make_cli_prompt_input(project_name="My-Project_ 1"), ) _assert_template_ok( result, - project_name="New Kedro Project", - repo_name="my-repo", - python_package="new_kedro_project", + project_name="My-Project_ 1", + repo_name="my-project--1", + python_package="my_project__1", ) - def test_custom_python_package(self, fake_kedro_cli): + def test_no_prompts(self, fake_kedro_cli): + shutil.copytree(TEMPLATE_PATH, "template") + (Path("template") / "prompts.yml").unlink() + result = CliRunner().invoke(fake_kedro_cli, ["new", "--starter", "template"]) + _assert_template_ok(result) + + def test_empty_prompts(self, fake_kedro_cli): + shutil.copytree(TEMPLATE_PATH, "template") + _write_yaml(Path("template") / "prompts.yml", {}) + result = CliRunner().invoke(fake_kedro_cli, ["new", "--starter", "template"]) + _assert_template_ok(result) + + def test_custom_prompt_valid_input(self, fake_kedro_cli): + shutil.copytree(TEMPLATE_PATH, "template") + _write_yaml( + Path("template") / "prompts.yml", + { + "project_name": {"title": "Project Name"}, + "custom_value": { + "title": "Custom Value", + "regex_validator": "^\\w+(-*\\w+)*$", + }, + }, + ) + custom_input = "\n".join(["my-project", "My Project"]) result = CliRunner().invoke( fake_kedro_cli, - ["new"], - input=_make_cli_prompt_input(python_package="my_package"), + ["new", "--starter", "template"], + input=custom_input, ) _assert_template_ok( result, - project_name="New Kedro Project", - repo_name="new-kedro-project", - python_package="my_package", + project_name="My Project", + repo_name="my-project", + python_package="my_project", ) - def test_custom_all(self, fake_kedro_cli): + def test_custom_prompt_for_essential_variable(self, fake_kedro_cli): + shutil.copytree(TEMPLATE_PATH, "template") + _write_yaml( + Path("template") / "prompts.yml", + { + "project_name": {"title": "Project Name"}, + "repo_name": { + "title": "Custom Repo Name", + "regex_validator": "^[a-zA-Z_]\\w{1,}$", + }, + }, + ) + custom_input = "\n".join(["My Project", "my_custom_repo"]) result = CliRunner().invoke( fake_kedro_cli, - ["new"], - input=_make_cli_prompt_input( - project_name="My Project", - repo_name="my-repo", - python_package="my_package", - ), + ["new", "--starter", "template"], + input=custom_input, ) _assert_template_ok( result, project_name="My Project", - repo_name="my-repo", - python_package="my_package", + repo_name="my_custom_repo", + python_package="my_project", ) - def test_no_prompts(self, fake_kedro_cli): - shutil.copytree(TEMPLATE_PATH, "template") - (Path("template") / "prompts.yml").unlink() - result = CliRunner().invoke(fake_kedro_cli, ["new", "--starter", "template"]) - _assert_template_ok(result) - - def test_empty_prompts(self, fake_kedro_cli): - shutil.copytree(TEMPLATE_PATH, "template") - _write_yaml(Path("template") / "prompts.yml", {}) - result = CliRunner().invoke(fake_kedro_cli, ["new", "--starter", "template"]) - _assert_template_ok(result) - @pytest.mark.usefixtures("chdir_to_tmp") class TestNewFromUserPromptsInvalid: @@ -209,46 +237,64 @@ def 
test_fail_if_dir_exists(self, fake_kedro_cli): assert result.exit_code != 0 assert "directory already exists" in result.output - @pytest.mark.parametrize( - "repo_name", [".repo\nvalid", "re!po\nvalid", "-repo\nvalid", "repo-\nvalid"] - ) - def test_bad_repo_name(self, fake_kedro_cli, repo_name): - """Check the error if the repository name is invalid.""" + def test_prompt_no_title(self, fake_kedro_cli): + shutil.copytree(TEMPLATE_PATH, "template") + _write_yaml(Path("template") / "prompts.yml", {"repo_name": {}}) + result = CliRunner().invoke(fake_kedro_cli, ["new", "--starter", "template"]) + assert result.exit_code != 0 + assert "Each prompt must have a title field to be valid" in result.output + + def test_prompt_bad_yaml(self, fake_kedro_cli): + shutil.copytree(TEMPLATE_PATH, "template") + (Path("template") / "prompts.yml").write_text("invalid\tyaml", encoding="utf-8") + result = CliRunner().invoke(fake_kedro_cli, ["new", "--starter", "template"]) + assert result.exit_code != 0 + assert "Failed to generate project: could not load prompts.yml" in result.output + + def test_invalid_project_name_special_characters(self, fake_kedro_cli): result = CliRunner().invoke( - fake_kedro_cli, ["new"], input=_make_cli_prompt_input(repo_name=repo_name) + fake_kedro_cli, + ["new"], + input=_make_cli_prompt_input(project_name="My $Project!"), ) assert result.exit_code != 0 assert ( - "is an invalid value.\nIt must contain only word symbols" in result.output + "is an invalid value for Project Name.\nIt must contain only alphanumeric symbols" + in result.output ) - @pytest.mark.parametrize( - "python_package", - ["0package\nvalid", "_\nvalid", "package-name\nvalid", "package name\nvalid"], - ) - def test_bad_python_package(self, fake_kedro_cli, python_package): - """Check the error if the package name is invalid.""" + def test_invalid_project_name_too_short(self, fake_kedro_cli): result = CliRunner().invoke( fake_kedro_cli, ["new"], - input=_make_cli_prompt_input(python_package=python_package), + input=_make_cli_prompt_input(project_name="P"), ) assert result.exit_code != 0 - assert "is an invalid value.\nIt must start with a letter" in result.output - - def test_prompt_no_title(self, fake_kedro_cli): - shutil.copytree(TEMPLATE_PATH, "template") - _write_yaml(Path("template") / "prompts.yml", {"repo_name": {}}) - result = CliRunner().invoke(fake_kedro_cli, ["new", "--starter", "template"]) - assert result.exit_code != 0 - assert "Each prompt must have a title field to be valid" in result.output + assert ( + "is an invalid value for Project Name.\nIt must contain only alphanumeric symbols" + in result.output + ) - def test_prompt_bad_yaml(self, fake_kedro_cli): + def test_custom_prompt_invalid_input(self, fake_kedro_cli): shutil.copytree(TEMPLATE_PATH, "template") - (Path("template") / "prompts.yml").write_text("invalid\tyaml") - result = CliRunner().invoke(fake_kedro_cli, ["new", "--starter", "template"]) + _write_yaml( + Path("template") / "prompts.yml", + { + "project_name": {"title": "Project Name"}, + "custom_value": { + "title": "Custom Value", + "regex_validator": "^\\w+(-*\\w+)*$", + }, + }, + ) + custom_input = "\n".join(["My Project", "My Project"]) + result = CliRunner().invoke( + fake_kedro_cli, + ["new", "--starter", "template"], + input=custom_input, + ) assert result.exit_code != 0 - assert "Failed to generate project: could not load prompts.yml" in result.output + assert "'My Project' is an invalid value" in result.output @pytest.mark.usefixtures("chdir_to_tmp") @@ -268,6 +314,19 @@ def 
test_required_keys_only(self, fake_kedro_cli): ) _assert_template_ok(result, **config) + def test_custom_required_keys(self, fake_kedro_cli): + """Test project created from config.""" + config = { + "project_name": "Project X", + "repo_name": "projectx", + "python_package": "proj_x", + } + _write_yaml(Path("config.yml"), config) + result = CliRunner().invoke( + fake_kedro_cli, ["new", "-v", "--config", "config.yml"] + ) + _assert_template_ok(result, **config) + def test_custom_kedro_version(self, fake_kedro_cli): """Test project created from config.""" config = { @@ -322,7 +381,7 @@ def test_no_prompts(self, fake_kedro_cli): result = CliRunner().invoke( fake_kedro_cli, ["new", "--starter", "template", "--config", "config.yml"] ) - _assert_template_ok(result) + _assert_template_ok(result, **config) def test_empty_prompts(self, fake_kedro_cli): config = { @@ -336,7 +395,7 @@ def test_empty_prompts(self, fake_kedro_cli): result = CliRunner().invoke( fake_kedro_cli, ["new", "--starter", "template", "--config", "config.yml"] ) - _assert_template_ok(result) + _assert_template_ok(result, **config) @pytest.mark.usefixtures("chdir_to_tmp") @@ -357,13 +416,13 @@ def test_output_dir_does_not_exist(self, fake_kedro_cli): def test_config_missing_key(self, fake_kedro_cli): """Check the error if keys are missing from config file.""" config = { - "project_name": "My Project", + "python_package": "my_project", "repo_name": "my-project", } _write_yaml(Path("config.yml"), config) result = CliRunner().invoke(fake_kedro_cli, ["new", "-v", "-c", "config.yml"]) assert result.exit_code != 0 - assert "python_package not found in config file" in result.output + assert "project_name not found in config file" in result.output def test_config_does_not_exist(self, fake_kedro_cli): """Check the error if the config file does not exist.""" @@ -380,7 +439,7 @@ def test_config_empty(self, fake_kedro_cli): def test_config_bad_yaml(self, fake_kedro_cli): """Check the error if config YAML is invalid.""" - Path("config.yml").write_text("invalid\tyaml") + Path("config.yml").write_text("invalid\tyaml", encoding="utf-8") result = CliRunner().invoke(fake_kedro_cli, ["new", "-v", "-c", "config.yml"]) assert result.exit_code != 0 assert "Failed to generate project: could not load config" in result.output @@ -422,7 +481,7 @@ def test_alias(self, fake_kedro_cli, mock_determine_repo_dir, mock_cookiecutter) input=_make_cli_prompt_input(), ) kwargs = { - "template": "git+https://github.com/quantumblacklabs/kedro-starters.git", + "template": "git+https://github.com/kedro-org/kedro-starters.git", "checkout": version, "directory": "spaceflights", } @@ -438,7 +497,7 @@ def test_alias_custom_checkout( input=_make_cli_prompt_input(), ) kwargs = { - "template": "git+https://github.com/quantumblacklabs/kedro-starters.git", + "template": "git+https://github.com/kedro-org/kedro-starters.git", "checkout": "my_checkout", "directory": "spaceflights", } @@ -519,7 +578,7 @@ def test_invalid_starter(self, fake_kedro_cli): @pytest.mark.parametrize( "starter, repo", [ - ("spaceflights", "https://github.com/quantumblacklabs/kedro-starters.git"), + ("spaceflights", "https://github.com/kedro-org/kedro-starters.git"), ( "git+https://github.com/fake/fake.git", "https://github.com/fake/fake.git", diff --git a/tests/framework/conftest.py b/tests/framework/conftest.py new file mode 100644 index 0000000000..3923d9f559 --- /dev/null +++ b/tests/framework/conftest.py @@ -0,0 +1,26 @@ +import pytest + +from kedro.framework.project import configure_logging + + 
+@pytest.fixture +def default_logging_config(): + logging_config = { + "version": 1, + "disable_existing_loggers": False, + "handlers": { + "rich": {"class": "kedro.logging.RichHandler", "rich_tracebacks": True} + }, + "loggers": {"kedro": {"level": "INFO"}}, + "root": {"handlers": ["rich"]}, + } + return logging_config + + +@pytest.fixture(autouse=True) +def reset_logging(request, default_logging_config): + yield + if "nologreset" in request.keywords: + return + + configure_logging(default_logging_config) diff --git a/tests/framework/context/test_context.py b/tests/framework/context/test_context.py index 8c13543c2d..d468fe8dd4 100644 --- a/tests/framework/context/test_context.py +++ b/tests/framework/context/test_context.py @@ -1,82 +1,58 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
+from __future__ import annotations + import configparser import json +import logging import re -import sys +import textwrap from pathlib import Path, PurePath, PurePosixPath, PureWindowsPath -from time import sleep -from typing import Any, Dict +from typing import Any import pandas as pd import pytest import toml import yaml +from attrs.exceptions import FrozenInstanceError from pandas.util.testing import assert_frame_equal from kedro import __version__ as kedro_version -from kedro.config import MissingConfigException -from kedro.extras.datasets.pandas import CSVDataSet -from kedro.framework.context import KedroContext, KedroContextError +from kedro.config import ConfigLoader, MissingConfigException +from kedro.framework.context import KedroContext from kedro.framework.context.context import ( _convert_paths_to_absolute_posix, _is_relative_path, - _validate_layers_for_transcoding, + _update_nested_dict, ) -from kedro.framework.hooks import get_hook_manager, hook_impl +from kedro.framework.hooks import _create_hook_manager from kedro.framework.project import ( - Validator, - _ProjectPipelines, + ValidationError, _ProjectSettings, configure_project, pipelines, ) -from kedro.io import DataCatalog -from kedro.io.core import Version, generate_timestamp -from kedro.pipeline import Pipeline, node -from kedro.runner import ParallelRunner, SequentialRunner MOCK_PACKAGE_NAME = "mock_package_name" -def _write_yaml(filepath: Path, config: Dict): +class BadCatalog: # pylint: disable=too-few-public-methods + """ + Catalog class that doesn't subclass `DataCatalog`, for testing only. + """ + + +def _write_yaml(filepath: Path, config: dict): filepath.parent.mkdir(parents=True, exist_ok=True) yaml_str = yaml.dump(config) filepath.write_text(yaml_str) -def _write_toml(filepath: Path, config: Dict): +def _write_toml(filepath: Path, config: dict): filepath.parent.mkdir(parents=True, exist_ok=True) toml_str = toml.dumps(config) filepath.write_text(toml_str) -def _write_json(filepath: Path, config: Dict): +def _write_json(filepath: Path, config: dict): filepath.parent.mkdir(parents=True, exist_ok=True) json_str = json.dumps(config) filepath.write_text(json_str) @@ -156,63 +132,32 @@ def prepare_project_dir(tmp_path, base_config, local_config, env): _write_toml(tmp_path / "pyproject.toml", pyproject_toml_payload) -class RegistrationHooks: - @hook_impl - def register_catalog( - self, catalog, credentials, load_versions, save_version, journal - ) -> DataCatalog: - return DataCatalog.from_config( - catalog, credentials, load_versions, save_version, journal - ) - - -class MockSettings(_ProjectSettings): - _HOOKS = Validator("HOOKS", default=(RegistrationHooks(),)) - - -class BrokenSettings(_ProjectSettings): - _HOOKS = Validator("HOOKS", default=(RegistrationHooks(),)) - _CONFIG_LOADER_CLASS = Validator("CONFIG_LOADER_CLASS", default="it breaks") - - @pytest.fixture -def broken_settings(mocker): - mocked_settings = BrokenSettings() - mocker.patch("kedro.framework.session.session.settings", mocked_settings) - mocker.patch("kedro.framework.context.context.settings", mocked_settings) - return mocker.patch("kedro.framework.project.settings", mocked_settings) +def mock_settings_file_bad_data_catalog_class(tmpdir): + mock_settings_file = tmpdir.join("mock_settings_file.py") + mock_settings_file.write( + textwrap.dedent( + f""" + from {__name__} import BadCatalog + DATA_CATALOG_CLASS = BadCatalog + """ + ) + ) + return mock_settings_file @pytest.fixture(autouse=True) def mock_settings(mocker): - mocked_settings = 
MockSettings() + mocked_settings = _ProjectSettings() mocker.patch("kedro.framework.session.session.settings", mocked_settings) - mocker.patch("kedro.framework.context.context.settings", mocked_settings) return mocker.patch("kedro.framework.project.settings", mocked_settings) -@pytest.fixture(autouse=True) -def mock_pipelines(mocker): - mocker.patch.object( - _ProjectPipelines, - "_get_pipelines_registry_callable", - return_value=_create_pipelines, - ) - - @pytest.fixture def dummy_dataframe(): return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) -def identity(input1: str): - return input1 # pragma: no cover - - -def bad_node(x): - raise ValueError("Oh no!") - - expected_message_middle = ( "There are 2 nodes that have not run.\n" "You can resume the pipeline run by adding the following " @@ -220,7 +165,6 @@ def bad_node(x): ' --from-nodes "nodes3"' ) - expected_message_head = ( "There are 4 nodes that have not run.\n" "You can resume the pipeline run by adding the following " @@ -231,114 +175,49 @@ def bad_node(x): "tool": { "kedro": { "project_name": "mock_project_name", - "project_version": kedro_version, + "kedro_init_version": kedro_version, "package_name": MOCK_PACKAGE_NAME, } } } -def _create_pipelines(): - bad_pipeline_middle = Pipeline( - [ - node(identity, "cars", "boats", name="node1", tags=["tag1"]), - node(identity, "boats", "trains", name="node2"), - node(bad_node, "trains", "ships", name="nodes3"), - node(identity, "ships", "planes", name="node4"), - ], - tags="bad_pipeline", - ) - bad_pipeline_head = Pipeline( - [ - node(bad_node, "cars", "boats", name="node1", tags=["tag1"]), - node(identity, "boats", "trains", name="node2"), - node(identity, "trains", "ships", name="nodes3"), - node(identity, "ships", "planes", name="node4"), - ], - tags="bad_pipeline", - ) - default_pipeline = Pipeline( - [ - node(identity, "cars", "boats", name="node1", tags=["tag1"]), - node(identity, "boats", "trains", name="node2"), - node(identity, "trains", "ships", name="node3"), - node(identity, "ships", "planes", name="node4"), - ], - tags="pipeline", - ) - return { - "__default__": default_pipeline, - "empty": Pipeline([]), - "simple": Pipeline([node(identity, "cars", "boats")]), - "bad_pipeline_middle": bad_pipeline_middle, - "bad_pipeline_head": bad_pipeline_head, - } - - @pytest.fixture(params=[None]) def extra_params(request): return request.param -@pytest.fixture(autouse=True) -def mocked_logging(mocker): - # Disable logging.config.dictConfig in KedroSession._setup_logging as - # it changes logging.config and affects other unit tests - return mocker.patch("logging.config.dictConfig") - - @pytest.fixture def dummy_context( - tmp_path, prepare_project_dir, env, extra_params, mocker + tmp_path, prepare_project_dir, env, extra_params ): # pylint: disable=unused-argument - mocker.patch("kedro.framework.project._validate_module") configure_project(MOCK_PACKAGE_NAME) + config_loader = ConfigLoader(str(tmp_path / "conf"), env=env) context = KedroContext( - MOCK_PACKAGE_NAME, str(tmp_path), env=env, extra_params=extra_params + MOCK_PACKAGE_NAME, + str(tmp_path), + config_loader=config_loader, + hook_manager=_create_hook_manager(), + env=env, + extra_params=extra_params, ) yield context - pipelines._clear(MOCK_PACKAGE_NAME) - - -@pytest.fixture(autouse=True) -def clear_hook_manager(): - yield - hook_manager = get_hook_manager() - plugins = hook_manager.get_plugins() - for plugin in plugins: - hook_manager.unregister(plugin) + pipelines.configure() class TestKedroContext: - def 
test_deprecate_reading_conf_source_from_context(self, dummy_context): - pattern = ( - "Accessing CONF_SOURCE via the context will be deprecated in Kedro 0.18.0." - ) - with pytest.warns(DeprecationWarning, match=pattern): - assert dummy_context.CONF_SOURCE == "conf" - - def test_deprecate_setting_conf_source_on_context(self, dummy_context): - pattern = ( - "Accessing CONF_SOURCE via the context will be deprecated in Kedro 0.18.0." - ) - with pytest.warns(DeprecationWarning, match=pattern): - dummy_context.CONF_SOURCE = "test_conf" - - @pytest.mark.parametrize("property_name", ["io", "pipeline", "pipelines"]) - def test_deprecate_properties_on_context(self, property_name, dummy_context): - pattern = f"Accessing {property_name} via the context will be deprecated in Kedro 0.18.0." - with pytest.warns(DeprecationWarning, match=pattern): - assert getattr(dummy_context, property_name) - def test_attributes(self, tmp_path, dummy_context): - project_metadata = pyproject_toml_payload["tool"]["kedro"] - assert dummy_context.package_name == project_metadata["package_name"] assert isinstance(dummy_context.project_path, Path) assert dummy_context.project_path == tmp_path.resolve() + def test_immutable_instance(self, dummy_context): + with pytest.raises(FrozenInstanceError): + dummy_context.catalog = 1 + def test_get_catalog_always_using_absolute_path(self, dummy_context): - conf_catalog = dummy_context.config_loader.get("catalog*") + config_loader = dummy_context.config_loader + conf_catalog = config_loader.get("catalog*") # even though the raw configuration uses relative path assert conf_catalog["horses"]["filepath"] == "horses.csv" @@ -349,14 +228,21 @@ def test_get_catalog_always_using_absolute_path(self, dummy_context): ds_path = catalog._data_sets["horses"]._filepath assert PurePath(ds_path.as_posix()).is_absolute() assert ( - ds_path.as_posix() - == (dummy_context._project_path / "horses.csv").as_posix() + ds_path.as_posix() == (dummy_context.project_path / "horses.csv").as_posix() + ) + + def test_get_catalog_validates_transcoded_datasets(self, dummy_context, mocker): + mock_transcode_split = mocker.patch( + "kedro.framework.context.context._transcode_split" ) + catalog = dummy_context.catalog + for dataset_name in catalog._data_sets.keys(): + mock_transcode_split.assert_any_call(dataset_name) - def test_get_catalog_validates_layers(self, dummy_context, mocker): mock_validate = mocker.patch( - "kedro.framework.context.context._validate_layers_for_transcoding" + "kedro.framework.context.context._validate_transcoded_datasets" ) + catalog = dummy_context.catalog mock_validate.assert_called_once_with(catalog) @@ -367,10 +253,17 @@ def test_catalog(self, dummy_context, dummy_dataframe): reloaded_df = dummy_context.catalog.load("cars") assert_frame_equal(reloaded_df, dummy_dataframe) - def test_io(self, dummy_context, dummy_dataframe): - dummy_context.io.save("cars", dummy_dataframe) - reloaded_df = dummy_context.io.load("cars") - assert_frame_equal(reloaded_df, dummy_dataframe) + def test_wrong_catalog_type(self, mock_settings_file_bad_data_catalog_class): + pattern = ( + "Invalid value 'tests.framework.context.test_context.BadCatalog' received " + "for setting 'DATA_CATALOG_CLASS'. " + "It must be a subclass of 'kedro.io.data_catalog.DataCatalog'." 
+ ) + mock_settings = _ProjectSettings( + settings_file=str(mock_settings_file_bad_data_catalog_class) + ) + with pytest.raises(ValidationError, match=re.escape(pattern)): + assert mock_settings.DATA_CATALOG_CLASS @pytest.mark.parametrize( "extra_params", @@ -396,8 +289,8 @@ def test_nested_params(self, param, expected, dummy_context): indirect=True, ) def test_params_missing(self, mocker, extra_params, dummy_context): - mock_config_loader = mocker.patch.object(KedroContext, "config_loader") - mock_config_loader.get.side_effect = MissingConfigException("nope") + mock_config_loader = mocker.patch("kedro.config.ConfigLoader.get") + mock_config_loader.side_effect = MissingConfigException("nope") extra_params = extra_params or {} pattern = "Parameters not found in your Kedro project config" @@ -405,36 +298,6 @@ def test_params_missing(self, mocker, extra_params, dummy_context): actual = dummy_context.params assert actual == extra_params - def test_config_loader(self, dummy_context): - params = dummy_context.config_loader.get("parameters*") - db_conf = dummy_context.config_loader.get("db*") - catalog = dummy_context.config_loader.get("catalog*") - - assert params["param1"] == 1 - assert db_conf["prod"]["url"] == "postgresql://user:pass@url_prod/db" - - assert catalog["trains"]["type"] == "pandas.CSVDataSet" - assert catalog["cars"]["type"] == "pandas.CSVDataSet" - assert catalog["boats"]["type"] == "pandas.CSVDataSet" - assert not catalog["cars"]["save_args"]["index"] - - # pylint: disable=unused-argument - def test_broken_config_loader(self, broken_settings, dummy_context): - pattern = ( - f"Expected an instance of `ConfigLoader`, " - f"got `it breaks` of class `{type('')}` instead." - ) - with pytest.raises(KedroContextError, match=re.escape(pattern)): - _ = dummy_context.config_loader - - def test_default_env(self, dummy_context): - # default environment setting is delegated to config_loader, - # rather than `KedroContext` - assert not dummy_context.env - assert not dummy_context.config_loader.env - assert dummy_context.config_loader.default_run_env == "local" - assert dummy_context.config_loader.base_env == "base" - @pytest.mark.parametrize("env", ["custom_env"], indirect=True) def test_custom_env(self, dummy_context, env): assert dummy_context.env == env @@ -447,191 +310,20 @@ def test_missing_parameters(self, tmp_path, dummy_context): with pytest.warns(UserWarning, match=re.escape(pattern)): _ = dummy_context.catalog - def test_missing_credentials(self, dummy_context): + def test_missing_credentials(self, dummy_context, caplog): + caplog.set_level(logging.DEBUG, logger="kedro") + env_credentials = ( dummy_context.project_path / "conf" / "local" / "credentials.yml" ) env_credentials.unlink() - pattern = "Credentials not found in your Kedro project config." 
- with pytest.warns(UserWarning, match=re.escape(pattern)): - _ = dummy_context.catalog - - def test_pipeline(self, dummy_context): - assert dummy_context.pipeline.nodes[0].inputs == ["cars"] - assert dummy_context.pipeline.nodes[0].outputs == ["boats"] - assert dummy_context.pipeline.nodes[1].inputs == ["boats"] - assert dummy_context.pipeline.nodes[1].outputs == ["trains"] - - def test_pipelines(self, dummy_context): - assert len(dummy_context.pipelines) == 5 - assert len(dummy_context.pipelines["__default__"].nodes) == 4 - - -class TestKedroContextRun: - def test_deprecate_run(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - pattern = ( - "`kedro.framework.context.KedroContext.run` is now deprecated in favour of " - "`KedroSession.run` and will be removed in Kedro 0.18.0." - ) - with pytest.warns(DeprecationWarning, match=pattern): - dummy_context.run() - - def test_run_output(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - outputs = dummy_context.run() - pd.testing.assert_frame_equal(outputs["planes"], dummy_dataframe) - - def test_run_no_output(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - outputs = dummy_context.run(node_names=["node1"]) - assert not outputs - - def test_default_run(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run() - - log_msgs = [record.getMessage() for record in caplog.records] - log_names = [record.name for record in caplog.records] - - assert "kedro.runner.sequential_runner" in log_names - assert "Pipeline execution completed successfully." in log_msgs - - def test_sequential_run_arg(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(runner=SequentialRunner()) - - log_msgs = [record.getMessage() for record in caplog.records] - log_names = [record.name for record in caplog.records] - assert "kedro.runner.sequential_runner" in log_names - assert "Pipeline execution completed successfully." in log_msgs - - @pytest.mark.skipif( - sys.platform.startswith("win"), reason="Due to bug in parallel runner" - ) - def test_parallel_run_arg(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(runner=ParallelRunner()) - - log_msgs = [record.getMessage() for record in caplog.records] - log_names = [record.name for record in caplog.records] - assert "kedro.runner.parallel_runner" in log_names - assert "Pipeline execution completed successfully." 
in log_msgs - - def test_run_load_versions(self, dummy_context, dummy_dataframe): - filepath = (dummy_context.project_path / "cars.csv").as_posix() - - old_save_version = generate_timestamp() - old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]}) - old_csv_data_set = CSVDataSet( - filepath=filepath, - save_args={"sep": ","}, - version=Version(None, old_save_version), - ) - old_csv_data_set.save(old_df) - - sleep(0.5) - new_save_version = generate_timestamp() - new_csv_data_set = CSVDataSet( - filepath=filepath, - save_args={"sep": ","}, - version=Version(None, new_save_version), - ) - new_csv_data_set.save(dummy_dataframe) - - load_versions = {"cars": old_save_version} - dummy_context.run(load_versions=load_versions, pipeline_name="simple") - assert not dummy_context.catalog.load("boats").equals(dummy_dataframe) - assert dummy_context.catalog.load("boats").equals(old_df) - - def test_run_with_empty_pipeline(self, dummy_context): - with pytest.raises(ValueError, match="Pipeline contains no nodes"): - dummy_context.run(pipeline_name="empty") - - @pytest.mark.parametrize( - "pipeline_name,expected_message", - [ - ("bad_pipeline_middle", expected_message_middle), - ("bad_pipeline_head", expected_message_head), - ], # pylint: disable=too-many-arguments - ) - def test_run_failure_prompts_resume_command( - self, dummy_context, dummy_dataframe, caplog, pipeline_name, expected_message - ): - dummy_context.catalog.save("cars", dummy_dataframe) - with pytest.raises(ValueError, match="Oh no"): - dummy_context.run(pipeline_name=pipeline_name) - - actual_messages = [ - record.getMessage() - for record in caplog.records - if record.levelname == "WARNING" - ] - - assert expected_message in actual_messages + _ = dummy_context.catalog - def test_missing_pipeline_name(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - - with pytest.raises(KedroContextError, match="Failed to find the pipeline"): - dummy_context.run(pipeline_name="invalid-name") - - @pytest.mark.parametrize( - "extra_params", - [None, {}, {"foo": "bar", "baz": [1, 2], "qux": None}], - indirect=True, - ) - def test_run_with_extra_params( - self, mocker, dummy_context, dummy_dataframe, extra_params - ): - mock_journal = mocker.patch("kedro.framework.context.context.Journal") - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run() - - assert mock_journal.call_args[0][0]["extra_params"] == extra_params - - def test_run_with_save_version_as_run_id( - self, mocker, dummy_context, dummy_dataframe, caplog - ): - """Test that the default behaviour, with run_id set to None, - creates a journal record with the run_id the same as save_version. 
- """ - save_version = "2020-01-01T00.00.00.000Z" - mocked_get_save_version = mocker.patch.object( - dummy_context, "_get_save_version", return_value=save_version - ) - - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(load_versions={"boats": save_version}) - - mocked_get_save_version.assert_called_once_with() - log_msg = next( - record.getMessage() - for record in caplog.records - if record.name == "kedro.journal" - ) - assert json.loads(log_msg)["run_id"] == save_version - - def test_run_with_custom_run_id( - self, mocker, dummy_context, dummy_dataframe, caplog - ): - run_id = "001" - mocked_get_run_id = mocker.patch.object( - dummy_context, "_get_run_id", return_value=run_id - ) - - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run() - - # once during run, and twice for each `.catalog` - assert mocked_get_run_id.call_count == 3 - log_msg = next( - record.getMessage() - for record in caplog.records - if record.name == "kedro.journal" - ) - assert json.loads(log_msg)["run_id"] == run_id + # check the logs + log_messages = [record.getMessage() for record in caplog.records] + expected_msg = "Credentials not found in your Kedro project config." + assert any(expected_msg in log_message for log_message in log_messages) @pytest.mark.parametrize( @@ -693,7 +385,7 @@ def test_convert_paths_raises_error_on_relative_project_path(): ], ) def test_convert_paths_to_absolute_posix_for_all_known_filepath_keys( - project_path: Path, input_conf: Dict[str, Any], expected: Dict[str, Any] + project_path: Path, input_conf: dict[str, Any], expected: dict[str, Any] ): assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected @@ -714,7 +406,7 @@ def test_convert_paths_to_absolute_posix_for_all_known_filepath_keys( ], ) def test_convert_paths_to_absolute_posix_not_changing_non_relative_path( - project_path: Path, input_conf: Dict[str, Any], expected: Dict[str, Any] + project_path: Path, input_conf: dict[str, Any], expected: dict[str, Any] ): assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected @@ -730,50 +422,38 @@ def test_convert_paths_to_absolute_posix_not_changing_non_relative_path( ], ) def test_convert_paths_to_absolute_posix_converts_full_windows_path_to_posix( - project_path: Path, input_conf: Dict[str, Any], expected: Dict[str, Any] + project_path: Path, input_conf: dict[str, Any], expected: dict[str, Any] ): assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected @pytest.mark.parametrize( - "layers", + "old_dict, new_dict, expected", [ - {"raw": {"A"}, "interm": {"B", "C"}}, - {"raw": {"A"}, "interm": {"B@2", "B@1"}}, - {"raw": {"C@1"}, "interm": {"A", "B@1", "B@2", "B@3"}}, - ], -) -def test_validate_layers(layers, mocker): - mock_catalog = mocker.MagicMock() - mock_catalog.layers = layers - - _validate_layers_for_transcoding(mock_catalog) # it shouldn't raise any error - - -@pytest.mark.parametrize( - "layers,conflicting_datasets", - [ - ({"raw": {"A", "B@1"}, "interm": {"B@2"}}, ["B@2"]), - ({"raw": {"A"}, "interm": {"B@1", "B@2"}, "prm": {"B@3"}}, ["B@3"]), ( { - "raw": {"A@1"}, - "interm": {"B@1", "B@2"}, - "prm": {"B@3", "B@4"}, - "other": {"A@2"}, + "a": 1, + "b": 2, + "c": { + "d": 3, + }, + }, + {"c": {"d": 5, "e": 4}}, + { + "a": 1, + "b": 2, + "c": {"d": 5, "e": 4}, }, - ["A@2", "B@3", "B@4"], + ), + ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}), + ({"a": 1, "b": 2}, {"b": 3}, {"a": 1, "b": 3}), + ( + {"a": {"a.a": 1, "a.b": 2, "a.c": {"a.c.a": 3}}}, + {"a": {"a.c": {"a.c.b": 4}}}, + 
{"a": {"a.a": 1, "a.b": 2, "a.c": {"a.c.a": 3, "a.c.b": 4}}}, ), ], ) -def test_validate_layers_error(layers, conflicting_datasets, mocker): - mock_catalog = mocker.MagicMock() - mock_catalog.layers = layers - error_str = ", ".join(conflicting_datasets) - - pattern = ( - f"Transcoded datasets should have the same layer. " - f"Mismatch found for: {error_str}" - ) - with pytest.raises(ValueError, match=re.escape(pattern)): - _validate_layers_for_transcoding(mock_catalog) +def test_update_nested_dict(old_dict: dict, new_dict: dict, expected: dict): + _update_nested_dict(old_dict, new_dict) # _update_nested_dict change dict in place + assert old_dict == expected diff --git a/tests/framework/hooks/__init__.py b/tests/framework/hooks/__init__.py index d2b0d6b3de..e69de29bb2 100644 --- a/tests/framework/hooks/__init__.py +++ b/tests/framework/hooks/__init__.py @@ -1,27 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/framework/hooks/test_manager.py b/tests/framework/hooks/test_manager.py index 038ff034da..42dc3e9f64 100644 --- a/tests/framework/hooks/test_manager.py +++ b/tests/framework/hooks/test_manager.py @@ -1,39 +1,19 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. 
The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. import pytest -from kedro.framework.hooks.manager import _create_hook_manager -from kedro.framework.hooks.specs import DataCatalogSpecs, NodeSpecs, PipelineSpecs +from kedro.framework.hooks.manager import _create_hook_manager, _NullPluginManager +from kedro.framework.hooks.specs import ( + DataCatalogSpecs, + DatasetSpecs, + KedroContextSpecs, + NodeSpecs, + PipelineSpecs, +) @pytest.mark.parametrize( "hook_specs,hook_name,hook_params", [ + (KedroContextSpecs, "after_context_created", ("context")), ( DataCatalogSpecs, "after_catalog_created", @@ -44,23 +24,22 @@ "feed_dict", "save_version", "load_versions", - "run_id", ), ), ( NodeSpecs, "before_node_run", - ("node", "catalog", "inputs", "is_async", "run_id"), + ("node", "catalog", "inputs", "is_async", "session_id"), ), ( NodeSpecs, "after_node_run", - ("node", "catalog", "inputs", "outputs", "is_async", "run_id"), + ("node", "catalog", "inputs", "outputs", "is_async", "session_id"), ), ( NodeSpecs, "on_node_error", - ("error", "node", "catalog", "inputs", "is_async", "run_id"), + ("error", "node", "catalog", "inputs", "is_async", "session_id"), ), (PipelineSpecs, "before_pipeline_run", ("run_params", "pipeline", "catalog")), (PipelineSpecs, "after_pipeline_run", ("run_params", "pipeline", "catalog")), @@ -69,6 +48,10 @@ "on_pipeline_error", ("error", "run_params", "pipeline", "catalog"), ), + (DatasetSpecs, "before_dataset_loaded", ("dataset_name")), + (DatasetSpecs, "after_dataset_loaded", ("dataset_name", "data")), + (DatasetSpecs, "before_dataset_saved", ("dataset_name", "data")), + (DatasetSpecs, "after_dataset_saved", ("dataset_name", "data")), ], ) def test_hook_manager_can_call_hooks_defined_in_specs( @@ -83,3 +66,10 @@ def test_hook_manager_can_call_hooks_defined_in_specs( # since there hasn't been any hook implementation, the result should be empty # but it shouldn't have raised assert result == [] + + +def test_null_plugin_manager_returns_none_when_called(): + plugin_manager = _NullPluginManager() + assert ( + plugin_manager.hook.before_dataset_saved(dataset_name="mock", data=[]) is None + ) diff --git a/tests/framework/project/test_logging.py b/tests/framework/project/test_logging.py new file mode 100644 index 0000000000..17b8a77052 --- /dev/null +++ b/tests/framework/project/test_logging.py @@ -0,0 +1,147 @@ +# pylint: disable=import-outside-toplevel +import logging +import sys +from pathlib import Path + +import pytest +import yaml + +from kedro.framework.project import LOGGING, configure_logging, configure_project + + +@pytest.fixture +def default_logging_config_with_project(): + logging_config = { + "version": 1, + "disable_existing_loggers": False, + "handlers": { + "rich": {"class": "kedro.logging.RichHandler", "rich_tracebacks": True} + }, + "loggers": {"kedro": {"level": "INFO"}, "test_project": {"level": "INFO"}}, + "root": {"handlers": ["rich"]}, + } + return logging_config + + +def test_default_logging_config(default_logging_config): + assert LOGGING.data == default_logging_config + assert "rich" in {handler.name for handler in 
logging.getLogger().handlers}
+    assert logging.getLogger("kedro").level == logging.INFO
+
+
+def test_project_logging_in_default_logging_config(default_logging_config_with_project):
+    configure_project("test_project")
+    assert LOGGING.data == default_logging_config_with_project
+    assert logging.getLogger("kedro").level == logging.INFO
+    assert logging.getLogger("test_project").level == logging.INFO
+
+
+def test_environment_variable_logging_config(monkeypatch, tmp_path):
+    config_path = Path(tmp_path) / "logging.yml"
+    monkeypatch.setenv("KEDRO_LOGGING_CONFIG", config_path.absolute())
+    logging_config = {"version": 1, "loggers": {"kedro": {"level": "WARNING"}}}
+    with config_path.open("w", encoding="utf-8") as f:
+        yaml.dump(logging_config, f)
+    from kedro.framework.project import _ProjectLogging
+
+    LOGGING = _ProjectLogging()
+
+    assert LOGGING.data == logging_config
+    assert logging.getLogger("kedro").level == logging.WARNING
+
+
+def test_configure_logging():
+    logging_config = {"version": 1, "loggers": {"kedro": {"level": "WARNING"}}}
+    configure_logging(logging_config)
+    assert LOGGING.data == logging_config
+    assert logging.getLogger("kedro").level == logging.WARNING
+
+
+def test_rich_traceback_enabled(mocker, default_logging_config):
+    rich_traceback_install = mocker.patch("rich.traceback.install")
+    rich_pretty_install = mocker.patch("rich.pretty.install")
+
+    LOGGING.configure(default_logging_config)
+
+    rich_traceback_install.assert_called()
+    rich_pretty_install.assert_called()
+
+
+def test_rich_traceback_not_installed(mocker, default_logging_config):
+    rich_traceback_install = mocker.patch("rich.traceback.install")
+    rich_pretty_install = mocker.patch("rich.pretty.install")
+    rich_handler = {
+        "class": "kedro.logging.RichHandler",
+        "rich_tracebacks": False,
+    }
+    test_logging_config = default_logging_config
+    test_logging_config["handlers"]["rich"] = rich_handler
+
+    LOGGING.configure(test_logging_config)
+
+    rich_pretty_install.assert_called_once()
+    rich_traceback_install.assert_not_called()
+
+
+def test_rich_traceback_configuration(mocker, default_logging_config):
+    import click
+
+    rich_traceback_install = mocker.patch("rich.traceback.install")
+    rich_pretty_install = mocker.patch("rich.pretty.install")
+
+    sys_executable_path = str(Path(sys.executable).parent)
+    traceback_install_defaults = {"suppress": [click, sys_executable_path]}
+
+    rich_handler = {
+        "class": "kedro.logging.RichHandler",
+        "rich_tracebacks": True,
+        "tracebacks_show_locals": True,
+    }
+
+    test_logging_config = default_logging_config
+    test_logging_config["handlers"]["rich"] = rich_handler
+    LOGGING.configure(test_logging_config)
+
+    expected_install_defaults = traceback_install_defaults
+    expected_install_defaults["show_locals"] = True
+    rich_traceback_install.assert_called_with(**expected_install_defaults)
+    rich_pretty_install.assert_called_once()
+
+
+def test_rich_traceback_configuration_extend_suppress(mocker, default_logging_config):
+    """Test that the configuration is not overridden but extended for `suppress`."""
+    import click
+
+    rich_traceback_install = mocker.patch("rich.traceback.install")
+    rich_pretty_install = mocker.patch("rich.pretty.install")
+
+    sys_executable_path = str(Path(sys.executable).parent)
+    traceback_install_defaults = {"suppress": [click, sys_executable_path]}
+    fake_path = "dummy"
+    rich_handler = {
+        "class": "kedro.logging.RichHandler",
+        "rich_tracebacks": True,
+        "tracebacks_suppress": [fake_path],
+    }
+
+    test_logging_config = default_logging_config
+
test_logging_config["handlers"]["rich"] = rich_handler + LOGGING.configure(test_logging_config) + + expected_install_defaults = traceback_install_defaults + expected_install_defaults["suppress"].extend([fake_path]) + rich_traceback_install.assert_called_with(**expected_install_defaults) + rich_pretty_install.assert_called_once() + + +def test_rich_traceback_disabled_on_databricks( + mocker, monkeypatch, default_logging_config +): + monkeypatch.setenv("DATABRICKS_RUNTIME_VERSION", "1") + rich_traceback_install = mocker.patch("rich.traceback.install") + rich_pretty_install = mocker.patch("rich.pretty.install") + + LOGGING.configure(default_logging_config) + + rich_traceback_install.assert_not_called() + rich_pretty_install.assert_called() diff --git a/tests/framework/project/test_pipeline_discovery.py b/tests/framework/project/test_pipeline_discovery.py new file mode 100644 index 0000000000..b6f23b69ca --- /dev/null +++ b/tests/framework/project/test_pipeline_discovery.py @@ -0,0 +1,260 @@ +import shutil +import sys +import textwrap +import warnings +from pathlib import Path + +import pytest + +from kedro.framework.project import configure_project, find_pipelines + + +@pytest.fixture +def mock_package_name_with_pipelines(tmp_path, request): + package_name = "test_package" + pipelines_dir = tmp_path / package_name / "pipelines" + pipelines_dir.mkdir(parents=True) + (pipelines_dir / "__init__.py").touch() + for pipeline_name in request.param: + pipeline_dir = pipelines_dir / pipeline_name + pipeline_dir.mkdir() + (pipeline_dir / "__init__.py").write_text( + textwrap.dedent( + f""" + from kedro.pipeline import Pipeline, node, pipeline + + + def create_pipeline(**kwargs) -> Pipeline: + return pipeline([node(lambda: 1, None, "{pipeline_name}")]) + """ + ) + ) + sys.path.insert(0, str(tmp_path)) + yield package_name + sys.path.pop(0) + + # Make sure that any new `test_package.pipeline` module gets loaded. + if f"{package_name}.pipeline" in sys.modules: + del sys.modules[f"{package_name}.pipeline"] + + # Make sure that the `importlib_resources.files` in `find_pipelines` + # will point to the correct `test_package.pipelines` not from cache. + if f"{package_name}.pipelines" in sys.modules: + del sys.modules[f"{package_name}.pipelines"] + + +@pytest.fixture +def pipeline_names(request): + return request.param + + +@pytest.mark.parametrize( + "mock_package_name_with_pipelines,pipeline_names", + [(x, x) for x in [set(), {"my_pipeline"}]], + indirect=True, +) +def test_find_pipelines(mock_package_name_with_pipelines, pipeline_names): + configure_project(mock_package_name_with_pipelines) + pipelines = find_pipelines() + assert set(pipelines) == pipeline_names | {"__default__"} + assert sum(pipelines.values()).outputs() == pipeline_names + + +@pytest.mark.parametrize( + "mock_package_name_with_pipelines,pipeline_names", + [(x, x) for x in [set(), {"good_pipeline"}]], + indirect=True, +) +def test_find_pipelines_skips_modules_without_create_pipelines_function( + mock_package_name_with_pipelines, pipeline_names +): + # Create a module without `create_pipelines` in the `pipelines` dir. 
+ pipelines_dir = Path(sys.path[0]) / mock_package_name_with_pipelines / "pipelines" + pipeline_dir = pipelines_dir / "bad_touch" + pipeline_dir.mkdir() + (pipeline_dir / "__init__.py").touch() + + configure_project(mock_package_name_with_pipelines) + with pytest.warns( + UserWarning, match="module does not expose a 'create_pipeline' function" + ): + pipelines = find_pipelines() + assert set(pipelines) == pipeline_names | {"__default__"} + assert sum(pipelines.values()).outputs() == pipeline_names + + +@pytest.mark.parametrize( + "mock_package_name_with_pipelines,pipeline_names", + [(x, x) for x in [set(), {"my_pipeline"}]], + indirect=True, +) +def test_find_pipelines_skips_modules_with_unexpected_return_value_type( + mock_package_name_with_pipelines, pipeline_names +): + # Define `create_pipelines` so that it does not return a `Pipeline`. + pipelines_dir = Path(sys.path[0]) / mock_package_name_with_pipelines / "pipelines" + pipeline_dir = pipelines_dir / "not_my_pipeline" + pipeline_dir.mkdir() + (pipeline_dir / "__init__.py").write_text( + textwrap.dedent( + """ + from __future__ import annotations + + from kedro.pipeline import Pipeline, node, pipeline + + + def create_pipeline(**kwargs) -> dict[str, Pipeline]: + return { + "pipe1": pipeline([node(lambda: 1, None, "pipe1")]), + "pipe2": pipeline([node(lambda: 2, None, "pipe2")]), + } + """ + ) + ) + + configure_project(mock_package_name_with_pipelines) + with pytest.warns( + UserWarning, + match=( + r"Expected the 'create_pipeline' function in the '\S+' " + r"module to return a 'Pipeline' object, got 'dict' instead." + ), + ): + pipelines = find_pipelines() + assert set(pipelines) == pipeline_names | {"__default__"} + assert sum(pipelines.values()).outputs() == pipeline_names + + +@pytest.mark.parametrize( + "mock_package_name_with_pipelines,pipeline_names", + [(x, x) for x in [set(), {"my_pipeline"}]], + indirect=True, +) +def test_find_pipelines_skips_regular_files_within_the_pipelines_folder( + mock_package_name_with_pipelines, pipeline_names +): + # Create a regular file (not a subdirectory) in the `pipelines` dir. + pipelines_dir = Path(sys.path[0]) / mock_package_name_with_pipelines / "pipelines" + (pipelines_dir / "not_my_pipeline.py").touch() + + configure_project(mock_package_name_with_pipelines) + with warnings.catch_warnings(): + warnings.filterwarnings("error", category=UserWarning) + pipelines = find_pipelines() + assert set(pipelines) == pipeline_names | {"__default__"} + assert sum(pipelines.values()).outputs() == pipeline_names + + +@pytest.mark.parametrize( + "mock_package_name_with_pipelines,pipeline_names", + [(x, x) for x in [set(), {"my_pipeline"}]], + indirect=True, +) +def test_find_pipelines_skips_modules_that_cause_exceptions_upon_import( + mock_package_name_with_pipelines, pipeline_names +): + # Create a module that will result in errors when we try to load it. + pipelines_dir = Path(sys.path[0]) / mock_package_name_with_pipelines / "pipelines" + pipeline_dir = pipelines_dir / "boulevard_of_broken_pipelines" + pipeline_dir.mkdir() + (pipeline_dir / "__init__.py").write_text("I walk a lonely road...") + + configure_project(mock_package_name_with_pipelines) + with pytest.warns( + UserWarning, match=r"An error occurred while importing the '\S+' module." 
+ ): + pipelines = find_pipelines() + assert set(pipelines) == pipeline_names | {"__default__"} + assert sum(pipelines.values()).outputs() == pipeline_names + + +@pytest.mark.parametrize( + "mock_package_name_with_pipelines,pipeline_names", + [(x, x) for x in [set(), {"my_pipeline"}]], + indirect=True, +) +def test_find_pipelines_handles_simplified_project_structure( + mock_package_name_with_pipelines, pipeline_names +): + (Path(sys.path[0]) / mock_package_name_with_pipelines / "pipeline.py").write_text( + textwrap.dedent( + """ + from kedro.pipeline import Pipeline, node, pipeline + + + def create_pipeline(**kwargs) -> Pipeline: + return pipeline([node(lambda: 1, None, "simple_pipeline")]) + """ + ) + ) + + configure_project(mock_package_name_with_pipelines) + pipelines = find_pipelines() + assert set(pipelines) == pipeline_names | {"__default__"} + assert sum(pipelines.values()).outputs() == pipeline_names | {"simple_pipeline"} + + +@pytest.mark.parametrize( + "mock_package_name_with_pipelines,pipeline_names", + [(x, x) for x in [set(), {"my_pipeline"}]], + indirect=True, +) +def test_find_pipelines_skips_unimportable_pipeline_module( + mock_package_name_with_pipelines, pipeline_names +): + (Path(sys.path[0]) / mock_package_name_with_pipelines / "pipeline.py").write_text( + textwrap.dedent( + f""" + import {"".join(pipeline_names)} + + from kedro.pipeline import Pipeline, node, pipeline + + + def create_pipeline(**kwargs) -> Pipeline: + return pipeline([node(lambda: 1, None, "simple_pipeline")]) + """ + ) + ) + + configure_project(mock_package_name_with_pipelines) + with pytest.warns( + UserWarning, match=r"An error occurred while importing the '\S+' module." + ): + pipelines = find_pipelines() + assert set(pipelines) == pipeline_names | {"__default__"} + assert sum(pipelines.values()).outputs() == pipeline_names + + +@pytest.mark.parametrize( + "mock_package_name_with_pipelines,simplified", + [(set(), False), (set(), True)], + indirect=["mock_package_name_with_pipelines"], +) +def test_find_pipelines_handles_project_structure_without_pipelines_dir( + mock_package_name_with_pipelines, simplified +): + # Delete the `pipelines` directory to simulate a project without it. + pipelines_dir = Path(sys.path[0]) / mock_package_name_with_pipelines / "pipelines" + shutil.rmtree(pipelines_dir) + + if simplified: + ( + Path(sys.path[0]) / mock_package_name_with_pipelines / "pipeline.py" + ).write_text( + textwrap.dedent( + """ + from kedro.pipeline import Pipeline, node, pipeline + + + def create_pipeline(**kwargs) -> Pipeline: + return pipeline([node(lambda: 1, None, "simple_pipeline")]) + """ + ) + ) + + configure_project(mock_package_name_with_pipelines) + pipelines = find_pipelines() + assert set(pipelines) == {"__default__"} + assert sum(pipelines.values()).outputs() == ( + {"simple_pipeline"} if simplified else set() + ) diff --git a/tests/framework/project/test_pipeline_registry.py b/tests/framework/project/test_pipeline_registry.py index 933df7b800..b210de009d 100644 --- a/tests/framework/project/test_pipeline_registry.py +++ b/tests/framework/project/test_pipeline_registry.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
-# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
-# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-#
-# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
-# (either separately or in combination, "QuantumBlack Trademarks") are
-# trademarks of QuantumBlack. The License does not grant you any right or
-# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
-# Trademarks or any confusingly similar mark as a trademark for your product,
-# or use the QuantumBlack Trademarks in any other manner that might cause
-# confusion in the marketplace, including but not limited to in advertising,
-# on websites, or on software.
-#
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import sys
 import textwrap
@@ -52,7 +25,15 @@ def register_pipelines():
     sys.path.pop(0)
 
 
-def test_pipelines_without_configure_project_is_empty():
+def test_pipelines_without_configure_project_is_empty(
+    mock_package_name_with_pipelines_file,  # pylint: disable=unused-argument
+):
+    # Reimport `pipelines` from `kedro.framework.project` to ensure that
+    # it was not set by a prior call to the `configure_project` function.
+    del sys.modules["kedro.framework.project"]
+    # pylint: disable=reimported, import-outside-toplevel
+    from kedro.framework.project import pipelines
+
     assert pipelines == {}
 
 
@@ -78,17 +59,15 @@ def register_pipelines():
 def test_pipelines_after_configuring_project_shows_updated_values(
-    mock_package_name_with_pipelines_file, mocker
+    mock_package_name_with_pipelines_file,
 ):
-    mocker.patch("kedro.framework.project._validate_module")
     configure_project(mock_package_name_with_pipelines_file)
     assert isinstance(pipelines["new_pipeline"], Pipeline)
 
 
 def test_configure_project_should_not_raise_for_unimportable_pipelines(
-    mock_package_name_with_unimportable_pipelines_file, mocker
+    mock_package_name_with_unimportable_pipelines_file,
 ):
-    mocker.patch("kedro.framework.project._validate_module")
     # configure_project should not raise error for unimportable pipelines
     # since pipelines loading is lazy
     configure_project(mock_package_name_with_unimportable_pipelines_file)
diff --git a/tests/framework/project/test_settings.py b/tests/framework/project/test_settings.py
index 727486950e..65774e0e37 100644
--- a/tests/framework/project/test_settings.py
+++ b/tests/framework/project/test_settings.py
@@ -1,67 +1,66 @@
-# Copyright 2021 QuantumBlack Visual Analytics Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
-# NONINFRINGEMENT.
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. import sys import textwrap -from unittest import mock import pytest +from kedro.config import ConfigLoader, TemplatedConfigLoader from kedro.framework.context.context import KedroContext -from kedro.framework.project import configure_project, settings +from kedro.framework.project import configure_project, settings, validate_settings +from kedro.framework.session.shelvestore import ShelveStore from kedro.framework.session.store import BaseSessionStore +from kedro.io import DataCatalog -MOCK_CONTEXT_CLASS = mock.patch( - "kedro.framework.context.context.KedroContext", autospec=True -) +class MyContext(KedroContext): + pass + + +class MyDataCatalog(DataCatalog): + pass -def test_settings_without_configure_project_show_default_values(): - assert settings.CONF_SOURCE == "conf" - assert settings.CONTEXT_CLASS is KedroContext - assert settings.SESSION_STORE_CLASS is BaseSessionStore - assert settings.SESSION_STORE_ARGS == {} - assert len(settings.DISABLE_HOOKS_FOR_PLUGINS) == 0 + +class ProjectHooks: # pylint: disable=too-few-public-methods + pass @pytest.fixture def mock_package_name_with_settings_file(tmpdir): + """This mock settings file tests everything that can be customised in settings.py. + Where there are suggestions in the project template settings.py (e.g. as for + CONFIG_LOADER_CLASS), those suggestions should be tested.""" old_settings = settings.as_dict() settings_file_path = tmpdir.mkdir("test_package").join("settings.py") + project_path, package_name, _ = str(settings_file_path).rpartition("test_package") settings_file_path.write( textwrap.dedent( f""" - from {__name__} import MOCK_CONTEXT_CLASS + from {__name__} import ProjectHooks + HOOKS = (ProjectHooks(),) + + DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) + + from kedro.framework.session.shelvestore import ShelveStore + SESSION_STORE_CLASS = ShelveStore + SESSION_STORE_ARGS = {{ + "path": "./sessions" + }} + + from {__name__} import MyContext + CONTEXT_CLASS = MyContext CONF_SOURCE = "test_conf" - CONTEXT_CLASS = MOCK_CONTEXT_CLASS + + from kedro.config import TemplatedConfigLoader + CONFIG_LOADER_CLASS = TemplatedConfigLoader + CONFIG_LOADER_ARGS = {{ + "globals_pattern": "*globals.yml", + }} + + # Class that manages the Data Catalog. 
+ from {__name__} import MyDataCatalog + DATA_CATALOG_CLASS = MyDataCatalog """ ) ) - project_path, package_name, _ = str(settings_file_path).rpartition("test_package") sys.path.insert(0, project_path) yield package_name sys.path.pop(0) @@ -70,9 +69,34 @@ def mock_package_name_with_settings_file(tmpdir): settings.set(key, value) +def test_settings_without_configure_project_shows_default_values(): + assert len(settings.HOOKS) == 0 + assert settings.DISABLE_HOOKS_FOR_PLUGINS.to_list() == [] + assert settings.SESSION_STORE_CLASS is BaseSessionStore + assert settings.SESSION_STORE_ARGS == {} + assert settings.CONTEXT_CLASS is KedroContext + assert settings.CONF_SOURCE == "conf" + assert settings.CONFIG_LOADER_CLASS == ConfigLoader + assert settings.CONFIG_LOADER_ARGS == {} + assert settings.DATA_CATALOG_CLASS == DataCatalog + + def test_settings_after_configuring_project_shows_updated_values( mock_package_name_with_settings_file, ): configure_project(mock_package_name_with_settings_file) + assert len(settings.HOOKS) == 1 and isinstance(settings.HOOKS[0], ProjectHooks) + assert settings.DISABLE_HOOKS_FOR_PLUGINS.to_list() == ["kedro-viz"] + assert settings.SESSION_STORE_CLASS is ShelveStore + assert settings.SESSION_STORE_ARGS == {"path": "./sessions"} + assert settings.CONTEXT_CLASS is MyContext assert settings.CONF_SOURCE == "test_conf" - assert settings.CONTEXT_CLASS is MOCK_CONTEXT_CLASS + assert settings.CONFIG_LOADER_CLASS == TemplatedConfigLoader + assert settings.CONFIG_LOADER_ARGS == {"globals_pattern": "*globals.yml"} + assert settings.DATA_CATALOG_CLASS == MyDataCatalog + + +def test_validate_settings_with_empty_package_name(): + with pytest.raises(ValueError): + configure_project(None) # Simulate outside of project mode + validate_settings() diff --git a/tests/framework/session/conftest.py b/tests/framework/session/conftest.py index 2623f8d8d6..1ac0de6301 100644 --- a/tests/framework/session/conftest.py +++ b/tests/framework/session/conftest.py @@ -1,35 +1,10 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
+from __future__ import annotations + import logging from logging.handlers import QueueHandler, QueueListener from multiprocessing import Queue from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any import pandas as pd import pytest @@ -38,57 +13,43 @@ from dynaconf.validator import Validator from kedro import __version__ as kedro_version +from kedro.framework.context.context import KedroContext from kedro.framework.hooks import hook_impl -from kedro.framework.hooks.manager import get_hook_manager -from kedro.framework.project import _ProjectPipelines, _ProjectSettings +from kedro.framework.project import ( + _ProjectPipelines, + _ProjectSettings, + configure_project, +) from kedro.framework.session import KedroSession from kedro.io import DataCatalog from kedro.pipeline import Pipeline +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.pipeline.node import Node, node -from kedro.versioning import Journal logger = logging.getLogger(__name__) - -@pytest.fixture -def mock_package_name() -> str: - return "mock_package_name" +MOCK_PACKAGE_NAME = "fake_package" @pytest.fixture -def local_logging_config() -> Dict[str, Any]: - return { - "version": 1, - "formatters": { - "simple": {"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"} - }, - "root": {"level": "INFO", "handlers": ["console"]}, - "loggers": {"kedro": {"level": "INFO", "handlers": ["console"]}}, - "handlers": { - "console": { - "class": "logging.StreamHandler", - "level": "INFO", - "formatter": "simple", - "stream": "ext://sys.stdout", - } - }, - } +def mock_package_name() -> str: + return MOCK_PACKAGE_NAME -def _write_yaml(filepath: Path, config: Dict): +def _write_yaml(filepath: Path, config: dict): filepath.parent.mkdir(parents=True, exist_ok=True) yaml_str = yaml.dump(config) filepath.write_text(yaml_str) -def _write_toml(filepath: Path, config: Dict): +def _write_toml(filepath: Path, config: dict): filepath.parent.mkdir(parents=True, exist_ok=True) toml_str = toml.dumps(config) filepath.write_text(toml_str) def _assert_hook_call_record_has_expected_parameters( - call_record: logging.LogRecord, expected_parameters: List[str] + call_record: logging.LogRecord, expected_parameters: list[str] ): """Assert the given call record has all expected parameters.""" for param in expected_parameters: @@ -119,27 +80,16 @@ def local_config(tmp_path): @pytest.fixture(autouse=True) -def clear_hook_manager(): - yield - hook_manager = get_hook_manager() - plugins = hook_manager.get_plugins() - for plugin in plugins: - hook_manager.unregister(plugin) - - -@pytest.fixture(autouse=True) -def config_dir(tmp_path, local_config, local_logging_config): +def config_dir(tmp_path, local_config): catalog = tmp_path / "conf" / "base" / "catalog.yml" credentials = tmp_path / "conf" / "local" / "credentials.yml" - logging = tmp_path / "conf" / "local" / "logging.yml" pyproject_toml = tmp_path / "pyproject.toml" _write_yaml(catalog, local_config) _write_yaml(credentials, {"dev_s3": "foo"}) - _write_yaml(logging, local_logging_config) payload = { "tool": { "kedro": { - "project_version": kedro_version, + "kedro_init_version": kedro_version, "project_name": "test hooks", "package_name": "test_hooks", } @@ -163,7 +113,7 @@ def dummy_dataframe() -> pd.DataFrame: @pytest.fixture def mock_pipeline() -> Pipeline: - return Pipeline( + return modular_pipeline( [ node(identity_node, "cars", "planes", name="node1"), node(identity_node, "boats", "ships", name="node2"), @@ -220,12 +170,11 
@@ class LoggingHooks: def after_catalog_created( self, catalog: DataCatalog, - conf_catalog: Dict[str, Any], - conf_creds: Dict[str, Any], - feed_dict: Dict[str, Any], + conf_catalog: dict[str, Any], + conf_creds: dict[str, Any], + feed_dict: dict[str, Any], save_version: str, - load_versions: Dict[str, str], - run_id: str, + load_versions: dict[str, str], ): logger.info( "Catalog created", @@ -236,7 +185,6 @@ def after_catalog_created( "feed_dict": feed_dict, "save_version": save_version, "load_versions": load_versions, - "run_id": run_id, }, ) @@ -245,9 +193,9 @@ def before_node_run( self, node: Node, catalog: DataCatalog, - inputs: Dict[str, Any], + inputs: dict[str, Any], is_async: str, - run_id: str, + session_id: str, ) -> None: logger.info( "About to run node", @@ -256,7 +204,7 @@ def before_node_run( "catalog": catalog, "inputs": inputs, "is_async": is_async, - "run_id": run_id, + "session_id": session_id, }, ) @@ -265,10 +213,10 @@ def after_node_run( self, node: Node, catalog: DataCatalog, - inputs: Dict[str, Any], - outputs: Dict[str, Any], + inputs: dict[str, Any], + outputs: dict[str, Any], is_async: str, - run_id: str, + session_id: str, ) -> None: logger.info( "Ran node", @@ -278,7 +226,7 @@ def after_node_run( "inputs": inputs, "outputs": outputs, "is_async": is_async, - "run_id": run_id, + "session_id": session_id, }, ) @@ -288,9 +236,9 @@ def on_node_error( error: Exception, node: Node, catalog: DataCatalog, - inputs: Dict[str, Any], + inputs: dict[str, Any], is_async: bool, - run_id: str, + session_id: str, ): logger.info( "Node error", @@ -300,13 +248,13 @@ def on_node_error( "catalog": catalog, "inputs": inputs, "is_async": is_async, - "run_id": run_id, + "session_id": session_id, }, ) @hook_impl def before_pipeline_run( - self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog + self, run_params: dict[str, Any], pipeline: Pipeline, catalog: DataCatalog ) -> None: logger.info( "About to run pipeline", @@ -316,8 +264,8 @@ def before_pipeline_run( @hook_impl def after_pipeline_run( self, - run_params: Dict[str, Any], - run_result: Dict[str, Any], + run_params: dict[str, Any], + run_result: dict[str, Any], pipeline: Pipeline, catalog: DataCatalog, ) -> None: @@ -335,7 +283,7 @@ def after_pipeline_run( def on_pipeline_error( self, error: Exception, - run_params: Dict[str, Any], + run_params: dict[str, Any], pipeline: Pipeline, catalog: DataCatalog, ) -> None: @@ -350,57 +298,35 @@ def on_pipeline_error( ) @hook_impl - def before_dataset_loaded(self, dataset_name: str) -> None: - logger.info("Before dataset loaded", extra={"dataset_name": dataset_name}) - - @hook_impl - def after_dataset_loaded(self, dataset_name: str, data: Any) -> None: + def before_dataset_loaded(self, dataset_name: str, node: Node) -> None: logger.info( - "After dataset loaded", extra={"dataset_name": dataset_name, "data": data} + "Before dataset loaded", extra={"dataset_name": dataset_name, "node": node} ) @hook_impl - def before_dataset_saved(self, dataset_name: str, data: Any) -> None: + def after_dataset_loaded(self, dataset_name: str, data: Any, node: Node) -> None: logger.info( - "Before dataset saved", extra={"dataset_name": dataset_name, "data": data} + "After dataset loaded", + extra={"dataset_name": dataset_name, "data": data, "node": node}, ) @hook_impl - def after_dataset_saved(self, dataset_name: str, data: Any) -> None: + def before_dataset_saved(self, dataset_name: str, data: Any, node: Node) -> None: logger.info( - "After dataset saved", extra={"dataset_name": 
dataset_name, "data": data} + "Before dataset saved", + extra={"dataset_name": dataset_name, "data": data, "node": node}, ) @hook_impl - def register_catalog( - self, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: + def after_dataset_saved(self, dataset_name: str, data: Any, node: Node) -> None: logger.info( - "Registering catalog", - extra={ - "catalog": catalog, - "credentials": credentials, - "load_versions": load_versions, - "save_version": save_version, - "journal": journal, - }, + "After dataset saved", + extra={"dataset_name": dataset_name, "data": data, "node": node}, ) - return DataCatalog.from_config( - catalog, credentials, load_versions, save_version, journal - ) - -@pytest.fixture(autouse=True) -def patched_validate_module(mocker): - """Patching this so KedroSession could be created for testing purpose - since KedroSession.create is still calling configure_project at the moment - """ - mocker.patch("kedro.framework.project._validate_module") + @hook_impl + def after_context_created(self, context: KedroContext) -> None: + logger.info("After context created", extra={"context": context}) @pytest.fixture @@ -409,13 +335,6 @@ def project_hooks(): return LoggingHooks() -@pytest.fixture(autouse=True) -def mock_logging(mocker): - # Disable logging.config.dictConfig in KedroSession._setup_logging as - # it changes logging.config and affects other unit tests - return mocker.patch("logging.config.dictConfig") - - @pytest.fixture(autouse=True) def mock_pipelines(mocker, mock_pipeline): def mock_register_pipelines(): @@ -434,9 +353,9 @@ def mock_register_pipelines(): def _mock_imported_settings_paths(mocker, mock_settings): for path in [ - "kedro.framework.context.context.settings", "kedro.framework.session.session.settings", "kedro.framework.project.settings", + "kedro.runner.parallel_runner.settings", ]: mocker.patch(path, mock_settings) return mock_settings @@ -454,6 +373,17 @@ class MockSettings(_ProjectSettings): def mock_session( mock_settings, mock_package_name, tmp_path ): # pylint: disable=unused-argument - return KedroSession.create( + configure_project(mock_package_name) + session = KedroSession.create( mock_package_name, tmp_path, extra_params={"params:key": "value"} ) + yield session + session.close() + + +@pytest.fixture(autouse=True) +def mock_validate_settings(mocker): + # KedroSession eagerly validates that a project's settings.py is correct by + # importing it. settings.py does not actually exists as part of this test suite + # since we are testing session in isolation, so the validation is patched. + mocker.patch("kedro.framework.session.session.validate_settings") diff --git a/tests/framework/session/test_session.py b/tests/framework/session/test_session.py index 9a97a805b8..398d9278c9 100644 --- a/tests/framework/session/test_session.py +++ b/tests/framework/session/test_session.py @@ -1,52 +1,32 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -import json import logging import re import subprocess import textwrap +from collections.abc import Mapping from pathlib import Path import pytest import toml +import yaml +from omegaconf import OmegaConf from kedro import __version__ as kedro_version +from kedro.config import AbstractConfigLoader, ConfigLoader, OmegaConfigLoader +from kedro.framework.cli.utils import _split_params from kedro.framework.context import KedroContext from kedro.framework.project import ( + LOGGING, ValidationError, Validator, + _HasSharedParentClassValidator, + _IsSubclassValidator, _ProjectSettings, - configure_project, ) -from kedro.framework.session import KedroSession, get_current_session -from kedro.framework.session.store import BaseSessionStore, ShelveStore +from kedro.framework.session import KedroSession +from kedro.framework.session.session import KedroSessionError +from kedro.framework.session.shelvestore import ShelveStore +from kedro.framework.session.store import BaseSessionStore -_FAKE_PACKAGE_NAME = "fake_package" _FAKE_PROJECT_NAME = "fake_project" _FAKE_PIPELINE_NAME = "fake_pipeline" @@ -57,11 +37,20 @@ class BadStore: # pylint: disable=too-few-public-methods """ -@pytest.fixture(autouse=True) -def mocked_logging(mocker): - # Disable logging.config.dictConfig in KedroSession._setup_logging as - # it changes logging.config and affects other unit tests - return mocker.patch("logging.config.dictConfig") +class BadConfigLoader: # pylint: disable=too-few-public-methods + """ + ConfigLoader class that doesn't subclass `AbstractConfigLoader`, for testing only. 
+ """ + + +@pytest.fixture +def mock_runner(mocker): + mock_runner = mocker.patch( + "kedro.runner.sequential_runner.SequentialRunner", + autospec=True, + ) + mock_runner.__name__ = "MockRunner" + return mock_runner @pytest.fixture @@ -73,7 +62,6 @@ def _mock_imported_settings_paths(mocker, mock_settings): for path in [ "kedro.framework.project.settings", "kedro.framework.session.session.settings", - "kedro.framework.context.context.settings", ]: mocker.patch(path, mock_settings) return mock_settings @@ -105,6 +93,54 @@ class MockSettings(_ProjectSettings): return _mock_imported_settings_paths(mocker, MockSettings()) +@pytest.fixture +def mock_settings_custom_config_loader_class(mocker): + class MyConfigLoader(ConfigLoader): + pass + + class MockSettings(_ProjectSettings): + _CONFIG_LOADER_CLASS = _HasSharedParentClassValidator( + "CONFIG_LOADER_CLASS", default=lambda *_: MyConfigLoader + ) + + return _mock_imported_settings_paths(mocker, MockSettings()) + + +@pytest.fixture +def mock_settings_omega_config_loader_class(mocker): + class MockSettings(_ProjectSettings): + _CONFIG_LOADER_CLASS = _HasSharedParentClassValidator( + "CONFIG_LOADER_CLASS", default=lambda *_: OmegaConfigLoader + ) + + return _mock_imported_settings_paths(mocker, MockSettings()) + + +@pytest.fixture +def mock_settings_config_loader_args(mocker): + class MockSettings(_ProjectSettings): + _CONFIG_LOADER_ARGS = Validator( + "CONFIG_LOADER_ARGS", + default={"config_patterns": {"spark": ["spark/*"]}}, + ) + + return _mock_imported_settings_paths(mocker, MockSettings()) + + +@pytest.fixture +def mock_settings_file_bad_config_loader_class(tmpdir): + mock_settings_file = tmpdir.join("mock_settings_file.py") + mock_settings_file.write( + textwrap.dedent( + f""" + from {__name__} import BadConfigLoader + CONFIG_LOADER_CLASS = BadConfigLoader + """ + ) + ) + return mock_settings_file + + @pytest.fixture def mock_settings_file_bad_session_store_class(tmpdir): mock_settings_file = tmpdir.join("mock_settings_file.py") @@ -145,7 +181,7 @@ def mock_settings_shelve_session_store(mocker, fake_project): shelve_location = fake_project / "nested" / "sessions" class MockSettings(_ProjectSettings): - _SESSION_STORE_CLASS = Validator( + _SESSION_STORE_CLASS = _IsSubclassValidator( "SESSION_STORE_CLASS", default=lambda *_: ShelveStore ) _SESSION_STORE_ARGS = Validator( @@ -165,35 +201,7 @@ def fake_session_id(mocker): @pytest.fixture -def local_logging_config(): - return { - "version": 1, - "formatters": { - "simple": {"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"} - }, - "root": {"level": "INFO", "handlers": ["console"]}, - "loggers": { - "kedro": {"level": "INFO", "handlers": ["console"], "propagate": False} - }, - "handlers": { - "console": { - "class": "logging.StreamHandler", - "level": "INFO", - "formatter": "simple", - "stream": "ext://sys.stdout", - } - }, - "info_file_handler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "INFO", - "formatter": "simple", - "filename": "logs/info.log", - }, - } - - -@pytest.fixture -def fake_project(mocker, tmp_path, local_logging_config): +def fake_project(tmp_path, mock_package_name): fake_project_dir = Path(tmp_path) / "fake_project" (fake_project_dir / "src").mkdir(parents=True) @@ -201,24 +209,29 @@ def fake_project(mocker, tmp_path, local_logging_config): payload = { "tool": { "kedro": { - "project_version": kedro_version, + "kedro_init_version": kedro_version, "project_name": _FAKE_PROJECT_NAME, - "package_name": _FAKE_PACKAGE_NAME, + "package_name": 
mock_package_name, } } } toml_str = toml.dumps(payload) - pyproject_toml_path.write_text(toml_str) + pyproject_toml_path.write_text(toml_str, encoding="utf-8") - env_logging = fake_project_dir / "conf" / "base" / "logging.yml" - env_logging.parent.mkdir(parents=True) - env_logging.write_text(json.dumps(local_logging_config)) + (fake_project_dir / "conf" / "base").mkdir(parents=True) (fake_project_dir / "conf" / "local").mkdir() - - mocker.patch("kedro.framework.project._validate_module") return fake_project_dir +@pytest.fixture +def fake_username(mocker): + username = "user1" + mocker.patch( + "kedro.framework.session.session.getpass.getuser", return_value=username + ) + return username + + class FakeException(Exception): """Fake exception class for testing purposes""" @@ -236,25 +249,28 @@ def test_create( fake_project, mock_context_class, fake_session_id, + mock_package_name, mocker, env, extra_params, + fake_username, ): mock_click_ctx = mocker.patch("click.get_current_context").return_value + mocker.patch("sys.argv", ["kedro", "run", "--params=x"]) session = KedroSession.create( - _FAKE_PACKAGE_NAME, fake_project, env=env, extra_params=extra_params + mock_package_name, fake_project, env=env, extra_params=extra_params ) expected_cli_data = { "args": mock_click_ctx.args, "params": mock_click_ctx.params, "command_name": mock_click_ctx.command.name, - "command_path": mock_click_ctx.command_path, + "command_path": "kedro run --params=x", } expected_store = { "project_path": fake_project, "session_id": fake_session_id, - "package_name": _FAKE_PACKAGE_NAME, + "package_name": mock_package_name, "cli": expected_cli_data, } if env: @@ -262,91 +278,169 @@ def test_create( if extra_params: expected_store["extra_params"] = extra_params - assert session.store == expected_store - # called for logging setup - mock_context_class.assert_called_once_with( - project_path=fake_project, - package_name=_FAKE_PACKAGE_NAME, - env=env, - extra_params=extra_params, - ) + expected_store["username"] = fake_username + assert session.store == expected_store assert session.load_context() is mock_context_class.return_value + assert isinstance(session._get_config_loader(), ConfigLoader) + + @pytest.mark.usefixtures("mock_settings") + def test_create_multiple_sessions(self, fake_project, mock_package_name): + with KedroSession.create(mock_package_name, fake_project): + with KedroSession.create(mock_package_name, fake_project): + pass @pytest.mark.usefixtures("mock_settings_context_class") def test_create_no_env_extra_params( - self, fake_project, mock_context_class, fake_session_id, mocker + self, + fake_project, + mock_context_class, + fake_session_id, + mock_package_name, + mocker, + fake_username, ): mock_click_ctx = mocker.patch("click.get_current_context").return_value - session = KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + mocker.patch("sys.argv", ["kedro", "run", "--params=x"]) + session = KedroSession.create(mock_package_name, fake_project) expected_cli_data = { "args": mock_click_ctx.args, "params": mock_click_ctx.params, "command_name": mock_click_ctx.command.name, - "command_path": mock_click_ctx.command_path, + "command_path": "kedro run --params=x", } expected_store = { "project_path": fake_project, "session_id": fake_session_id, - "package_name": _FAKE_PACKAGE_NAME, + "package_name": mock_package_name, "cli": expected_cli_data, } - assert session.store == expected_store - mock_context_class.assert_called_once_with( - project_path=fake_project, - package_name=_FAKE_PACKAGE_NAME, - env=None, - 
extra_params=None, - ) + expected_store["username"] = fake_username + assert session.store == expected_store assert session.load_context() is mock_context_class.return_value + assert isinstance(session._get_config_loader(), ConfigLoader) @pytest.mark.usefixtures("mock_settings") - def test_load_context_with_envvar(self, fake_project, monkeypatch, mocker): + def test_load_context_with_envvar( + self, fake_project, monkeypatch, mock_package_name, mocker + ): mocker.patch("kedro.config.config.ConfigLoader.get") + monkeypatch.setenv("KEDRO_ENV", "my_fake_env") - session = KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + session = KedroSession.create(mock_package_name, fake_project) result = session.load_context() assert isinstance(result, KedroContext) assert result.__class__.__name__ == "KedroContext" assert result.env == "my_fake_env" + @pytest.mark.usefixtures("mock_settings") + def test_load_config_loader_with_envvar( + self, fake_project, monkeypatch, mock_package_name, mocker + ): + mocker.patch("kedro.config.config.ConfigLoader.get") + monkeypatch.setenv("KEDRO_ENV", "my_fake_env") + + session = KedroSession.create(mock_package_name, fake_project) + result = session._get_config_loader() + + assert isinstance(result, ConfigLoader) + assert result.__class__.__name__ == "ConfigLoader" + assert result.env == "my_fake_env" + @pytest.mark.usefixtures("mock_settings_custom_context_class") - def test_load_context_custom_context_class(self, fake_project): - session = KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + def test_load_context_custom_context_class(self, fake_project, mock_package_name): + session = KedroSession.create(mock_package_name, fake_project) result = session.load_context() assert isinstance(result, KedroContext) assert result.__class__.__name__ == "MyContext" + @pytest.mark.usefixtures("mock_settings_custom_config_loader_class") + def test_load_config_loader_custom_config_loader_class( + self, fake_project, mock_package_name + ): + session = KedroSession.create(mock_package_name, fake_project) + result = session._get_config_loader() + + assert isinstance(result, AbstractConfigLoader) + assert result.__class__.__name__ == "MyConfigLoader" + + @pytest.mark.usefixtures("mock_settings_config_loader_args") + def test_load_config_loader_args(self, fake_project, mock_package_name, mocker): + session = KedroSession.create(mock_package_name, fake_project) + result = session._get_config_loader() + + assert isinstance(result, ConfigLoader) + assert result.config_patterns["catalog"] == [ + "catalog*", + "catalog*/**", + "**/catalog*", + ] + assert result.config_patterns["spark"] == ["spark/*"] + mocker.patch( + "kedro.config.config.ConfigLoader.get", + return_value=["spark/*"], + ) + assert result["spark"] == ["spark/*"] + + def test_broken_config_loader(self, mock_settings_file_bad_config_loader_class): + pattern = ( + "Invalid value 'tests.framework.session.test_session.BadConfigLoader' received " + "for setting 'CONFIG_LOADER_CLASS'. " + "It must be a subclass of 'kedro.config.abstract_config.AbstractConfigLoader'." 
+ ) + mock_settings = _ProjectSettings( + settings_file=str(mock_settings_file_bad_config_loader_class) + ) + with pytest.raises(ValidationError, match=re.escape(pattern)): + assert mock_settings.CONFIG_LOADER_CLASS + + def test_logging_is_not_reconfigure( + self, fake_project, caplog, mock_package_name, mocker + ): + caplog.set_level(logging.DEBUG, logger="kedro") + + mock_logging = mocker.patch.object(LOGGING, "configure") + session = KedroSession.create(mock_package_name, fake_project) + session.close() + + mock_logging.assert_not_called() + @pytest.mark.usefixtures("mock_settings_context_class") - def test_default_store(self, fake_project, fake_session_id, caplog): - session = KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + def test_default_store( + self, fake_project, fake_session_id, caplog, mock_package_name + ): + caplog.set_level(logging.DEBUG, logger="kedro") + + session = KedroSession.create(mock_package_name, fake_project) assert isinstance(session.store, dict) assert session._store.__class__ is BaseSessionStore assert session._store._path == (fake_project / "sessions").as_posix() assert session._store._session_id == fake_session_id session.close() expected_log_messages = [ - "`read()` not implemented for `BaseSessionStore`. Assuming empty store.", - "`save()` not implemented for `BaseSessionStore`. Skipping the step.", + "'read()' not implemented for 'BaseSessionStore'. Assuming empty store.", + "'save()' not implemented for 'BaseSessionStore'. Skipping the step.", ] actual_log_messages = [ rec.getMessage() for rec in caplog.records - if rec.name == STORE_LOGGER_NAME and rec.levelno == logging.INFO + if rec.name == STORE_LOGGER_NAME and rec.levelno == logging.DEBUG ] assert actual_log_messages == expected_log_messages @pytest.mark.usefixtures("mock_settings_shelve_session_store") - def test_shelve_store(self, fake_project, fake_session_id, caplog, mocker): + def test_shelve_store( + self, fake_project, fake_session_id, caplog, mock_package_name, mocker + ): mocker.patch("pathlib.Path.is_file", return_value=True) shelve_location = fake_project / "nested" / "sessions" - other = KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + other = KedroSession.create(mock_package_name, fake_project) assert other._store.__class__ is ShelveStore assert other._store._path == shelve_location.as_posix() assert other._store._location == shelve_location / fake_session_id / "store" @@ -358,15 +452,15 @@ def test_shelve_store(self, fake_project, fake_session_id, caplog, mocker): actual_log_messages = [ rec.getMessage() for rec in caplog.records - if rec.name == STORE_LOGGER_NAME and rec.levelno == logging.INFO + if rec.name == STORE_LOGGER_NAME and rec.levelno == logging.DEBUG ] assert not actual_log_messages def test_wrong_store_type(self, mock_settings_file_bad_session_store_class): pattern = ( - "Invalid value `tests.framework.session.test_session.BadStore` received " - "for setting `SESSION_STORE_CLASS`. " - "It must be a subclass of `kedro.framework.session.store.BaseSessionStore`." + "Invalid value 'tests.framework.session.test_session.BadStore' received " + "for setting 'SESSION_STORE_CLASS'. " + "It must be a subclass of 'kedro.framework.session.store.BaseSessionStore'." 
) mock_settings = _ProjectSettings( settings_file=str(mock_settings_file_bad_session_store_class) @@ -376,25 +470,26 @@ def test_wrong_store_type(self, mock_settings_file_bad_session_store_class): assert mock_settings.SESSION_STORE_CLASS @pytest.mark.usefixtures("mock_settings_bad_session_store_args") - def test_wrong_store_args(self, fake_project): + def test_wrong_store_args(self, fake_project, mock_package_name): classpath = f"{BaseSessionStore.__module__}.{BaseSessionStore.__qualname__}" pattern = ( f"Store config must only contain arguments valid for " - f"the constructor of `{classpath}`." + f"the constructor of '{classpath}'." ) with pytest.raises(ValueError, match=re.escape(pattern)): - KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + KedroSession.create(mock_package_name, fake_project) def test_store_uncaught_error( self, fake_project, fake_session_id, mock_settings_uncaught_session_store_exception, + mock_package_name, ): classpath = f"{BaseSessionStore.__module__}.{BaseSessionStore.__qualname__}" - pattern = f"Failed to instantiate session store of type `{classpath}`." + pattern = f"Failed to instantiate session store of type '{classpath}'." with pytest.raises(ValueError, match=re.escape(pattern)): - KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + KedroSession.create(mock_package_name, fake_project) mock_settings_uncaught_session_store_exception.assert_called_once_with( path="path", session_id=fake_session_id @@ -404,7 +499,7 @@ def test_store_uncaught_error( @pytest.mark.parametrize("fake_git_status", ["dirty", ""]) @pytest.mark.parametrize("fake_commit_hash", ["fake_commit_hash"]) def test_git_describe( - self, fake_project, fake_commit_hash, fake_git_status, mocker + self, fake_project, fake_commit_hash, fake_git_status, mock_package_name, mocker ): """Test that git information is added to the session store""" mocker.patch( @@ -412,7 +507,7 @@ def test_git_describe( side_effect=[fake_commit_hash.encode(), fake_git_status.encode()], ) - session = KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + session = KedroSession.create(mock_package_name, fake_project) expected_git_info = { "commit_sha": fake_commit_hash, "dirty": bool(fake_git_status), @@ -428,80 +523,142 @@ def test_git_describe( NotADirectoryError, ], ) - def test_git_describe_error(self, fake_project, exception, mocker, caplog): + def test_git_describe_error( + self, fake_project, exception, mock_package_name, mocker, caplog + ): """Test that git information is not added to the session store if call to git fails """ + caplog.set_level(logging.DEBUG, logger="kedro") + mocker.patch("subprocess.check_output", side_effect=exception) - session = KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + session = KedroSession.create(mock_package_name, fake_project) assert "git" not in session.store - expected_log_messages = [f"Unable to git describe {fake_project}"] + expected_log_message = f"Unable to git describe {fake_project}" actual_log_messages = [ rec.getMessage() for rec in caplog.records - if rec.name == SESSION_LOGGER_NAME and rec.levelno == logging.WARN + if rec.name == SESSION_LOGGER_NAME and rec.levelno == logging.DEBUG + ] + assert expected_log_message in actual_log_messages + + def test_get_username_error(self, fake_project, mock_package_name, mocker, caplog): + """Test that username information is not added to the session store + if call to getuser() fails + """ + caplog.set_level(logging.DEBUG, logger="kedro") + + mocker.patch("subprocess.check_output") + mocker.patch("getpass.getuser", 
side_effect=FakeException("getuser error")) + session = KedroSession.create(mock_package_name, fake_project) + assert "username" not in session.store + + expected_log_messages = [ + "Unable to get username. Full exception: getuser error" + ] + actual_log_messages = [ + rec.getMessage() + for rec in caplog.records + if rec.name == SESSION_LOGGER_NAME and rec.levelno == logging.DEBUG ] assert actual_log_messages == expected_log_messages @pytest.mark.usefixtures("mock_settings") - def test_log_error(self, fake_project): + def test_log_error(self, fake_project, mock_package_name): """Test logging the error by the session""" # test that the error is not swallowed by the session with pytest.raises(FakeException), KedroSession.create( - _FAKE_PACKAGE_NAME, fake_project + mock_package_name, fake_project ) as session: raise FakeException exception = session.store["exception"] assert exception["type"] == "tests.framework.session.test_session.FakeException" - assert exception["value"] == "" + assert not exception["value"] assert any( "raise FakeException" in tb_line for tb_line in exception["traceback"] ) - @pytest.mark.usefixtures("mock_settings") - def test_get_current_session(self, fake_project, mocker): - assert get_current_session(silent=True) is None # no sessions yet - - pattern = "There is no active Kedro session" - with pytest.raises(RuntimeError, match=pattern): - get_current_session() - - mocker.patch("kedro.framework.project._validate_module") - configure_project(_FAKE_PACKAGE_NAME) - session1 = KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) - session2 = KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + @pytest.mark.usefixtures("mock_settings_context_class") + @pytest.mark.parametrize("fake_pipeline_name", [None, _FAKE_PIPELINE_NAME]) + def test_run( + self, + fake_project, + fake_session_id, + fake_pipeline_name, + mock_context_class, + mock_package_name, + mock_runner, + mocker, + ): + """Test running the project via the session""" - with session1: - assert get_current_session() is session1 + mock_hook = mocker.patch( + "kedro.framework.session.session._create_hook_manager" + ).return_value.hook + mock_pipelines = mocker.patch( + "kedro.framework.session.session.pipelines", + return_value={ + _FAKE_PIPELINE_NAME: mocker.Mock(), + "__default__": mocker.Mock(), + }, + ) + mock_context = mock_context_class.return_value + mock_catalog = mock_context._get_catalog.return_value + mock_runner.__name__ = "SequentialRunner" + mock_pipeline = mock_pipelines.__getitem__.return_value.filter.return_value - pattern = ( - "Cannot activate the session as another active session already exists" - ) - with pytest.raises(RuntimeError, match=pattern), session2: - pass # pragma: no cover + with KedroSession.create(mock_package_name, fake_project) as session: + session.run(runner=mock_runner, pipeline_name=fake_pipeline_name) - # session has been closed, so no current sessions should be available - assert get_current_session(silent=True) is None + record_data = { + "session_id": fake_session_id, + "project_path": fake_project.as_posix(), + "env": mock_context.env, + "kedro_version": kedro_version, + "tags": None, + "from_nodes": None, + "to_nodes": None, + "node_names": None, + "from_inputs": None, + "to_outputs": None, + "load_versions": None, + "extra_params": {}, + "pipeline_name": fake_pipeline_name, + "namespace": None, + "runner": mock_runner.__name__, + } - with session2: - assert get_current_session() is session2 + mock_hook.before_pipeline_run.assert_called_once_with( + 
run_params=record_data, pipeline=mock_pipeline, catalog=mock_catalog + ) + mock_runner.run.assert_called_once_with( + mock_pipeline, mock_catalog, session._hook_manager, fake_session_id + ) + mock_hook.after_pipeline_run.assert_called_once_with( + run_params=record_data, + run_result=mock_runner.run.return_value, + pipeline=mock_pipeline, + catalog=mock_catalog, + ) @pytest.mark.usefixtures("mock_settings_context_class") @pytest.mark.parametrize("fake_pipeline_name", [None, _FAKE_PIPELINE_NAME]) - def test_run( + def test_run_multiple_times( # pylint: disable=too-many-locals self, fake_project, fake_session_id, fake_pipeline_name, mock_context_class, + mock_package_name, + mock_runner, mocker, ): - """Test running the project via the session""" + """Test running the project more than once via the session""" mock_hook = mocker.patch( - "kedro.framework.session.session.get_hook_manager" + "kedro.framework.session.session._create_hook_manager" ).return_value.hook mock_pipelines = mocker.patch( "kedro.framework.session.session.pipelines", @@ -512,14 +669,20 @@ def test_run( ) mock_context = mock_context_class.return_value mock_catalog = mock_context._get_catalog.return_value - mock_runner = mocker.Mock() mock_pipeline = mock_pipelines.__getitem__.return_value.filter.return_value - with KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) as session: - session.run(runner=mock_runner, pipeline_name=fake_pipeline_name) + message = ( + "A run has already been completed as part of the active KedroSession. " + "KedroSession has a 1-1 mapping with runs, and thus only one run should be" + " executed per session." + ) + with pytest.raises(Exception, match=message): + with KedroSession.create(mock_package_name, fake_project) as session: + session.run(runner=mock_runner, pipeline_name=fake_pipeline_name) + session.run(runner=mock_runner, pipeline_name=fake_pipeline_name) record_data = { - "run_id": fake_session_id, + "session_id": fake_session_id, "project_path": fake_project.as_posix(), "env": mock_context.env, "kedro_version": kedro_version, @@ -532,13 +695,17 @@ def test_run( "load_versions": None, "extra_params": {}, "pipeline_name": fake_pipeline_name, + "namespace": None, + "runner": mock_runner.__name__, } mock_hook.before_pipeline_run.assert_called_once_with( - run_params=record_data, pipeline=mock_pipeline, catalog=mock_catalog + run_params=record_data, + pipeline=mock_pipeline, + catalog=mock_catalog, ) mock_runner.run.assert_called_once_with( - mock_pipeline, mock_catalog, fake_session_id + mock_pipeline, mock_catalog, session._hook_manager, fake_session_id ) mock_hook.after_pipeline_run.assert_called_once_with( run_params=record_data, @@ -548,8 +715,9 @@ def test_run( ) @pytest.mark.usefixtures("mock_settings_context_class") - def test_run_non_existent_pipeline(self, fake_project, mocker): - mock_runner = mocker.Mock() + def test_run_non_existent_pipeline( + self, fake_project, mock_package_name, mock_runner + ): pattern = ( "Failed to find the pipeline named 'doesnotexist'. " @@ -557,7 +725,7 @@ def test_run_non_existent_pipeline(self, fake_project, mocker): "by the 'register_pipelines' function." 
) with pytest.raises(ValueError, match=re.escape(pattern)): - with KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) as session: + with KedroSession.create(mock_package_name, fake_project) as session: session.run(runner=mock_runner, pipeline_name="doesnotexist") @pytest.mark.usefixtures("mock_settings_context_class") @@ -568,11 +736,13 @@ def test_run_exception( # pylint: disable=too-many-locals fake_session_id, fake_pipeline_name, mock_context_class, + mock_package_name, + mock_runner, mocker, ): - """Test exception being raise during the run""" + """Test exception being raised during the run""" mock_hook = mocker.patch( - "kedro.framework.session.session.get_hook_manager" + "kedro.framework.session.session._create_hook_manager" ).return_value.hook mock_pipelines = mocker.patch( "kedro.framework.session.session.pipelines", @@ -584,17 +754,16 @@ def test_run_exception( # pylint: disable=too-many-locals mock_context = mock_context_class.return_value mock_catalog = mock_context._get_catalog.return_value error = FakeException("You shall not pass!") - mock_runner = mocker.Mock() mock_runner.run.side_effect = error # runner.run() raises an error mock_pipeline = mock_pipelines.__getitem__.return_value.filter.return_value with pytest.raises(FakeException), KedroSession.create( - _FAKE_PACKAGE_NAME, fake_project + mock_package_name, fake_project ) as session: session.run(runner=mock_runner, pipeline_name=fake_pipeline_name) record_data = { - "run_id": fake_session_id, + "session_id": fake_session_id, "project_path": fake_project.as_posix(), "env": mock_context.env, "kedro_version": kedro_version, @@ -607,6 +776,8 @@ def test_run_exception( # pylint: disable=too-many-locals "load_versions": None, "extra_params": {}, "pipeline_name": fake_pipeline_name, + "namespace": None, + "runner": mock_runner.__name__, } mock_hook.on_pipeline_error.assert_called_once_with( @@ -623,14 +794,142 @@ def test_run_exception( # pylint: disable=too-many-locals assert exception["value"] == "You shall not pass!" 
assert exception["traceback"] + @pytest.mark.usefixtures("mock_settings_context_class") + @pytest.mark.parametrize("fake_pipeline_name", [None, _FAKE_PIPELINE_NAME]) + def test_run_broken_pipeline_multiple_times( # pylint: disable=too-many-locals + self, + fake_project, + fake_session_id, + fake_pipeline_name, + mock_context_class, + mock_package_name, + mock_runner, + mocker, + ): + """Test exception being raised during the first run and + a second run is allowed to be executed in the same session.""" + mock_hook = mocker.patch( + "kedro.framework.session.session._create_hook_manager" + ).return_value.hook + mock_pipelines = mocker.patch( + "kedro.framework.session.session.pipelines", + return_value={ + _FAKE_PIPELINE_NAME: mocker.Mock(), + "__default__": mocker.Mock(), + }, + ) + mock_context = mock_context_class.return_value + mock_catalog = mock_context._get_catalog.return_value + session = KedroSession.create(mock_package_name, fake_project) -@pytest.mark.usefixtures("mock_settings") -def test_setup_logging_using_absolute_path(fake_project, mocked_logging): - KedroSession.create(_FAKE_PACKAGE_NAME, fake_project) + broken_runner = mocker.patch( + "kedro.runner.SequentialRunner", + autospec=True, + ) + broken_runner.__name__ = "BrokenRunner" + error = FakeException("You shall not pass!") + broken_runner.run.side_effect = error # runner.run() raises an error + mock_pipeline = mock_pipelines.__getitem__.return_value.filter.return_value - mocked_logging.assert_called_once() - call_args = mocked_logging.call_args[0][0] + with pytest.raises(FakeException): + # Execute run with broken runner + session.run(runner=broken_runner, pipeline_name=fake_pipeline_name) - expected_log_filepath = (fake_project / "logs" / "info.log").as_posix() - actual_log_filepath = call_args["info_file_handler"]["filename"] - assert actual_log_filepath == expected_log_filepath + record_data = { + "session_id": fake_session_id, + "project_path": fake_project.as_posix(), + "env": mock_context.env, + "kedro_version": kedro_version, + "tags": None, + "from_nodes": None, + "to_nodes": None, + "node_names": None, + "from_inputs": None, + "to_outputs": None, + "load_versions": None, + "extra_params": {}, + "pipeline_name": fake_pipeline_name, + "namespace": None, + "runner": broken_runner.__name__, + } + + mock_hook.on_pipeline_error.assert_called_once_with( + error=error, + run_params=record_data, + pipeline=mock_pipeline, + catalog=mock_catalog, + ) + mock_hook.after_pipeline_run.assert_not_called() + + # Execute run another time with fixed runner + fixed_runner = mock_runner + session.run(runner=fixed_runner, pipeline_name=fake_pipeline_name) + + fixed_runner.run.assert_called_once_with( + mock_pipeline, mock_catalog, session._hook_manager, fake_session_id + ) + + record_data["runner"] = "MockRunner" + mock_hook.after_pipeline_run.assert_called_once_with( + run_params=record_data, + run_result=fixed_runner.run.return_value, + pipeline=mock_pipeline, + catalog=mock_catalog, + ) + + @pytest.mark.usefixtures("mock_settings_context_class") + def test_session_raise_error_with_invalid_runner_instance( + self, + fake_project, + mock_package_name, + mocker, + ): + mocker.patch( + "kedro.framework.session.session.pipelines", + return_value={ + "__default__": mocker.Mock(), + }, + ) + mock_runner_class = mocker.patch("kedro.runner.SequentialRunner") + + session = KedroSession.create(mock_package_name, fake_project) + with pytest.raises( + KedroSessionError, + match="KedroSession expect an instance of Runner instead of a class.", + 
): + # Execute run with SequentialRunner class instead of SequentialRunner() + session.run(runner=mock_runner_class) + + +@pytest.fixture +def fake_project_with_logging_file_handler(fake_project): + logging_config = { + "version": 1, + "handlers": {"info_file_handler": {"filename": "logs/info.log"}}, + } + logging_yml = fake_project / "conf" / "base" / "logging.yml" + logging_yml.write_text(yaml.dump(logging_config)) + return fake_project + + +def get_all_values(mapping: Mapping): + for value in mapping.values(): + yield value + if isinstance(value, Mapping): + yield from get_all_values(value) + + +@pytest.mark.parametrize("params", ["a=1,b.c=2", "a=1,b=2,c=3", ""]) +def test_no_DictConfig_in_store( + params, + mock_package_name, + fake_project, +): + extra_params = _split_params(None, None, params) + session = KedroSession.create( + mock_package_name, fake_project, extra_params=extra_params + ) + + assert not any( + OmegaConf.is_config(value) for value in get_all_values(session._store) + ) diff --git a/tests/framework/session/test_session_extension_hooks.py b/tests/framework/session/test_session_extension_hooks.py index ea4e5da304..75f3568a5e 100644 --- a/tests/framework/session/test_session_extension_hooks.py +++ b/tests/framework/session/test_session_extension_hooks.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
import logging import re import sys @@ -36,11 +9,17 @@ from dynaconf.validator import Validator from kedro.framework.context.context import _convert_paths_to_absolute_posix -from kedro.framework.hooks import hook_impl -from kedro.framework.project import _ProjectPipelines, _ProjectSettings +from kedro.framework.hooks import _create_hook_manager, hook_impl +from kedro.framework.hooks.manager import _register_hooks, _register_hooks_setuptools +from kedro.framework.project import ( + _ProjectPipelines, + _ProjectSettings, + pipelines, + settings, +) from kedro.framework.session import KedroSession from kedro.io import DataCatalog, MemoryDataSet -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node, pipeline from kedro.pipeline.node import Node from kedro.runner import ParallelRunner from kedro.runner.runner import _run_node_async @@ -56,6 +35,7 @@ ) logger = logging.getLogger("tests.framework.session.conftest") +logger.setLevel(logging.DEBUG) def broken_node(): @@ -64,7 +44,7 @@ def broken_node(): @pytest.fixture def broken_pipeline(): - return Pipeline( + return pipeline( [ node(broken_node, None, "A", name="node1"), node(broken_node, None, "B", name="node2"), @@ -86,31 +66,12 @@ def mock_get_pipelines_registry_callable(): return mock_get_pipelines_registry_callable() -@pytest.fixture -def mock_pipelines(mocker, mock_pipeline): - def mock_get_pipelines_registry_callable(): - return { - "__default__": mock_pipeline, - "pipe": mock_pipeline, - } - - mocker.patch.object( - _ProjectPipelines, - "_get_pipelines_registry_callable", - return_value=mock_get_pipelines_registry_callable, - ) - return mock_get_pipelines_registry_callable() - - class TestCatalogHooks: - def test_after_catalog_created_hook(self, mocker, mock_session, caplog): + def test_after_catalog_created_hook(self, mock_session, caplog): context = mock_session.load_context() - fake_run_id = mocker.sentinel.fake_run_id - mocker.patch.object(context, "_get_run_id", return_value=fake_run_id) - project_path = context.project_path catalog = context.catalog - config_loader = context.config_loader + config_loader = mock_session._get_config_loader() relevant_records = [ r for r in caplog.records if r.getMessage() == "Catalog created" @@ -125,24 +86,29 @@ def test_after_catalog_created_hook(self, mocker, mock_session, caplog): # save_version is only passed during a run, not on the property getter assert record.save_version is None assert record.load_versions is None - assert record.run_id is fake_run_id - def test_after_catalog_created_hook_default_run_id( + def test_after_catalog_created_hook_on_session_run( self, mocker, mock_session, dummy_dataframe, caplog ): context = mock_session.load_context() fake_save_version = mocker.sentinel.fake_save_version - mocker.patch.object( - context, "_get_save_version", return_value=fake_save_version + + mocker.patch( + "kedro.framework.session.KedroSession.store", + new_callable=mocker.PropertyMock, + return_value={ + "session_id": fake_save_version, + "save_version": fake_save_version, + }, ) catalog = context.catalog - config_loader = context.config_loader + config_loader = mock_session._get_config_loader() project_path = context.project_path catalog.save("cars", dummy_dataframe) catalog.save("boats", dummy_dataframe) - context.run() + mock_session.run() relevant_records = [ r for r in caplog.records if r.getMessage() == "Catalog created" @@ -156,7 +122,6 @@ def test_after_catalog_created_hook_default_run_id( ) assert record.save_version is fake_save_version assert 
record.load_versions is None - assert record.run_id is record.save_version class TestPipelineHooks: @@ -166,7 +131,7 @@ def test_before_and_after_pipeline_run_hooks( ): context = mock_session.load_context() catalog = context.catalog - default_pipeline = context.pipeline + default_pipeline = pipelines["__default__"] catalog.save("cars", dummy_dataframe) catalog.save("boats", dummy_dataframe) mock_session.run() @@ -226,7 +191,8 @@ def test_on_node_error_hook_sequential_runner(self, caplog, mock_session): assert len(on_node_error_calls) == 1 call_record = on_node_error_calls[0] _assert_hook_call_record_has_expected_parameters( - call_record, ["error", "node", "catalog", "inputs", "is_async", "run_id"] + call_record, + ["error", "node", "catalog", "inputs", "is_async", "session_id"], ) expected_error = ValueError("broken") assert_exceptions_equal(call_record.error, expected_error) @@ -249,11 +215,10 @@ def test_before_and_after_node_run_hooks_sequential_runner( assert len(before_node_run_calls) == 1 call_record = before_node_run_calls[0] _assert_hook_call_record_has_expected_parameters( - call_record, ["node", "catalog", "inputs", "is_async", "run_id"] + call_record, ["node", "catalog", "inputs", "is_async", "session_id"] ) # sanity check a couple of important parameters assert call_record.inputs["cars"].to_dict() == dummy_dataframe.to_dict() - assert call_record.run_id == mock_session.session_id # test after node run hook after_node_run_calls = [ @@ -262,11 +227,11 @@ def test_before_and_after_node_run_hooks_sequential_runner( assert len(after_node_run_calls) == 1 call_record = after_node_run_calls[0] _assert_hook_call_record_has_expected_parameters( - call_record, ["node", "catalog", "inputs", "outputs", "is_async", "run_id"] + call_record, + ["node", "catalog", "inputs", "outputs", "is_async", "session_id"], ) # sanity check a couple of important parameters assert call_record.outputs["planes"].to_dict() == dummy_dataframe.to_dict() - assert call_record.run_id == mock_session.session_id @SKIP_ON_WINDOWS @pytest.mark.usefixtures("mock_broken_pipelines") @@ -285,7 +250,7 @@ def test_on_node_error_hook_parallel_runner(self, mock_session, logs_listener): for call_record in on_node_error_records: _assert_hook_call_record_has_expected_parameters( call_record, - ["error", "node", "catalog", "inputs", "is_async", "run_id"], + ["error", "node", "catalog", "inputs", "is_async", "session_id"], ) expected_error = ValueError("broken") assert_exceptions_equal(call_record.error, expected_error) @@ -535,8 +500,8 @@ def test_broken_input_update( catalog.save("boats", dummy_dataframe) pattern = ( - "`before_node_run` must return either None or a dictionary " - "mapping dataset names to updated values, got `MockDatasetReplacement`" + "'before_node_run' must return either None or a dictionary " + "mapping dataset names to updated values, got 'MockDatasetReplacement'" ) with pytest.raises(TypeError, match=re.escape(pattern)): mock_session_with_broken_before_node_run_hooks.run() @@ -551,22 +516,35 @@ def test_broken_input_update_parallel( catalog.save("boats", dummy_dataframe) pattern = ( - "`before_node_run` must return either None or a dictionary " - "mapping dataset names to updated values, got `MockDatasetReplacement`" + "'before_node_run' must return either None or a dictionary " + "mapping dataset names to updated values, got 'MockDatasetReplacement'" ) with pytest.raises(TypeError, match=re.escape(pattern)): mock_session_with_broken_before_node_run_hooks.run(runner=ParallelRunner()) +def 
wait_and_identity(*args: Any): + time.sleep(0.1) + if len(args) == 1: + return args[0] + return args + + @pytest.fixture def sample_node(): - def wait_and_identity(x: Any): - time.sleep(0.1) - return x - return node(wait_and_identity, inputs="ds1", outputs="ds2", name="test-node") +@pytest.fixture +def sample_node_multiple_outputs(): + return node( + wait_and_identity, + inputs=["ds1", "ds2"], + outputs=["ds3", "ds4"], + name="test-node", + ) + + class LogCatalog(DataCatalog): def load(self, name: str, version: str = None) -> Any: dataset = super().load(name=name, version=version) @@ -578,7 +556,17 @@ def load(self, name: str, version: str = None) -> Any: def memory_catalog(): ds1 = MemoryDataSet({"data": 42}) ds2 = MemoryDataSet({"data": 42}) - return LogCatalog({"ds1": ds1, "ds2": ds2}) + ds3 = MemoryDataSet({"data": 42}) + ds4 = MemoryDataSet({"data": 42}) + return LogCatalog({"ds1": ds1, "ds2": ds2, "ds3": ds3, "ds4": ds4}) + + +@pytest.fixture +def hook_manager(): + hook_manager = _create_hook_manager() + _register_hooks(hook_manager, settings.HOOKS) + _register_hooks_setuptools(hook_manager, settings.DISABLE_HOOKS_FOR_PLUGINS) + return hook_manager class TestAsyncNodeDatasetHooks: @@ -590,7 +578,11 @@ def test_after_dataset_load_hook_async( mock_session.load_context() # run the node asynchronously with an instance of `LogCatalog` - _run_node_async(node=sample_node, catalog=memory_catalog) + _run_node_async( + node=sample_node, + catalog=memory_catalog, + hook_manager=mock_session._hook_manager, + ) hooks_log_messages = [r.message for r in logs_listener.logs] @@ -598,3 +590,50 @@ def test_after_dataset_load_hook_async( assert str( ["Before dataset loaded", "Catalog load", "After dataset loaded"] ).strip("[]") in str(hooks_log_messages).strip("[]") + + def test_after_dataset_load_hook_async_multiple_outputs( + self, + mocker, + memory_catalog, + hook_manager, + sample_node_multiple_outputs, + ): + after_dataset_saved_mock = mocker.patch.object( + hook_manager.hook, "after_dataset_saved" + ) + + _run_node_async( + node=sample_node_multiple_outputs, + catalog=memory_catalog, + hook_manager=hook_manager, + ) + + after_dataset_saved_mock.assert_has_calls( + [ + mocker.call( + dataset_name="ds3", + data={"data": 42}, + node=sample_node_multiple_outputs, + ), + mocker.call( + dataset_name="ds4", + data={"data": 42}, + node=sample_node_multiple_outputs, + ), + ], + any_order=True, + ) + assert after_dataset_saved_mock.call_count == 2 + + +class TestKedroContextSpecsHook: + """Test the behavior of `after_context_created` when updating node inputs.""" + + def test_after_context_created_hook(self, mock_session, caplog): + context = mock_session.load_context() + relevant_records = [ + r for r in caplog.records if r.getMessage() == "After context created" + ] + assert len(relevant_records) == 1 + record = relevant_records[0] + assert record.context is context diff --git a/tests/framework/session/test_session_hook_manager.py b/tests/framework/session/test_session_hook_manager.py index c05f208f0f..7d67b4d05c 100644 --- a/tests/framework/session/test_session_hook_manager.py +++ b/tests/framework/session/test_session_hook_manager.py @@ -1,36 +1,10 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. +import logging from collections import namedtuple import pytest from dynaconf.validator import Validator -from kedro.framework.hooks.manager import _register_hooks, get_hook_manager +from kedro.framework.hooks.manager import _register_hooks from kedro.framework.project import _ProjectSettings from kedro.framework.session import KedroSession from tests.framework.session.conftest import _mock_imported_settings_paths @@ -62,19 +36,16 @@ class MockSettings(_ProjectSettings): class TestSessionHookManager: """Test the process of registering hooks with the hook manager in a session.""" - def test_assert_register_hooks(self, request, project_hooks): - hook_manager = get_hook_manager() - assert not hook_manager.is_registered(project_hooks) - - # call the fixture to construct the session - request.getfixturevalue("mock_session") - + @pytest.mark.nologreset + def test_assert_register_hooks(self, project_hooks, mock_session): + hook_manager = mock_session._hook_manager assert hook_manager.is_registered(project_hooks) @pytest.mark.usefixtures("mock_session") - def test_calling_register_hooks_twice(self, project_hooks): + @pytest.mark.nologreset + def test_calling_register_hooks_twice(self, project_hooks, mock_session): """Calling hook registration multiple times should not raise""" - hook_manager = get_hook_manager() + hook_manager = mock_session._hook_manager assert hook_manager.is_registered(project_hooks) _register_hooks(hook_manager, (project_hooks,)) @@ -82,22 +53,23 @@ def test_calling_register_hooks_twice(self, project_hooks): assert hook_manager.is_registered(project_hooks) @pytest.mark.parametrize("num_plugins", [0, 1]) + @pytest.mark.nologreset def test_hooks_registered_when_session_created( self, mocker, request, caplog, project_hooks, num_plugins ): - hook_manager = get_hook_manager() - assert not hook_manager.get_plugins() - - load_setuptools_entrypoints = mocker.patch.object( - hook_manager, "load_setuptools_entrypoints", return_value=num_plugins + caplog.set_level(logging.DEBUG, logger="kedro") + load_setuptools_entrypoints = mocker.patch( + "pluggy._manager.PluginManager.load_setuptools_entrypoints", + return_value=num_plugins, ) distinfo = [("plugin_obj_1", MockDistInfo("test-project-a", "0.1"))] - list_distinfo_mock = mocker.patch.object( - 
hook_manager, "list_plugin_distinfo", return_value=distinfo + list_distinfo_mock = mocker.patch( + "pluggy._manager.PluginManager.list_plugin_distinfo", return_value=distinfo ) # call a fixture which creates a session - request.getfixturevalue("mock_session") + session = request.getfixturevalue("mock_session") + hook_manager = session._hook_manager assert hook_manager.is_registered(project_hooks) load_setuptools_entrypoints.assert_called_once_with("kedro.hooks") @@ -112,6 +84,7 @@ def test_hooks_registered_when_session_created( assert expected_msg in log_messages @pytest.mark.usefixtures("mock_settings_with_disabled_hooks") + @pytest.mark.nologreset def test_disabling_auto_discovered_hooks( self, mocker, @@ -121,22 +94,25 @@ def test_disabling_auto_discovered_hooks( naughty_plugin, good_plugin, ): - hook_manager = get_hook_manager() - assert not hook_manager.get_plugins() + caplog.set_level(logging.DEBUG, logger="kedro") distinfo = [("plugin_obj_1", naughty_plugin), ("plugin_obj_2", good_plugin)] - list_distinfo_mock = mocker.patch.object( - hook_manager, "list_plugin_distinfo", return_value=distinfo + mocked_distinfo = mocker.patch( + "pluggy._manager.PluginManager.list_plugin_distinfo", return_value=distinfo ) - mocker.patch.object( - hook_manager, "load_setuptools_entrypoints", return_value=len(distinfo) + + mocker.patch( + "pluggy._manager.PluginManager.load_setuptools_entrypoints", + return_value=len(distinfo), ) - unregister_mock = mocker.patch.object(hook_manager, "unregister") + unregister_mock = mocker.patch("pluggy._manager.PluginManager.unregister") + # create a session that will use the mock_settings_with_disabled_hooks from the fixture. KedroSession.create( mock_package_name, tmp_path, extra_params={"params:key": "value"} ) - list_distinfo_mock.assert_called_once_with() + + mocked_distinfo.assert_called_once_with() unregister_mock.assert_called_once_with(plugin=distinfo[0][0]) # check the logs diff --git a/tests/framework/session/test_session_registration_hooks.py b/tests/framework/session/test_session_registration_hooks.py deleted file mode 100644 index 939ef4ad2a..0000000000 --- a/tests/framework/session/test_session_registration_hooks.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import re -from typing import Any, Dict, Optional - -import pytest -from dynaconf.validator import Validator - -from kedro.framework.context import KedroContextError -from kedro.framework.hooks import hook_impl -from kedro.framework.project import _ProjectSettings -from kedro.framework.session import KedroSession -from kedro.io import DataCatalog -from kedro.versioning import Journal -from tests.framework.session.conftest import ( - _assert_hook_call_record_has_expected_parameters, - _mock_imported_settings_paths, -) - -logger = logging.getLogger(__name__) - - -@pytest.fixture -def pipeline_registration_hook(mock_pipeline): - class PipelineHook: - @hook_impl - def register_pipelines(self): - logger.info("Registering pipelines") - return {"__default__": mock_pipeline} - - return PipelineHook() - - -def _mock_settings_with_hooks(mocker, hooks): - class MockSettings(_ProjectSettings): - _HOOKS = Validator("HOOKS", default=hooks) - - return _mock_imported_settings_paths(mocker, MockSettings()) - - -@pytest.fixture -def mock_settings_with_pipeline_hooks( - mocker, project_hooks, pipeline_registration_hook -): - return _mock_settings_with_hooks( - mocker, hooks=(project_hooks, pipeline_registration_hook) - ) - - -@pytest.fixture -def mock_settings_duplicate_hooks(mocker, project_hooks, pipeline_registration_hook): - return _mock_settings_with_hooks( - mocker, - hooks=(project_hooks, pipeline_registration_hook, pipeline_registration_hook), - ) - - -class RequiredRegistrationHooks: - """Mandatory registration hooks""" - - @hook_impl - def register_catalog( - self, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: - return DataCatalog.from_config( # pragma: no cover - catalog, credentials, load_versions, save_version, journal - ) - - -@pytest.fixture -def mock_settings_broken_catalog_hooks(mocker): - class BrokenCatalogHooks(RequiredRegistrationHooks): - @hook_impl - def register_catalog(self): # pylint: disable=arguments-differ - return None - - return _mock_settings_with_hooks(mocker, hooks=(BrokenCatalogHooks(),)) - - -@pytest.fixture -def mock_session( - mock_settings_with_pipeline_hooks, mock_package_name, tmp_path -): # pylint: disable=unused-argument - return KedroSession.create( - mock_package_name, tmp_path, extra_params={"params:key": "value"} - ) - - -class TestRegistrationHooks: - def test_register_pipelines_is_called( - self, dummy_dataframe, caplog, mock_session, mock_pipeline - ): - context = mock_session.load_context() - catalog = context.catalog - catalog.save("cars", dummy_dataframe) - catalog.save("boats", dummy_dataframe) - mock_session.run() - - register_pipelines_calls = [ - record - for record in caplog.records - if record.funcName == "register_pipelines" - ] - assert len(register_pipelines_calls) == 1 - call_record = register_pipelines_calls[0] - assert call_record.getMessage() == "Registering pipelines" - _assert_hook_call_record_has_expected_parameters(call_record, []) - - expected_pipelines = { - "__default__": mock_pipeline, 
- "pipe": mock_pipeline, - } - assert context.pipelines == expected_pipelines - - def test_register_catalog_is_called(self, mock_session, caplog): - context = mock_session.load_context() - catalog = context.catalog - assert isinstance(catalog, DataCatalog) - - relevant_records = [ - r for r in caplog.records if r.getMessage() == "Registering catalog" - ] - assert len(relevant_records) == 1 - - record = relevant_records[0] - assert record.catalog.keys() == {"cars", "boats"} - assert record.credentials == {"dev_s3": "foo"} - # save_version is only passed during a run, not on the property getter - assert record.save_version is None - assert record.load_versions is None - assert record.journal is None - - -class TestDuplicatePipelineRegistration: - """Test to make sure that if pipelines are defined in both registration hooks - and pipelines_registry, they are deduplicated and a warning is displayed. - """ - - @pytest.mark.usefixtures("mock_settings_duplicate_hooks") - def test_register_pipelines_with_duplicate_entries( - self, tmp_path, mock_package_name, mock_pipeline - ): - session = KedroSession.create(mock_package_name, tmp_path) - context = session.load_context() - # check that all pipeline dictionaries merged together correctly - expected_pipelines = {key: mock_pipeline for key in ("__default__", "pipe")} - pattern = ( - "Found duplicate pipeline entries. The following " - "will be overwritten: __default__" - ) - with pytest.warns(UserWarning, match=re.escape(pattern)): - assert context.pipelines == expected_pipelines - - -class TestBrokenRegistrationHooks: - @pytest.mark.usefixtures("mock_settings_broken_catalog_hooks") - def test_broken_register_catalog_hook(self, tmp_path, mock_package_name): - pattern = "Expected an instance of `DataCatalog`, got `NoneType` instead." - with KedroSession.create(mock_package_name, tmp_path) as session: - context = session.load_context() - with pytest.raises(KedroContextError, match=re.escape(pattern)): - _ = context.catalog diff --git a/tests/framework/session/test_store.py b/tests/framework/session/test_store.py index 1782e0c43c..fa728271e7 100644 --- a/tests/framework/session/test_store.py +++ b/tests/framework/session/test_store.py @@ -1,36 +1,10 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. import logging from pathlib import Path import pytest -from kedro.framework.session.store import BaseSessionStore, ShelveStore +from kedro.framework.session.shelvestore import ShelveStore +from kedro.framework.session.store import BaseSessionStore FAKE_SESSION_ID = "fake_session_id" STORE_LOGGER_NAME = "kedro.framework.session.store" @@ -38,38 +12,42 @@ class TestBaseStore: def test_init(self, caplog): + caplog.set_level(logging.DEBUG, logger="kedro") + path = "fake_path" store = BaseSessionStore(path, FAKE_SESSION_ID) - assert store == dict() + assert store == {} assert store._path == path assert store._session_id == FAKE_SESSION_ID - expected_log_messages = [ - "`read()` not implemented for `BaseSessionStore`. Assuming empty store." + expected_debug_messages = [ + "'read()' not implemented for 'BaseSessionStore'. Assuming empty store." ] - actual_log_messages = [ + actual_debug_messages = [ rec.getMessage() for rec in caplog.records - if rec.name == STORE_LOGGER_NAME and rec.levelno == logging.INFO + if rec.name == STORE_LOGGER_NAME and rec.levelno == logging.DEBUG ] - assert actual_log_messages == expected_log_messages + assert actual_debug_messages == expected_debug_messages def test_save(self, caplog): + caplog.set_level(logging.DEBUG, logger="kedro") + path = "fake_path" store = BaseSessionStore(path, FAKE_SESSION_ID) store.save() - assert store == dict() + assert store == {} - expected_log_messages = [ - "`read()` not implemented for `BaseSessionStore`. Assuming empty store.", - "`save()` not implemented for `BaseSessionStore`. Skipping the step.", + expected_debug_messages = [ + "'read()' not implemented for 'BaseSessionStore'. Assuming empty store.", + "'save()' not implemented for 'BaseSessionStore'. Skipping the step.", ] - actual_log_messages = [ + actual_debug_messages = [ rec.getMessage() for rec in caplog.records - if rec.name == STORE_LOGGER_NAME and rec.levelno == logging.INFO + if rec.name == STORE_LOGGER_NAME and rec.levelno == logging.DEBUG ] - assert actual_log_messages == expected_log_messages + assert actual_debug_messages == expected_debug_messages @pytest.fixture @@ -80,7 +58,7 @@ def shelve_path(tmp_path): class TestShelveStore: def test_empty(self, shelve_path): shelve = ShelveStore(str(shelve_path), FAKE_SESSION_ID) - assert shelve == dict() + assert shelve == {} assert shelve._location == shelve_path / FAKE_SESSION_ID / "store" assert not shelve_path.exists() diff --git a/tests/framework/test_startup.py b/tests/framework/test_startup.py index e731be5757..f3a127d72a 100644 --- a/tests/framework/test_startup.py +++ b/tests/framework/test_startup.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. import os import re import sys @@ -60,18 +33,23 @@ def test_toml_invalid_format(self, tmp_path): def test_non_kedro_project(self, mocker): mocker.patch.object(Path, "is_file", return_value=True) - pyproject_toml_payload = {"tool": {}} - mocker.patch("anyconfig.load", return_value=pyproject_toml_payload) + mocker.patch.object(Path, "read_text", return_value="[tool]") assert not _is_project(self.project_path) def test_valid_toml_file(self, mocker): mocker.patch.object(Path, "is_file", return_value=True) - pyproject_toml_payload = {"tool": {"kedro": {}}} - mocker.patch("anyconfig.load", return_value=pyproject_toml_payload) + pyproject_toml_payload = "[tool.kedro]" # \nproject_name = 'proj'" + mocker.patch.object(Path, "read_text", return_value=pyproject_toml_payload) assert _is_project(self.project_path) + def test_toml_bad_encoding(self, mocker): + mocker.patch.object(Path, "is_file", return_value=True) + mocker.patch.object(Path, "read_text", side_effect=UnicodeDecodeError) + + assert not _is_project(self.project_path) + class TestGetProjectMetadata: project_path = Path.cwd() @@ -95,6 +73,33 @@ def test_toml_invalid_format(self, tmp_path): _get_project_metadata(str(tmp_path)) def test_valid_toml_file(self, mocker): + mocker.patch.object(Path, "is_file", return_value=True) + pyproject_toml_payload = { + "tool": { + "kedro": { + "package_name": "fake_package_name", + "project_name": "fake_project_name", + "kedro_init_version": kedro_version, + } + } + } + mocker.patch("anyconfig.load", return_value=pyproject_toml_payload) + + actual = _get_project_metadata(self.project_path) + + expected = ProjectMetadata( + source_dir=self.project_path / "src", # default + config_file=self.project_path / "pyproject.toml", + package_name="fake_package_name", + project_name="fake_project_name", + project_version=kedro_version, + kedro_init_version=kedro_version, + project_path=self.project_path, + ) + assert actual == expected + + # Temporary test for coverage to be removed in 0.19.0 when project_version is removed + def test_valid_toml_file_with_project_version(self, mocker): mocker.patch.object(Path, "is_file", return_value=True) pyproject_toml_payload = { "tool": { @@ -115,6 +120,7 @@ def test_valid_toml_file(self, mocker): package_name="fake_package_name", project_name="fake_project_name", project_version=kedro_version, + kedro_init_version=kedro_version, project_path=self.project_path, ) assert actual == expected @@ -126,7 +132,7 @@ def test_toml_file_with_extra_keys(self, mocker): "kedro": { "package_name": "fake_package_name", "project_name": "fake_project_name", - 
"project_version": kedro_version, + "kedro_init_version": kedro_version, "unexpected_key": "hello", } } @@ -135,7 +141,7 @@ def test_toml_file_with_extra_keys(self, mocker): pattern = ( "Found unexpected keys in 'pyproject.toml'. Make sure it " "only contains the following keys: ['package_name', " - "'project_name', 'project_version', 'source_dir']." + "'project_name', 'kedro_init_version', 'source_dir']." ) with pytest.raises(RuntimeError, match=re.escape(pattern)): @@ -145,7 +151,10 @@ def test_toml_file_has_missing_mandatory_keys(self, mocker): mocker.patch.object(Path, "is_file", return_value=True) pyproject_toml_payload = { "tool": { - "kedro": {"project_version": kedro_version, "unexpected_key": "hello"} + "kedro": { + "kedro_init_version": kedro_version, + "unexpected_key": "hello", + } } } mocker.patch("anyconfig.load", return_value=pyproject_toml_payload) @@ -175,7 +184,7 @@ def test_source_dir_specified_in_toml(self, mocker): "source_dir": source_dir, "package_name": "fake_package_name", "project_name": "fake_project_name", - "project_version": kedro_version, + "kedro_init_version": kedro_version, } } } @@ -189,6 +198,31 @@ def test_source_dir_specified_in_toml(self, mocker): "invalid_version", ["0.13.0", "10.0", "101.1", "100.0", "-0"] ) def test_invalid_version(self, invalid_version, mocker): + mocker.patch.object(Path, "is_file", return_value=True) + pyproject_toml_payload = { + "tool": { + "kedro": { + "source_dir": "source_dir", + "package_name": "fake_package_name", + "project_name": "fake_project_name", + "kedro_init_version": invalid_version, + } + } + } + mocker.patch("anyconfig.load", return_value=pyproject_toml_payload) + + pattern = ( + f"Your Kedro project version {invalid_version} does not match " + f"Kedro package version {kedro_version} you are running." + ) + with pytest.raises(ValueError, match=re.escape(pattern)): + _get_project_metadata(self.project_path) + + # Temporary test for coverage to be removed in 0.19.0 when project_version is removed + @pytest.mark.parametrize( + "invalid_version", ["0.13.0", "10.0", "101.1", "100.0", "-0"] + ) + def test_invalid_version_for_kedro_version(self, invalid_version, mocker): mocker.patch.object(Path, "is_file", return_value=True) pyproject_toml_payload = { "tool": { @@ -209,6 +243,23 @@ def test_invalid_version(self, invalid_version, mocker): with pytest.raises(ValueError, match=re.escape(pattern)): _get_project_metadata(self.project_path) + def test_toml_file_has_missing_version(self, mocker): + mocker.patch.object(Path, "is_file", return_value=True) + pyproject_toml_payload = { + "tool": { + "kedro": { + "source_dir": "source_dir", + "package_name": "fake_package_name", + "project_name": "fake_project_name", + } + } + } + mocker.patch("anyconfig.load", return_value=pyproject_toml_payload) + pattern = "Missing required key kedro_init_version from 'pyproject.toml'." 
+ + with pytest.raises(RuntimeError, match=re.escape(pattern)): + _get_project_metadata(self.project_path) + class TestValidateSourcePath: @pytest.mark.parametrize( @@ -241,16 +292,14 @@ def test_non_existent_source_path(self, tmp_path): class TestBootstrapProject: - def test_bootstrap_project(self, mocker, monkeypatch, tmp_path): + def test_bootstrap_project(self, monkeypatch, tmp_path): monkeypatch.delenv("PYTHONPATH", raising=False) - # assume settings.py is okay - mocker.patch("kedro.framework.project._validate_module") pyproject_toml_payload = { "tool": { "kedro": { "package_name": "fake_package_name", "project_name": "fake_project_name", - "project_version": kedro_version, + "kedro_init_version": kedro_version, } } } @@ -267,6 +316,7 @@ def test_bootstrap_project(self, mocker, monkeypatch, tmp_path): "project_name": "fake_project_name", "project_path": tmp_path, "project_version": kedro_version, + "kedro_init_version": kedro_version, "source_dir": src_dir, } assert result == ProjectMetadata(**expected_metadata) diff --git a/tests/io/test_cached_dataset.py b/tests/io/test_cached_dataset.py index b1a49e808c..2d8145318a 100644 --- a/tests/io/test_cached_dataset.py +++ b/tests/io/test_cached_dataset.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
import pickle from io import StringIO @@ -32,44 +5,44 @@ import yaml from kedro.extras.datasets.pandas import CSVDataSet -from kedro.io import CachedDataSet, DataCatalog, DataSetError, MemoryDataSet +from kedro.io import CachedDataset, DataCatalog, DatasetError, MemoryDataset YML_CONFIG = """ test_ds: - type: CachedDataSet + type: CachedDataset dataset: - type: kedro.extras.datasets.pandas.CSVDataSet - filepath: example.csv + type: kedro.extras.datasets.pandas.CSVDataSet + filepath: example.csv """ YML_CONFIG_VERSIONED = """ test_ds: - type: CachedDataSet + type: CachedDataset versioned: true dataset: - type: kedro.extras.datasets.pandas.CSVDataSet - filepath: example.csv + type: kedro.extras.datasets.pandas.CSVDataSet + filepath: example.csv """ YML_CONFIG_VERSIONED_BAD = """ test_ds: - type: CachedDataSet + type: CachedDataset dataset: - type: kedro.extras.datasets.pandas.CSVDataSet - filepath: example.csv - versioned: true + type: kedro.extras.datasets.pandas.CSVDataSet + filepath: example.csv + versioned: true """ @pytest.fixture def cached_ds(): - wrapped = MemoryDataSet() - return CachedDataSet(wrapped) + wrapped = MemoryDataset() + return CachedDataset(wrapped) class TestCachedDataset: def test_load_empty(self, cached_ds): - with pytest.raises(DataSetError, match=r"has not been saved yet"): + with pytest.raises(DatasetError, match=r"has not been saved yet"): _ = cached_ds.load() def test_save_load(self, cached_ds): @@ -77,11 +50,11 @@ def test_save_load(self, cached_ds): assert cached_ds.load() == 42 def test_save_load_caching(self, mocker): - wrapped = MemoryDataSet(-42) + wrapped = MemoryDataset(-42) mocker.spy(wrapped, "load") mocker.spy(wrapped, "save") - cached_ds = CachedDataSet(wrapped) + cached_ds = CachedDataset(wrapped) mocker.spy(cached_ds._cache, "save") mocker.spy(cached_ds._cache, "load") @@ -93,10 +66,10 @@ def test_save_load_caching(self, mocker): assert cached_ds._cache.save.call_count == 1 # pylint: disable=no-member def test_load_empty_cache(self, mocker): - wrapped = MemoryDataSet(-42) + wrapped = MemoryDataset(-42) mocker.spy(wrapped, "load") - cached_ds = CachedDataSet(wrapped) + cached_ds = CachedDataset(wrapped) mocker.spy(cached_ds._cache, "load") assert cached_ds.load() == -42 @@ -119,11 +92,11 @@ def test_from_yaml(self, mocker): def test_bad_argument(self): with pytest.raises( ValueError, - match=r"The argument type of `dataset` " + match=r"The argument type of 'dataset' " r"should be either a dict/YAML representation " r"of the dataset, or the actual dataset object", ): - _ = CachedDataSet(dataset="BadArgument") + _ = CachedDataset(dataset="BadArgument") def test_config_good_version(self): config = yaml.safe_load(StringIO(YML_CONFIG_VERSIONED)) @@ -133,9 +106,9 @@ def test_config_good_version(self): def test_config_bad_version(self): config = yaml.safe_load(StringIO(YML_CONFIG_VERSIONED_BAD)) with pytest.raises( - DataSetError, + DatasetError, match=r"Cached datasets should specify that they are " - r"versioned in the `CachedDataSet`, not in the " + r"versioned in the 'CachedDataset', not in the " r"wrapped dataset", ): _ = DataCatalog.from_config(config, load_versions={"test_ds": "42"}) @@ -151,7 +124,7 @@ def test_pickle(self, cached_ds, caplog): def test_str(self): assert ( - str(CachedDataSet(MemoryDataSet(42))) == "CachedDataSet(cache={}, " + str(CachedDataset(MemoryDataset(42))) == "CachedDataset(cache={}, " "dataset={'data': })" ) @@ -159,11 +132,11 @@ def test_release(self, cached_ds): cached_ds.save(5) cached_ds.release() with pytest.raises( - 
DataSetError, match=r"Data for MemoryDataSet has not been saved yet" + DatasetError, match=r"Data for MemoryDataset has not been saved yet" ): _ = cached_ds.load() def test_copy_mode(self, mocker): - mocked_memory_data_set = mocker.patch("kedro.io.cached_dataset.MemoryDataSet") - CachedDataSet(MemoryDataSet(), copy_mode="assign") - mocked_memory_data_set.assert_called_once_with(copy_mode="assign") + mocked_memory_dataset = mocker.patch("kedro.io.cached_dataset.MemoryDataset") + CachedDataset(MemoryDataset(), copy_mode="assign") + mocked_memory_dataset.assert_called_once_with(copy_mode="assign") diff --git a/tests/io/test_core.py b/tests/io/test_core.py index 9c5f9c0065..05a3204639 100644 --- a/tests/io/test_core.py +++ b/tests/io/test_core.py @@ -1,43 +1,23 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. +from __future__ import annotations +import importlib from decimal import Decimal from fractions import Fraction from pathlib import PurePosixPath -from typing import Any, List +from typing import Any import pytest -from kedro.io.core import AbstractDataSet, _parse_filepath, get_filepath_str +from kedro.io.core import ( + _DEPRECATED_ERROR_CLASSES, + AbstractDataSet, + _parse_filepath, + get_filepath_str, +) # List sourced from https://docs.python.org/3/library/stdtypes.html#truth-value-testing. # Excludes None, as None values are not shown in the str representation. 
-FALSE_BUILTINS: List[Any] = [ +FALSE_BUILTINS: list[Any] = [ False, 0, 0.0, @@ -53,12 +33,19 @@ ] +@pytest.mark.parametrize("module_name", ["kedro.io", "kedro.io.core"]) +@pytest.mark.parametrize("class_name", _DEPRECATED_ERROR_CLASSES) +def test_deprecation(module_name, class_name): + with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + getattr(importlib.import_module(module_name), class_name) + + class MyDataSet(AbstractDataSet): def __init__(self, var=None): self.var = var def _describe(self): - return dict(var=self.var) + return {"var": self.var} def _load(self): pass # pragma: no cover @@ -92,6 +79,17 @@ def test_get_filepath_str(self): ("gs://bucket/file.txt", {"protocol": "gs", "path": "bucket/file.txt"}), ("adl://bucket/file.txt", {"protocol": "adl", "path": "bucket/file.txt"}), ("abfs://bucket/file.txt", {"protocol": "abfs", "path": "bucket/file.txt"}), + ( + "abfss://bucket/file.txt", + {"protocol": "abfss", "path": "bucket/file.txt"}, + ), + ( + "abfss://mycontainer@mystorageaccount.dfs.core.windows.net/mypath", + { + "protocol": "abfss", + "path": "mycontainer@mystorageaccount.dfs.core.windows.net/mypath", + }, + ), ( "hdfs://namenode:8020/file.txt", {"protocol": "hdfs", "path": "/file.txt"}, diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 8baabf882e..45f64804ba 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
import logging import re from copy import deepcopy @@ -40,14 +13,19 @@ from kedro.io import ( AbstractDataSet, DataCatalog, - DataSetAlreadyExistsError, - DataSetError, - DataSetNotFoundError, - LambdaDataSet, - MemoryDataSet, + DatasetAlreadyExistsError, + DatasetError, + DatasetNotFoundError, + LambdaDataset, + MemoryDataset, +) +from kedro.io.core import ( + _DEFAULT_PACKAGES, + VERSION_FORMAT, + Version, + generate_timestamp, + parse_dataset_definition, ) -from kedro.io.core import VERSION_FORMAT, generate_timestamp -from kedro.versioning import Journal @pytest.fixture @@ -93,37 +71,114 @@ def sane_config_with_nested_creds(sane_config): return sane_config +@pytest.fixture +def sane_config_with_tracking_ds(tmp_path): + boat_path = (tmp_path / "some" / "dir" / "test.csv").as_posix() + plane_path = (tmp_path / "some" / "dir" / "metrics.json").as_posix() + return { + "catalog": { + "boats": { + "type": "pandas.CSVDataSet", + "filepath": boat_path, + "versioned": True, + }, + "planes": {"type": "tracking.MetricsDataSet", "filepath": plane_path}, + }, + } + + +@pytest.fixture +def config_with_dataset_factories(): + return { + "catalog": { + "{brand}_cars": { + "type": "pandas.CSVDataSet", + "filepath": "data/01_raw/{brand}_cars.csv", + }, + "audi_cars": { + "type": "pandas.ParquetDataSet", + "filepath": "data/01_raw/audi_cars.pq", + }, + "{type}_boats": { + "type": "pandas.CSVDataSet", + "filepath": "data/01_raw/{type}_boats.csv", + }, + }, + } + + +@pytest.fixture +def config_with_dataset_factories_with_default(config_with_dataset_factories): + config_with_dataset_factories["catalog"]["{default_dataset}"] = { + "type": "pandas.CSVDataSet", + "filepath": "data/01_raw/{default_dataset}.csv", + } + return config_with_dataset_factories + + +@pytest.fixture +def config_with_dataset_factories_bad_pattern(config_with_dataset_factories): + config_with_dataset_factories["catalog"]["{type}@planes"] = { + "type": "pandas.ParquetDataSet", + "filepath": "data/01_raw/{brand}_plane.pq", + } + return config_with_dataset_factories + + +@pytest.fixture +def config_with_dataset_factories_only_patterns(): + return { + "catalog": { + "{default}": { + "type": "pandas.CSVDataSet", + "filepath": "data/01_raw/{default}.csv", + }, + "{namespace}_{dataset}": { + "type": "pandas.CSVDataSet", + "filepath": "data/01_raw/{namespace}_{dataset}.pq", + }, + "{country}_companies": { + "type": "pandas.CSVDataSet", + "filepath": "data/01_raw/{country}_companies.csv", + }, + "{dataset}s": { + "type": "pandas.CSVDataSet", + "filepath": "data/01_raw/{dataset}s.csv", + }, + }, + } + + @pytest.fixture def data_set(filepath): return CSVDataSet(filepath=filepath, save_args={"index": False}) @pytest.fixture -def multi_catalog(mocker): +def multi_catalog(): csv = CSVDataSet(filepath="abc.csv") parq = ParquetDataSet(filepath="xyz.parq") - journal = mocker.Mock() layers = {"raw": {"abc.csv"}, "model": {"xyz.parq"}} - return DataCatalog({"abc": csv, "xyz": parq}, journal=journal, layers=layers) + return DataCatalog({"abc": csv, "xyz": parq}, layers=layers) @pytest.fixture def memory_catalog(): - ds1 = MemoryDataSet({"data": 42}) - ds2 = MemoryDataSet([1, 2, 3, 4, 5]) + ds1 = MemoryDataset({"data": 42}) + ds2 = MemoryDataset([1, 2, 3, 4, 5]) return DataCatalog({"ds1": ds1, "ds2": ds2}) @pytest.fixture def conflicting_feed_dict(): - ds1 = MemoryDataSet({"data": 0}) + ds1 = MemoryDataset({"data": 0}) return {"ds1": ds1, "ds3": 1} -class BadDataSet(AbstractDataSet): # pragma: no cover +class BadDataset(AbstractDataSet): # pragma: no cover def 
__init__(self, filepath): self.filepath = filepath - raise Exception("Naughty!") + raise Exception("Naughty!") # pylint: disable=broad-exception-raised def _load(self): return None @@ -138,7 +193,7 @@ def _describe(self): @pytest.fixture def bad_config(filepath): return { - "bad": {"type": "tests.io.test_data_catalog.BadDataSet", "filepath": filepath} + "bad": {"type": "tests.io.test_data_catalog.BadDataset", "filepath": filepath} } @@ -183,27 +238,27 @@ def test_load_error(self, data_catalog): """Check the error when attempting to load a data set from nonexistent source""" pattern = r"Failed while loading data from data set CSVDataSet" - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): data_catalog.load("test") def test_add_data_set_twice(self, data_catalog, data_set): """Check the error when attempting to add the data set twice""" - pattern = r"DataSet 'test' has already been registered" - with pytest.raises(DataSetAlreadyExistsError, match=pattern): + pattern = r"Dataset 'test' has already been registered" + with pytest.raises(DatasetAlreadyExistsError, match=pattern): data_catalog.add("test", data_set) def test_load_from_unregistered(self): """Check the error when attempting to load unregistered data set""" catalog = DataCatalog(data_sets={}) - pattern = r"DataSet 'test' not found in the catalog" - with pytest.raises(DataSetNotFoundError, match=pattern): + pattern = r"Dataset 'test' not found in the catalog" + with pytest.raises(DatasetNotFoundError, match=pattern): catalog.load("test") def test_save_to_unregistered(self, dummy_dataframe): """Check the error when attempting to save to unregistered data set""" catalog = DataCatalog(data_sets={}) - pattern = r"DataSet 'test' not found in the catalog" - with pytest.raises(DataSetNotFoundError, match=pattern): + pattern = r"Dataset 'test' not found in the catalog" + with pytest.raises(DatasetNotFoundError, match=pattern): catalog.save("test", dummy_dataframe) def test_feed_dict(self, memory_catalog, conflicting_feed_dict): @@ -222,13 +277,13 @@ def test_exists(self, data_catalog, dummy_dataframe): def test_exists_not_implemented(self, caplog): """Test calling `exists` on the data set, which didn't implement it""" - catalog = DataCatalog(data_sets={"test": LambdaDataSet(None, None)}) + catalog = DataCatalog(data_sets={"test": LambdaDataset(None, None)}) result = catalog.exists("test") log_record = caplog.records[0] assert log_record.levelname == "WARNING" assert ( - "`exists()` not implemented for `LambdaDataSet`. " + "'exists()' not implemented for 'LambdaDataset'. " "Assuming output does not exist." 
in log_record.message ) assert result is False @@ -239,18 +294,18 @@ def test_exists_invalid(self, data_catalog): def test_release_unregistered(self, data_catalog): """Check the error when calling `release` on unregistered data set""" - pattern = r"DataSet \'wrong_key\' not found in the catalog" - with pytest.raises(DataSetNotFoundError, match=pattern) as e: + pattern = r"Dataset \'wrong_key\' not found in the catalog" + with pytest.raises(DatasetNotFoundError, match=pattern) as e: data_catalog.release("wrong_key") assert "did you mean" not in str(e.value) def test_release_unregistered_typo(self, data_catalog): """Check the error when calling `release` on mistyped data set""" pattern = ( - "DataSet 'text' not found in the catalog" + "Dataset 'text' not found in the catalog" " - did you mean one of these instead: test" ) - with pytest.raises(DataSetNotFoundError, match=re.escape(pattern)): + with pytest.raises(DatasetNotFoundError, match=re.escape(pattern)): data_catalog.release("text") def test_multi_catalog_list(self, multi_catalog): @@ -276,12 +331,12 @@ def test_multi_catalog_list_regex(self, multi_catalog, pattern, expected): def test_multi_catalog_list_bad_regex(self, multi_catalog): """Test that bad regex is caught accordingly""" escaped_regex = r"\(\(" - pattern = f"Invalid regular expression provided: `{escaped_regex}`" + pattern = f"Invalid regular expression provided: '{escaped_regex}'" with pytest.raises(SyntaxError, match=pattern): multi_catalog.list("((") def test_eq(self, multi_catalog, data_catalog): - assert multi_catalog == multi_catalog # pylint: disable=comparison-with-itself + assert multi_catalog == multi_catalog # noqa: PLR0124 assert multi_catalog == multi_catalog.shallow_copy() assert multi_catalog != data_catalog @@ -302,6 +357,18 @@ def test_adding_datasets_not_allowed(self, data_catalog_from_config): with pytest.raises(AttributeError, match=pattern): data_catalog_from_config.datasets.new_dataset = None + def test_add_feed_dict_should_grow_linearly(self, mocker, data_catalog_from_config): + """Check number of calls to `_sub_nonword_chars` when adding feed dict + should grow linearly with the number of keys in the dict. + Simulate this issue: https://github.com/kedro-org/kedro/issues/951 + """ + mock_sub_nonword_chars = mocker.patch( + "kedro.io.data_catalog._sub_nonword_chars" + ) + feed_dict = {"key1": "val1", "key2": "val2", "key3": "val3", "key4": "val4"} + data_catalog_from_config.add_feed_dict(feed_dict) + assert mock_sub_nonword_chars.call_count == len(feed_dict) + def test_mutating_datasets_not_allowed(self, data_catalog_from_config): """Check error if user tries to update the datasets attribute""" pattern = "Please change datasets through configuration." 
@@ -315,20 +382,24 @@ def test_confirm(self, mocker, caplog): data_catalog.confirm("mocked") mock_ds.confirm.assert_called_once_with() assert caplog.record_tuples == [ - ("kedro.io.data_catalog", logging.INFO, "Confirming DataSet 'mocked'") + ( + "kedro.io.data_catalog", + logging.INFO, + "Confirming dataset 'mocked'", + ) ] @pytest.mark.parametrize( "dataset_name,error_pattern", [ - ("missing", "DataSet 'missing' not found in the catalog"), - ("test", "DataSet 'test' does not have 'confirm' method"), + ("missing", "Dataset 'missing' not found in the catalog"), + ("test", "Dataset 'test' does not have 'confirm' method"), ], ) def test_bad_confirm(self, data_catalog, dataset_name, error_pattern): """Test confirming a non existent dataset or one that does not have `confirm` method""" - with pytest.raises(DataSetError, match=re.escape(error_pattern)): + with pytest.raises(DatasetError, match=re.escape(error_pattern)): data_catalog.confirm(dataset_name) def test_layers(self, data_catalog, data_catalog_from_config): @@ -350,10 +421,10 @@ def test_config_missing_type(self, sane_config): in the config""" del sane_config["catalog"]["boats"]["type"] pattern = ( - "An exception occurred when parsing config for DataSet `boats`:\n" - "`type` is missing from DataSet catalog configuration" + "An exception occurred when parsing config for dataset 'boats':\n" + "'type' is missing from dataset catalog configuration" ) - with pytest.raises(DataSetError, match=re.escape(pattern)): + with pytest.raises(DatasetError, match=re.escape(pattern)): DataCatalog.from_config(**sane_config) def test_config_invalid_module(self, sane_config): @@ -362,18 +433,32 @@ def test_config_invalid_module(self, sane_config): "type" ] = "kedro.invalid_module_name.io.CSVDataSet" - error_msg = "Class `kedro.invalid_module_name.io.CSVDataSet` not found" - with pytest.raises(DataSetError, match=re.escape(error_msg)): + error_msg = "Class 'kedro.invalid_module_name.io.CSVDataSet' not found" + with pytest.raises(DatasetError, match=re.escape(error_msg)): DataCatalog.from_config(**sane_config) def test_config_relative_import(self, sane_config): """Check the error if the type points to a relative import""" sane_config["catalog"]["boats"]["type"] = ".CSVDataSetInvalid" - pattern = "`type` class path does not support relative paths" - with pytest.raises(DataSetError, match=re.escape(pattern)): + pattern = "'type' class path does not support relative paths" + with pytest.raises(DatasetError, match=re.escape(pattern)): DataCatalog.from_config(**sane_config) + def test_config_import_kedro_datasets(self, sane_config, mocker): + """Test kedro.extras.datasets default path to the dataset class""" + # Spy _load_obj because kedro_datasets is not installed and we can't import it. + + import kedro.io.core # pylint: disable=import-outside-toplevel + + spy = mocker.spy(kedro.io.core, "_load_obj") + parse_dataset_definition(sane_config["catalog"]["boats"]) + for prefix, call_args in zip(_DEFAULT_PACKAGES, spy.call_args_list): + # In Python 3.7 call_args.args is not available thus we access the call + # arguments with less meaningful index. + # The 1st index returns a tuple, the 2nd index return the name of module. 
+ assert call_args[0][0] == f"{prefix}pandas.CSVDataSet" + def test_config_import_extras(self, sane_config): """Test kedro.extras.datasets default path to the dataset class""" sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataSet" @@ -384,31 +469,31 @@ def test_config_missing_class(self, sane_config): sane_config["catalog"]["boats"]["type"] = "kedro.io.CSVDataSetInvalid" pattern = ( - "An exception occurred when parsing config for DataSet `boats`:\n" - "Class `kedro.io.CSVDataSetInvalid` not found" + "An exception occurred when parsing config for dataset 'boats':\n" + "Class 'kedro.io.CSVDataSetInvalid' not found" ) - with pytest.raises(DataSetError, match=re.escape(pattern)): + with pytest.raises(DatasetError, match=re.escape(pattern)): DataCatalog.from_config(**sane_config) def test_config_invalid_data_set(self, sane_config): """Check the error if the type points to invalid class""" sane_config["catalog"]["boats"]["type"] = "DataCatalog" pattern = ( - "An exception occurred when parsing config for DataSet `boats`:\n" - "DataSet type `kedro.io.data_catalog.DataCatalog` is invalid: " - "all data set types must extend `AbstractDataSet`" + "An exception occurred when parsing config for dataset 'boats':\n" + "Dataset type 'kedro.io.data_catalog.DataCatalog' is invalid: " + "all data set types must extend 'AbstractDataSet'" ) - with pytest.raises(DataSetError, match=re.escape(pattern)): + with pytest.raises(DatasetError, match=re.escape(pattern)): DataCatalog.from_config(**sane_config) def test_config_invalid_arguments(self, sane_config): """Check the error if the data set config contains invalid arguments""" sane_config["catalog"]["boats"]["save_and_load_args"] = False pattern = ( - r"DataSet 'boats' must only contain arguments valid for " - r"the constructor of `.*CSVDataSet`" + r"Dataset 'boats' must only contain arguments valid for " + r"the constructor of '.*CSVDataSet'" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): DataCatalog.from_config(**sane_config) def test_empty_config(self): @@ -469,7 +554,7 @@ def dummy_load(obj_path, *args, **kwargs): return ["CSVDataSet"] mocker.patch("kedro.io.core.load_obj", side_effect=dummy_load) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): DataCatalog.from_config(**sane_config) def test_idempotent_catalog(self, sane_config): @@ -480,16 +565,16 @@ def test_idempotent_catalog(self, sane_config): def test_error_dataset_init(self, bad_config): """Check the error when trying to instantiate erroneous data set""" - pattern = r"Failed to instantiate DataSet \'bad\' of type `.*BadDataSet`" - with pytest.raises(DataSetError, match=pattern): + pattern = r"Failed to instantiate dataset \'bad\' of type '.*BadDataset'" + with pytest.raises(DatasetError, match=pattern): DataCatalog.from_config(bad_config, None) def test_confirm(self, tmp_path, caplog, mocker): """Confirm the dataset""" - mock_confirm = mocker.patch("kedro.io.IncrementalDataSet.confirm") + mock_confirm = mocker.patch("kedro.io.IncrementalDataset.confirm") catalog = { "ds_to_confirm": { - "type": "IncrementalDataSet", + "type": "IncrementalDataset", "dataset": "pandas.CSVDataSet", "path": str(tmp_path), } @@ -500,7 +585,7 @@ def test_confirm(self, tmp_path, caplog, mocker): ( "kedro.io.data_catalog", logging.INFO, - "Confirming DataSet 'ds_to_confirm'", + "Confirming dataset 'ds_to_confirm'", ) ] mock_confirm.assert_called_once_with() @@ -508,15 +593,15 @@ def test_confirm(self, 
tmp_path, caplog, mocker): @pytest.mark.parametrize( "dataset_name,pattern", [ - ("missing", "DataSet 'missing' not found in the catalog"), - ("boats", "DataSet 'boats' does not have 'confirm' method"), + ("missing", "Dataset 'missing' not found in the catalog"), + ("boats", "Dataset 'boats' does not have 'confirm' method"), ], ) def test_bad_confirm(self, sane_config, dataset_name, pattern): """Test confirming non existent dataset or the one that does not have `confirm` method""" data_catalog = DataCatalog.from_config(**sane_config) - with pytest.raises(DataSetError, match=re.escape(pattern)): + with pytest.raises(DatasetError, match=re.escape(pattern)): data_catalog.confirm(dataset_name) @@ -533,16 +618,12 @@ def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): ) version = fmt.format(d=current_ts, ms=current_ts.microsecond // 1000) - journal = Journal({"run_id": "fake-id", "project_path": "fake-path"}) catalog = DataCatalog.from_config( **sane_config, load_versions={"boats": version}, save_version=version, - journal=journal, ) - assert catalog._journal == journal - catalog.save("boats", dummy_dataframe) path = Path(sane_config["catalog"]["boats"]["filepath"]) path = path / version / path.name @@ -573,7 +654,7 @@ def test_from_sane_config_versioned_warn(self, caplog, sane_config, versioned): DataCatalog.from_config(**sane_config) log_record = caplog.records[0] expected_log_message = ( - "`version` attribute removed from data set configuration since it " + "'version' attribute removed from data set configuration since it " "is a reserved word and cannot be directly specified" ) assert log_record.levelname == "WARNING" @@ -583,10 +664,34 @@ def test_from_sane_config_load_versions_warn(self, sane_config): sane_config["catalog"]["boats"]["versioned"] = True version = generate_timestamp() load_version = {"non-boart": version} - pattern = r"\`load_versions\` keys \[non-boart\] are not found in the catalog\." - with pytest.warns(UserWarning, match=pattern): + pattern = r"\'load_versions\' keys \[non-boart\] are not found in the catalog\." 
+ with pytest.raises(DatasetNotFoundError, match=pattern): DataCatalog.from_config(**sane_config, load_versions=load_version) + def test_compare_tracking_and_other_dataset_versioned( + self, sane_config_with_tracking_ds, dummy_dataframe + ): + """Test saving of tracking data sets from config results in the same + save version as other versioned datasets.""" + + catalog = DataCatalog.from_config(**sane_config_with_tracking_ds) + + catalog.save("boats", dummy_dataframe) + dummy_data = {"col1": 1, "col2": 2, "col3": 3} + catalog.save("planes", dummy_data) + + # Verify that saved version on tracking dataset is the same as on the CSV dataset + csv_timestamp = datetime.strptime( + catalog.datasets.boats.resolve_save_version(), # pylint: disable=no-member + VERSION_FORMAT, + ) + tracking_timestamp = datetime.strptime( + catalog.datasets.planes.resolve_save_version(), # pylint: disable=no-member + VERSION_FORMAT, + ) + + assert tracking_timestamp == csv_timestamp + def test_load_version(self, sane_config, dummy_dataframe, mocker): """Test load versioned data sets from config""" new_dataframe = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]}) @@ -615,7 +720,7 @@ def test_load_version_on_unversioned_dataset( catalog = DataCatalog.from_config(**sane_config) catalog.save("boats", dummy_dataframe) - with pytest.raises(DataSetError): + with pytest.raises(DatasetError): catalog.load("boats", version="first") def test_replacing_nonword_characters(self): @@ -633,3 +738,162 @@ def test_replacing_nonword_characters(self): assert "ds2_spark" in catalog.datasets.__dict__ assert "ds3__csv" in catalog.datasets.__dict__ assert "jalapeño" in catalog.datasets.__dict__ + + def test_no_versions_with_cloud_protocol(self): + """Check the error if no versions are available for load from cloud storage""" + version = Version(load=None, save=None) + versioned_dataset = CSVDataSet("s3://bucket/file.csv", version=version) + pattern = re.escape( + f"Did not find any versions for {versioned_dataset}. " + f"This could be due to insufficient permission." 
+ ) + with pytest.raises(DatasetError, match=pattern): + versioned_dataset.load() + + +class TestDataCatalogDatasetFactories: + def test_match_added_to_datasets_on_get(self, config_with_dataset_factories): + """Check that the datasets that match patterns are only added when fetched""" + catalog = DataCatalog.from_config(**config_with_dataset_factories) + assert "{brand}_cars" not in catalog._data_sets + assert "tesla_cars" not in catalog._data_sets + assert "{brand}_cars" in catalog._dataset_patterns + + tesla_cars = catalog._get_dataset("tesla_cars") + assert isinstance(tesla_cars, CSVDataSet) + assert "tesla_cars" in catalog._data_sets + + @pytest.mark.parametrize( + "dataset_name, expected", + [ + ("audi_cars", True), + ("tesla_cars", True), + ("row_boats", True), + ("boats", False), + ("tesla_card", False), + ], + ) + def test_exists_in_catalog_config( + self, config_with_dataset_factories, dataset_name, expected + ): + """Check that the dataset exists in catalog when it matches a pattern + or is in the catalog""" + catalog = DataCatalog.from_config(**config_with_dataset_factories) + assert (dataset_name in catalog) == expected + + def test_patterns_not_in_catalog_datasets(self, config_with_dataset_factories): + """Check that the pattern is not in the catalog datasets""" + catalog = DataCatalog.from_config(**config_with_dataset_factories) + assert "audi_cars" in catalog._data_sets + assert "{brand}_cars" not in catalog._data_sets + assert "audi_cars" not in catalog._dataset_patterns + assert "{brand}_cars" in catalog._dataset_patterns + + def test_explicit_entry_not_overwritten(self, config_with_dataset_factories): + """Check that the existing catalog entry is not overwritten by config in pattern""" + catalog = DataCatalog.from_config(**config_with_dataset_factories) + audi_cars = catalog._get_dataset("audi_cars") + assert isinstance(audi_cars, ParquetDataSet) + + @pytest.mark.parametrize( + "dataset_name,pattern", + [ + ("missing", "Dataset 'missing' not found in the catalog"), + ("tesla@cars", "Dataset 'tesla@cars' not found in the catalog"), + ], + ) + def test_dataset_not_in_catalog_when_no_pattern_match( + self, config_with_dataset_factories, dataset_name, pattern + ): + """Check that the dataset is not added to the catalog when there is no pattern""" + catalog = DataCatalog.from_config(**config_with_dataset_factories) + with pytest.raises(DatasetError, match=re.escape(pattern)): + catalog._get_dataset(dataset_name) + + def test_sorting_order_patterns(self, config_with_dataset_factories_only_patterns): + """Check that the sorted order of the patterns is correct according + to parsing rules""" + catalog = DataCatalog.from_config(**config_with_dataset_factories_only_patterns) + sorted_keys_expected = [ + "{country}_companies", + "{namespace}_{dataset}", + "{dataset}s", + "{default}", + ] + assert list(catalog._dataset_patterns.keys()) == sorted_keys_expected + + def test_default_dataset(self, config_with_dataset_factories_with_default, caplog): + """Check that default dataset is used when no other pattern matches""" + catalog = DataCatalog.from_config(**config_with_dataset_factories_with_default) + assert "jet@planes" not in catalog._data_sets + jet_dataset = catalog._get_dataset("jet@planes") + log_record = caplog.records[0] + assert log_record.levelname == "WARNING" + assert ( + "Config from the dataset factory pattern '{default_dataset}' " + "in the catalog will be used to override the default " + "MemoryDataset creation for the dataset 'jet@planes'" in log_record.message + ) + 
assert isinstance(jet_dataset, CSVDataSet) + + def test_unmatched_key_error_when_parsing_config( + self, config_with_dataset_factories_bad_pattern + ): + """Check error raised when key mentioned in the config is not in pattern name""" + catalog = DataCatalog.from_config(**config_with_dataset_factories_bad_pattern) + pattern = "Unable to resolve 'filepath' for the pattern '{type}@planes'" + with pytest.raises(DatasetError, match=re.escape(pattern)): + catalog._get_dataset("jet@planes") + + def test_factory_layer(self, config_with_dataset_factories): + """Check that layer is correctly processed for patterned datasets""" + config_with_dataset_factories["catalog"]["{brand}_cars"]["layer"] = "raw" + catalog = DataCatalog.from_config(**config_with_dataset_factories) + _ = catalog._get_dataset("tesla_cars") + assert catalog.layers["raw"] == {"tesla_cars"} + + def test_factory_config_versioned( + self, config_with_dataset_factories, filepath, dummy_dataframe + ): + """Test load and save of versioned data sets from config""" + config_with_dataset_factories["catalog"]["{brand}_cars"]["versioned"] = True + config_with_dataset_factories["catalog"]["{brand}_cars"]["filepath"] = filepath + + assert "tesla_cars" not in config_with_dataset_factories + + # Decompose `generate_timestamp` to keep `current_ts` reference. + current_ts = datetime.now(tz=timezone.utc) + fmt = ( + "{d.year:04d}-{d.month:02d}-{d.day:02d}T{d.hour:02d}" + ".{d.minute:02d}.{d.second:02d}.{ms:03d}Z" + ) + version = fmt.format(d=current_ts, ms=current_ts.microsecond // 1000) + + catalog = DataCatalog.from_config( + **config_with_dataset_factories, + load_versions={"tesla_cars": version}, + save_version=version, + ) + + catalog.save("tesla_cars", dummy_dataframe) + path = Path( + config_with_dataset_factories["catalog"]["{brand}_cars"]["filepath"] + ) + path = path / version / path.name + assert path.is_file() + + reloaded_df = catalog.load("tesla_cars") + assert_frame_equal(reloaded_df, dummy_dataframe) + + reloaded_df_version = catalog.load("tesla_cars", version=version) + assert_frame_equal(reloaded_df_version, dummy_dataframe) + + # Verify that `VERSION_FORMAT` can help regenerate `current_ts`. + actual_timestamp = datetime.strptime( + catalog.datasets.tesla_cars.resolve_load_version(), # pylint: disable=no-member + VERSION_FORMAT, + ) + expected_timestamp = current_ts.replace( + microsecond=current_ts.microsecond // 1000 * 1000, tzinfo=None + ) + assert actual_timestamp == expected_timestamp diff --git a/tests/io/test_data_catalog_with_default.py b/tests/io/test_data_catalog_with_default.py deleted file mode 100644 index 72cfff1f9b..0000000000 --- a/tests/io/test_data_catalog_with_default.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -import pandas as pd -import pytest - -from kedro.extras.datasets.pandas import CSVDataSet -from kedro.io import DataCatalog, DataCatalogWithDefault, MemoryDataSet - - -@pytest.fixture -def filepath(tmp_path): - return str(tmp_path / "some" / "dir" / "test.csv") - - -@pytest.fixture -def data_set(filepath): - return CSVDataSet(filepath=filepath, save_args={"index": False}) - - -def default_csv(name): - return CSVDataSet(name) - - -@pytest.fixture -def dummy_dataframe(): - return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) - - -@pytest.fixture -def sane_config(filepath): - return { - "catalog": { - "boats": { - "type": "kedro.extras.datasets.pandas.CSVDataSet", - "filepath": filepath, - }, - "cars": { - "type": "kedro.extras.datasets.pandas.CSVDataSet", - "filepath": "s3://test_bucket/test_file.csv", - "credentials": "s3_credentials", - }, - }, - "credentials": { - "s3_credentials": {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} - }, - } - - -def test_load_from_unregistered(dummy_dataframe, tmpdir): - catalog = DataCatalogWithDefault(data_sets={}, default=default_csv) - - path = str(tmpdir.mkdir("sub").join("test.csv")) - catalog.save(path, dummy_dataframe) - reloaded_df = catalog.load(path) - - assert dummy_dataframe.equals(reloaded_df) - - -def test_save_and_load_catalog(data_set, dummy_dataframe, tmpdir): - catalog = DataCatalogWithDefault(data_sets={"test": data_set}, default=default_csv) - - path = str(tmpdir.mkdir("sub").join("test")) - catalog.save(path, dummy_dataframe) - reloaded_df = catalog.load(path) - assert dummy_dataframe.equals(reloaded_df) - - -def test_from_sane_config(sane_config): - with pytest.raises( - ValueError, match="Cannot instantiate a `DataCatalogWithDefault`" - ): - DataCatalogWithDefault.from_config( - sane_config["catalog"], sane_config["credentials"] - ) - - -def test_from_sane_config_default(sane_config, dummy_dataframe, tmpdir): - catalog = DataCatalog.from_config( - sane_config["catalog"], sane_config["credentials"] - ) - catalog_with_default = DataCatalogWithDefault.from_data_catalog( - catalog, default_csv - ) - path = str(tmpdir.mkdir("sub").join("missing.csv")) - catalog_with_default.save(path, dummy_dataframe) - reloaded_df = catalog_with_default.load(path) - assert dummy_dataframe.equals(reloaded_df) - - -def test_default_none(): - with pytest.raises( - TypeError, - match="Default must be a callable with a " - "single input string argument: the " - "key of the requested data set.", - ): - DataCatalogWithDefault(data_sets={}, default=None) - - -# pylint: disable=unused-argument -def default_memory(name): - return MemoryDataSet(5) - - -def test_remember_load(): - catalog = DataCatalogWithDefault( - data_sets={}, default=default_memory, remember=True - ) - assert catalog.load("any") == 5 - assert "any" in catalog.list() - - -def test_remember_save(tmpdir, 
dummy_dataframe): - catalog = DataCatalogWithDefault(data_sets={}, default=default_csv, remember=True) - - path = str(tmpdir.mkdir("sub").join("test.csv")) - catalog.save(path, dummy_dataframe) - assert tmpdir.join("sub").join("test.csv") in catalog.list() diff --git a/tests/io/test_incremental_dataset.py b/tests/io/test_incremental_dataset.py index 530d3f9ef6..0ea0f81796 100644 --- a/tests/io/test_incremental_dataset.py +++ b/tests/io/test_incremental_dataset.py @@ -1,33 +1,9 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
+from __future__ import annotations + +import os import re from pathlib import Path -from typing import Any, Dict +from typing import Any import boto3 import pandas as pd @@ -37,7 +13,7 @@ from kedro.extras.datasets.pickle import PickleDataSet from kedro.extras.datasets.text import TextDataSet -from kedro.io import AbstractDataSet, DataSetError, IncrementalDataSet +from kedro.io import AbstractDataSet, DatasetError, IncrementalDataset from kedro.io.data_catalog import CREDENTIALS_KEY DATASET = "kedro.extras.datasets.pandas.CSVDataSet" @@ -65,11 +41,11 @@ def local_csvs(tmp_path, partitioned_data_pandas): return local_dir -class DummyDataSet(AbstractDataSet): # pragma: no cover +class DummyDataset(AbstractDataSet): # pragma: no cover def __init__(self, filepath): pass - def _describe(self) -> Dict[str, Any]: + def _describe(self) -> dict[str, Any]: return {"dummy": True} def _load(self) -> Any: @@ -87,11 +63,11 @@ def dummy_lt_func(value1: str, value2: str): return value1 < value2 -class TestIncrementalDataSetLocal: +class TestIncrementalDatasetLocal: def test_load_and_confirm(self, local_csvs, partitioned_data_pandas): """Test the standard flow for loading, confirming and reloading - an IncrementalDataSet""" - pds = IncrementalDataSet(str(local_csvs), DATASET) + an IncrementalDataset""" + pds = IncrementalDataset(str(local_csvs), DATASET) loaded = pds.load() assert loaded.keys() == partitioned_data_pandas.keys() for partition_id, data in loaded.items(): @@ -111,11 +87,11 @@ def test_load_and_confirm(self, local_csvs, partitioned_data_pandas): assert reloaded_after_release == {} def test_save(self, local_csvs): - """Test saving a new partition into an IncrementalDataSet""" + """Test saving a new partition into an IncrementalDataset""" df = pd.DataFrame({"dummy": [1, 2, 3]}) new_partition_key = "p05/data.csv" new_partition_path = local_csvs / new_partition_key - pds = IncrementalDataSet(str(local_csvs), DATASET) + pds = IncrementalDataset(str(local_csvs), DATASET) assert not new_partition_path.exists() assert new_partition_key not in pds.load() @@ -145,7 +121,7 @@ def test_save(self, local_csvs): def test_filename_suffix(self, filename_suffix, expected_partitions, local_csvs): """Test how specifying filename_suffix affects the available partitions and their names""" - pds = IncrementalDataSet( + pds = IncrementalDataset( str(local_csvs), DATASET, filename_suffix=filename_suffix ) loaded = pds.load() @@ -176,7 +152,7 @@ def test_force_checkpoint_no_checkpoint_file( ): """Test how forcing checkpoint value affects the available partitions if the checkpoint file does not exist""" - pds = IncrementalDataSet(str(local_csvs), DATASET, checkpoint=forced_checkpoint) + pds = IncrementalDataset(str(local_csvs), DATASET, checkpoint=forced_checkpoint) loaded = pds.load() assert loaded.keys() == expected_partitions @@ -211,11 +187,11 @@ def test_force_checkpoint_checkpoint_file_exists( ): """Test how forcing checkpoint value affects the available partitions if the checkpoint file exists""" - IncrementalDataSet(str(local_csvs), DATASET).confirm() - checkpoint = local_csvs / IncrementalDataSet.DEFAULT_CHECKPOINT_FILENAME + IncrementalDataset(str(local_csvs), DATASET).confirm() + checkpoint = local_csvs / IncrementalDataset.DEFAULT_CHECKPOINT_FILENAME assert checkpoint.read_text() == "p04/data.csv" - pds = IncrementalDataSet(str(local_csvs), DATASET, checkpoint=forced_checkpoint) + pds = IncrementalDataset(str(local_csvs), DATASET, checkpoint=forced_checkpoint) assert pds._checkpoint.exists() loaded = 
pds.load() assert loaded.keys() == expected_partitions @@ -226,7 +202,7 @@ def test_force_checkpoint_checkpoint_file_exists( def test_force_checkpoint_no_partitions(self, forced_checkpoint, local_csvs): """Test that forcing the checkpoint to certain values results in no partitions being returned""" - pds = IncrementalDataSet(str(local_csvs), DATASET, checkpoint=forced_checkpoint) + pds = IncrementalDataset(str(local_csvs), DATASET, checkpoint=forced_checkpoint) loaded = pds.load() assert loaded == {} @@ -241,7 +217,7 @@ def test_checkpoint_path(self, local_csvs, partitioned_data_pandas): checkpoint_path = local_csvs / "checkpoint_folder" / "checkpoint_file" assert not checkpoint_path.exists() - IncrementalDataSet( + IncrementalDataset( str(local_csvs), DATASET, checkpoint={"filepath": str(checkpoint_path)} ).confirm() assert checkpoint_path.is_file() @@ -252,14 +228,14 @@ def test_checkpoint_path(self, local_csvs, partitioned_data_pandas): [ (None, TextDataSet), ({"type": "kedro.extras.datasets.pickle.PickleDataSet"}, PickleDataSet), - ({"type": "tests.io.test_incremental_dataset.DummyDataSet"}, DummyDataSet), + ({"type": "tests.io.test_incremental_dataset.DummyDataset"}, DummyDataset), ], ) def test_checkpoint_type( self, tmp_path, checkpoint_config, expected_checkpoint_class ): """Test configuring a different checkpoint dataset type""" - pds = IncrementalDataSet(str(tmp_path), DATASET, checkpoint=checkpoint_config) + pds = IncrementalDataset(str(tmp_path), DATASET, checkpoint=checkpoint_config) assert isinstance(pds._checkpoint, expected_checkpoint_class) @pytest.mark.parametrize( @@ -267,14 +243,14 @@ def test_checkpoint_type( [ ( {"versioned": True}, - "`IncrementalDataSet` does not support versioning " - "of the checkpoint. Please remove `versioned` key from the " + "'IncrementalDataset' does not support versioning " + "of the checkpoint. Please remove 'versioned' key from the " "checkpoint definition.", ), ( {"version": None}, - "`IncrementalDataSet` does not support versioning " - "of the checkpoint. Please remove `version` key from the " + "'IncrementalDataset' does not support versioning " + "of the checkpoint. 
Please remove 'version' key from the " "checkpoint definition.", ), ], @@ -283,8 +259,8 @@ def test_checkpoint_versioning_not_allowed( self, tmp_path, checkpoint_config, error_pattern ): """Test that invalid checkpoint configurations raise expected errors""" - with pytest.raises(DataSetError, match=re.escape(error_pattern)): - IncrementalDataSet(str(tmp_path), DATASET, checkpoint=checkpoint_config) + with pytest.raises(DatasetError, match=re.escape(error_pattern)): + IncrementalDataset(str(tmp_path), DATASET, checkpoint=checkpoint_config) @pytest.mark.parametrize("dataset_config", [{"type": DATASET, "versioned": True}]) @pytest.mark.parametrize( @@ -400,7 +376,7 @@ def test_malformed_versioned_path(self, tmp_path): def test_credentials(self, pds_config, fs_creds, dataset_creds, checkpoint_creds): """Test correctness of credentials propagation into the dataset and checkpoint constructors""" - pds = IncrementalDataSet(str(Path.cwd()), **pds_config) + pds = IncrementalDataset(str(Path.cwd()), **pds_config) assert pds._credentials == fs_creds assert pds._dataset_config[CREDENTIALS_KEY] == dataset_creds assert pds._checkpoint_config[CREDENTIALS_KEY] == checkpoint_creds @@ -427,7 +403,7 @@ def test_comparison_func(self, comparison_func, expected_partitions, local_csvs) "force_checkpoint": "p02/data.csv", "comparison_func": comparison_func, } - pds = IncrementalDataSet(str(local_csvs), DATASET, checkpoint=checkpoint_config) + pds = IncrementalDataset(str(local_csvs), DATASET, checkpoint=checkpoint_config) assert pds.load().keys() == expected_partitions @@ -440,6 +416,7 @@ def mocked_s3_bucket(): with mock_s3(): conn = boto3.client( "s3", + region_name="us-east-1", aws_access_key_id="fake_access_key", aws_secret_access_key="fake_secret_key", ) @@ -460,15 +437,13 @@ def mocked_csvs_in_s3(mocked_s3_bucket, partitioned_data_pandas): class TestPartitionedDataSetS3: - @pytest.fixture(autouse=True) - def fake_aws_creds(self, monkeypatch): - monkeypatch.setenv("AWS_ACCESS_KEY_ID", "FAKE_ACCESS_KEY") - monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "FAKE_SECRET_KEY") + os.environ["AWS_ACCESS_KEY_ID"] = "FAKE_ACCESS_KEY" + os.environ["AWS_SECRET_ACCESS_KEY"] = "FAKE_SECRET_KEY" def test_load_and_confirm(self, mocked_csvs_in_s3, partitioned_data_pandas): """Test the standard flow for loading, confirming and reloading - a IncrementalDataSet in S3""" - pds = IncrementalDataSet(mocked_csvs_in_s3, DATASET) + a IncrementalDataset in S3""" + pds = IncrementalDataset(mocked_csvs_in_s3, DATASET) assert pds._checkpoint._protocol == "s3" loaded = pds.load() assert loaded.keys() == partitioned_data_pandas.keys() @@ -485,7 +460,7 @@ def test_load_and_confirm_s3a( self, mocked_csvs_in_s3, partitioned_data_pandas, mocker ): s3a_path = f"s3a://{mocked_csvs_in_s3.split('://', 1)[1]}" - pds = IncrementalDataSet(s3a_path, DATASET) + pds = IncrementalDataset(s3a_path, DATASET) assert pds._protocol == "s3a" assert pds._checkpoint._protocol == "s3" @@ -525,7 +500,7 @@ def test_force_checkpoint_no_checkpoint_file( ): """Test how forcing checkpoint value affects the available partitions in S3 if the checkpoint file does not exist""" - pds = IncrementalDataSet( + pds = IncrementalDataset( mocked_csvs_in_s3, DATASET, checkpoint=forced_checkpoint ) loaded = pds.load() @@ -562,14 +537,14 @@ def test_force_checkpoint_checkpoint_file_exists( """Test how forcing checkpoint value affects the available partitions in S3 if the checkpoint file exists""" # create checkpoint and assert that it exists - IncrementalDataSet(mocked_csvs_in_s3, 
DATASET).confirm() - checkpoint_path = "{}/{}".format( - mocked_csvs_in_s3, IncrementalDataSet.DEFAULT_CHECKPOINT_FILENAME + IncrementalDataset(mocked_csvs_in_s3, DATASET).confirm() + checkpoint_path = ( + f"{mocked_csvs_in_s3}/{IncrementalDataset.DEFAULT_CHECKPOINT_FILENAME}" ) checkpoint_value = TextDataSet(checkpoint_path).load() assert checkpoint_value == "p04/data.csv" - pds = IncrementalDataSet( + pds = IncrementalDataset( mocked_csvs_in_s3, DATASET, checkpoint=forced_checkpoint ) assert pds._checkpoint.exists() @@ -582,7 +557,7 @@ def test_force_checkpoint_checkpoint_file_exists( def test_force_checkpoint_no_partitions(self, forced_checkpoint, mocked_csvs_in_s3): """Test that forcing the checkpoint to certain values results in no partitions returned from S3""" - pds = IncrementalDataSet( + pds = IncrementalDataset( mocked_csvs_in_s3, DATASET, checkpoint=forced_checkpoint ) loaded = pds.load() diff --git a/tests/io/test_lambda_data_set.py b/tests/io/test_lambda_data_set.py deleted file mode 100644 index 7a8bccb54b..0000000000 --- a/tests/io/test_lambda_data_set.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
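The deleted tests below reappear later in this diff as tests/io/test_lambda_dataset.py, using the renamed LambdaDataset and DatasetError. A minimal sketch of the wrapper they test, with an in-memory store defined purely for illustration:

from kedro.io import LambdaDataset

_store = {}  # illustrative in-memory backing store


def _load():
    return _store["value"]


def _save(data):
    _store["value"] = data


dataset = LambdaDataset(_load, _save)  # positional args: load callable, save callable
dataset.save(42)
assert dataset.load() == 42
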
- -import pytest - -from kedro.io import DataSetError, LambdaDataSet - - -@pytest.fixture -def mocked_save(mocker): - return mocker.Mock() - - -@pytest.fixture -def mocked_data_set(mocked_save): - return LambdaDataSet(None, mocked_save) - - -def test_data_set_describe(): - """Test `describe` method invocation""" - - def _dummy_load(): - pass # pragma: no cover - - def _dummy_save(): - pass # pragma: no cover - - def _dummy_exists(): - return False # pragma: no cover - - def _dummy_release(): - pass # pragma: no cover - - assert "LambdaDataSet(load=)" in str( - LambdaDataSet(_dummy_load, None) - ) - assert "LambdaDataSet(save=)" in str( - LambdaDataSet(None, _dummy_save) - ) - assert "LambdaDataSet(exists=)" in str( - LambdaDataSet(None, None, _dummy_exists) - ) - assert ( - "LambdaDataSet(release=)" - in str(LambdaDataSet(None, None, None, _dummy_release)) - ) - - # __init__ keys alphabetically sorted, None values not shown - expected = ( - "LambdaDataSet(exists=, " - "load=, " - "save=)" - ) - actual = str(LambdaDataSet(_dummy_load, _dummy_save, _dummy_exists, None)) - assert actual == expected - - -class TestLambdaDataSetLoad: - def test_load_invocation(self, mocker): - """Test the basic `load` method invocation""" - mocked_load = mocker.Mock(return_value=42) - data_set = LambdaDataSet(mocked_load, None) - result = data_set.load() - - mocked_load.assert_called_once_with() - assert result == 42 - - def test_load_raises_error(self): - """Check the error if loading the LambdaDataSet raises an exception""" - error_message = "Internal load exception message" - - def internal_load(): - raise FileNotFoundError(error_message) - - data_set = LambdaDataSet(internal_load, None) - with pytest.raises(DataSetError, match=error_message): - data_set.load() - - def test_load_undefined(self): - """Check the error if `LambdaDataSet.__load` is None""" - with pytest.raises(DataSetError, match="Cannot load data set"): - LambdaDataSet(None, None).load() - - def test_load_not_callable(self): - pattern = ( - r"`load` function for LambdaDataSet must be a Callable\. " - r"Object of type `str` provided instead\." - ) - with pytest.raises(DataSetError, match=pattern): - LambdaDataSet("load", None) - - -class TestLambdaDataSetSave: - def test_save_invocation(self, mocked_save, mocked_data_set): - """Test the basic `save` method invocation""" - mocked_data_set.save("foo") - mocked_save.assert_called_once_with("foo") - - def test_save_raises_error(self, mocked_save, mocked_data_set): - """Check the error if saving the LambdaDataSet raises an exception""" - error_message = "Cannot save to an existing file" - mocked_save.side_effect = FileExistsError(error_message) - - pattern = ( - r"Failed while saving data to data set LambdaDataSet\(.+\)\.\n" - + error_message - ) - with pytest.raises(DataSetError, match=pattern): - mocked_data_set.save("data") - mocked_save.assert_called_once_with("data") - - def test_save_undefined(self): - """Check the error if `LambdaDataSet.__save` is None""" - with pytest.raises(DataSetError, match="Cannot save to data set"): - LambdaDataSet(None, None).save(42) - - def test_save_none(self, mocked_save, mocked_data_set): - """Check the error when passing None to `save` call""" - pattern = "Saving `None` to a `DataSet` is not allowed" - with pytest.raises(DataSetError, match=pattern): - mocked_data_set.save(None) - assert mocked_save.called == 0 - - def test_save_not_callable(self): - pattern = ( - r"`save` function for LambdaDataSet must be a Callable\. 
" - r"Object of type `str` provided instead\." - ) - with pytest.raises(DataSetError, match=pattern): - LambdaDataSet(None, "save") - - -class TestLambdaDataSetExists: - def test_exists_invocation(self, mocker): - """Test the basic `exists` method invocation""" - mocked_exists = mocker.Mock(return_value=True) - data_set = LambdaDataSet(None, None, mocked_exists) - result = data_set.exists() - mocked_exists.assert_called_once_with() - assert result is True - - def test_exists_not_implemented(self): - """Check that `exists` method returns False by default""" - data_set = LambdaDataSet(None, None) - assert not data_set.exists() - - def test_exists_raises_error(self, mocker): - """Check the error when `exists` raises an exception""" - mocked_exists = mocker.Mock() - error_message = "File not found" - mocked_exists.side_effect = FileNotFoundError(error_message) - data_set = LambdaDataSet(None, None, mocked_exists) - - with pytest.raises(DataSetError, match=error_message): - data_set.exists() - mocked_exists.assert_called_once_with() - - def test_exists_not_callable(self): - pattern = ( - r"`exists` function for LambdaDataSet must be a Callable\. " - r"Object of type `str` provided instead\." - ) - with pytest.raises(DataSetError, match=pattern): - LambdaDataSet(None, None, "exists") - - -class TestLambdaDataSetRelease: - def test_release_invocation(self, mocker): - """Test the basic `release` method invocation""" - mocked_release = mocker.Mock() - data_set = LambdaDataSet(None, None, None, mocked_release) - data_set.release() - mocked_release.assert_called_once_with() - - def test_release_not_implemented(self): - """Check that `release` does nothing by default""" - data_set = LambdaDataSet(None, None) - data_set.release() - - def test_release_raises_error(self, mocker): - """Check the error when `release` raises an exception""" - mocked_release = mocker.Mock() - error_message = "File not found" - mocked_release.side_effect = FileNotFoundError(error_message) - data_set = LambdaDataSet(None, None, None, mocked_release) - - with pytest.raises(DataSetError, match=error_message): - data_set.release() - mocked_release.assert_called_once_with() - - def test_release_not_callable(self): - pattern = ( - r"`release` function for LambdaDataSet must be a Callable\. " - r"Object of type `str` provided instead\." 
- ) - with pytest.raises(DataSetError, match=pattern): - LambdaDataSet(None, None, None, "release") diff --git a/tests/io/test_lambda_dataset.py b/tests/io/test_lambda_dataset.py new file mode 100644 index 0000000000..fab6e13862 --- /dev/null +++ b/tests/io/test_lambda_dataset.py @@ -0,0 +1,194 @@ +import pytest + +from kedro.io import DatasetError, LambdaDataset + + +@pytest.fixture +def mocked_save(mocker): + return mocker.Mock() + + +@pytest.fixture +def mocked_data_set(mocked_save): + return LambdaDataset(None, mocked_save) + + +def test_data_set_describe(): + """Test `describe` method invocation""" + + def _dummy_load(): + pass # pragma: no cover + + def _dummy_save(): + pass # pragma: no cover + + def _dummy_exists(): + return False # pragma: no cover + + def _dummy_release(): + pass # pragma: no cover + + assert "LambdaDataset(load=)" in str( + LambdaDataset(_dummy_load, None) + ) + assert "LambdaDataset(save=)" in str( + LambdaDataset(None, _dummy_save) + ) + assert "LambdaDataset(exists=)" in str( + LambdaDataset(None, None, _dummy_exists) + ) + assert ( + "LambdaDataset(release=)" + in str(LambdaDataset(None, None, None, _dummy_release)) + ) + + # __init__ keys alphabetically sorted, None values not shown + expected = ( + "LambdaDataset(exists=, " + "load=, " + "save=)" + ) + actual = str(LambdaDataset(_dummy_load, _dummy_save, _dummy_exists, None)) + assert actual == expected + + +class TestLambdaDatasetLoad: + def test_load_invocation(self, mocker): + """Test the basic `load` method invocation""" + mocked_load = mocker.Mock(return_value=42) + data_set = LambdaDataset(mocked_load, None) + result = data_set.load() + + mocked_load.assert_called_once_with() + assert result == 42 + + def test_load_raises_error(self): + """Check the error if loading the LambdaDataset raises an exception""" + error_message = "Internal load exception message" + + def internal_load(): + raise FileNotFoundError(error_message) + + data_set = LambdaDataset(internal_load, None) + with pytest.raises(DatasetError, match=error_message): + data_set.load() + + def test_load_undefined(self): + """Check the error if `LambdaDataset.__load` is None""" + with pytest.raises(DatasetError, match="Cannot load data set"): + LambdaDataset(None, None).load() + + def test_load_not_callable(self): + pattern = ( + r"'load' function for LambdaDataset must be a Callable\. " + r"Object of type 'str' provided instead\." 
+ ) + with pytest.raises(DatasetError, match=pattern): + LambdaDataset("load", None) + + +class TestLambdaDatasetSave: + def test_save_invocation(self, mocked_save, mocked_data_set): + """Test the basic `save` method invocation""" + mocked_data_set.save("foo") + mocked_save.assert_called_once_with("foo") + + def test_save_raises_error(self, mocked_save, mocked_data_set): + """Check the error if saving the LambdaDataset raises an exception""" + error_message = "Cannot save to an existing file" + mocked_save.side_effect = FileExistsError(error_message) + + pattern = ( + r"Failed while saving data to data set LambdaDataset\(.+\)\.\n" + + error_message + ) + with pytest.raises(DatasetError, match=pattern): + mocked_data_set.save("data") + mocked_save.assert_called_once_with("data") + + def test_save_undefined(self): + """Check the error if `LambdaDataset.__save` is None""" + with pytest.raises(DatasetError, match="Cannot save to data set"): + LambdaDataset(None, None).save(42) + + def test_save_none(self, mocked_save, mocked_data_set): + """Check the error when passing None to `save` call""" + pattern = "Saving 'None' to a 'Dataset' is not allowed" + with pytest.raises(DatasetError, match=pattern): + mocked_data_set.save(None) + assert mocked_save.called == 0 + + def test_save_not_callable(self): + pattern = ( + r"'save' function for LambdaDataset must be a Callable\. " + r"Object of type 'str' provided instead\." + ) + with pytest.raises(DatasetError, match=pattern): + LambdaDataset(None, "save") + + +class TestLambdaDatasetExists: + def test_exists_invocation(self, mocker): + """Test the basic `exists` method invocation""" + mocked_exists = mocker.Mock(return_value=True) + data_set = LambdaDataset(None, None, mocked_exists) + result = data_set.exists() + mocked_exists.assert_called_once_with() + assert result is True + + def test_exists_not_implemented(self): + """Check that `exists` method returns False by default""" + data_set = LambdaDataset(None, None) + assert not data_set.exists() + + def test_exists_raises_error(self, mocker): + """Check the error when `exists` raises an exception""" + mocked_exists = mocker.Mock() + error_message = "File not found" + mocked_exists.side_effect = FileNotFoundError(error_message) + data_set = LambdaDataset(None, None, mocked_exists) + + with pytest.raises(DatasetError, match=error_message): + data_set.exists() + mocked_exists.assert_called_once_with() + + def test_exists_not_callable(self): + pattern = ( + r"'exists' function for LambdaDataset must be a Callable\. " + r"Object of type 'str' provided instead\." 
+ ) + with pytest.raises(DatasetError, match=pattern): + LambdaDataset(None, None, "exists") + + +class TestLambdaDatasetRelease: + def test_release_invocation(self, mocker): + """Test the basic `release` method invocation""" + mocked_release = mocker.Mock() + data_set = LambdaDataset(None, None, None, mocked_release) + data_set.release() + mocked_release.assert_called_once_with() + + def test_release_not_implemented(self): + """Check that `release` does nothing by default""" + data_set = LambdaDataset(None, None) + data_set.release() + + def test_release_raises_error(self, mocker): + """Check the error when `release` raises an exception""" + mocked_release = mocker.Mock() + error_message = "File not found" + mocked_release.side_effect = FileNotFoundError(error_message) + data_set = LambdaDataset(None, None, None, mocked_release) + + with pytest.raises(DatasetError, match=error_message): + data_set.release() + mocked_release.assert_called_once_with() + + def test_release_not_callable(self): + pattern = ( + r"'release' function for LambdaDataset must be a Callable\. " + r"Object of type 'str' provided instead\." + ) + with pytest.raises(DatasetError, match=pattern): + LambdaDataset(None, None, None, "release") diff --git a/tests/io/test_memory_data_set.py b/tests/io/test_memory_dataset.py similarity index 57% rename from tests/io/test_memory_data_set.py rename to tests/io/test_memory_dataset.py index b799978c5d..81d1c3fc38 100644 --- a/tests/io/test_memory_data_set.py +++ b/tests/io/test_memory_dataset.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
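The renamed MemoryDataset tests below check that loads and saves copy the data, via _infer_copy_mode and _copy_with_mode. A rough sketch of that behaviour and the copy_mode override, assuming pandas is available:

import pandas as pd

from kedro.io import MemoryDataset

df = pd.DataFrame({"a": [1, 2]})

ds = MemoryDataset(data=df)      # copy mode is inferred: "copy" for DataFrames and numpy arrays
assert ds.load() is not df       # each load hands back a copy, so the stored object stays untouched

assign_ds = MemoryDataset(data=df, copy_mode="assign")
assert assign_ds.load() is df    # "assign" skips copying and returns the stored object itself

The valid copy modes match the error message tested above: deepcopy, copy and assign.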
- import re # pylint: disable=unused-argument @@ -33,8 +5,8 @@ import pandas as pd import pytest -from kedro.io import DataSetError, MemoryDataSet -from kedro.io.memory_data_set import _copy_with_mode, _infer_copy_mode +from kedro.io import DatasetError, MemoryDataset +from kedro.io.memory_dataset import _copy_with_mode, _infer_copy_mode def _update_data(data, idx, jdx, value): @@ -76,35 +48,35 @@ def new_data(): @pytest.fixture -def memory_data_set(input_data): - return MemoryDataSet(data=input_data) +def memory_dataset(input_data): + return MemoryDataset(data=input_data) @pytest.fixture def mocked_infer_mode(mocker): - return mocker.patch("kedro.io.memory_data_set._infer_copy_mode") + return mocker.patch("kedro.io.memory_dataset._infer_copy_mode") @pytest.fixture def mocked_copy_with_mode(mocker): - return mocker.patch("kedro.io.memory_data_set._copy_with_mode") + return mocker.patch("kedro.io.memory_dataset._copy_with_mode") -class TestMemoryDataSet: - def test_load(self, memory_data_set, input_data): +class TestMemoryDataset: + def test_load(self, memory_dataset, input_data): """Test basic load""" - loaded_data = memory_data_set.load() + loaded_data = memory_dataset.load() assert _check_equals(loaded_data, input_data) def test_load_none(self): - loaded_data = MemoryDataSet(None).load() + loaded_data = MemoryDataset(None).load() assert loaded_data is None def test_load_infer_mode( - self, memory_data_set, input_data, mocked_infer_mode, mocked_copy_with_mode + self, memory_dataset, input_data, mocked_infer_mode, mocked_copy_with_mode ): """Test load calls infer_mode and copy_mode_with""" - memory_data_set.load() + memory_dataset.load() assert mocked_infer_mode.call_count == 1 assert mocked_copy_with_mode.call_count == 1 @@ -115,18 +87,18 @@ def test_load_infer_mode( assert mocked_copy_with_mode.call_args[0] assert _check_equals(mocked_copy_with_mode.call_args[0][0], input_data) - def test_save(self, memory_data_set, input_data, new_data): - """Test overriding the data set""" - memory_data_set.save(data=new_data) - reloaded = memory_data_set.load() + def test_save(self, memory_dataset, input_data, new_data): + """Test overriding the dataset""" + memory_dataset.save(data=new_data) + reloaded = memory_dataset.load() assert not _check_equals(reloaded, input_data) assert _check_equals(reloaded, new_data) def test_save_infer_mode( - self, memory_data_set, new_data, mocked_infer_mode, mocked_copy_with_mode + self, memory_dataset, new_data, mocked_infer_mode, mocked_copy_with_mode ): """Test save calls infer_mode and copy_mode_with""" - memory_data_set.save(data=new_data) + memory_dataset.save(data=new_data) assert mocked_infer_mode.call_count == 1 assert mocked_copy_with_mode.call_count == 1 @@ -137,65 +109,65 @@ def test_save_infer_mode( assert mocked_copy_with_mode.call_args[0] assert _check_equals(mocked_copy_with_mode.call_args[0][0], new_data) - def test_load_modify_original_data(self, memory_data_set, input_data): - """Check that the data set object is not updated when the original + def test_load_modify_original_data(self, memory_dataset, input_data): + """Check that the dataset object is not updated when the original object is changed.""" input_data = _update_data(input_data, 1, 1, -5) - assert not _check_equals(memory_data_set.load(), input_data) + assert not _check_equals(memory_dataset.load(), input_data) - def test_save_modify_original_data(self, memory_data_set, new_data): - """Check that the data set object is not updated when the original + def 
test_save_modify_original_data(self, memory_dataset, new_data): + """Check that the dataset object is not updated when the original object is changed.""" - memory_data_set.save(new_data) + memory_dataset.save(new_data) new_data = _update_data(new_data, 1, 1, "new value") - assert not _check_equals(memory_data_set.load(), new_data) + assert not _check_equals(memory_dataset.load(), new_data) @pytest.mark.parametrize( "input_data", ["dummy_dataframe", "dummy_numpy_array"], indirect=True ) - def test_load_returns_new_object(self, memory_data_set, input_data): + def test_load_returns_new_object(self, memory_dataset, input_data): """Test that consecutive loads point to different objects in case of a pandas DataFrame and numpy array""" - loaded_data = memory_data_set.load() - reloaded_data = memory_data_set.load() + loaded_data = memory_dataset.load() + reloaded_data = memory_dataset.load() assert _check_equals(loaded_data, input_data) assert _check_equals(reloaded_data, input_data) assert loaded_data is not reloaded_data def test_create_without_data(self): """Test instantiation without data""" - assert MemoryDataSet() is not None + assert MemoryDataset() is not None def test_loading_none(self): - """Check the error when attempting to load the data set that doesn't + """Check the error when attempting to load the dataset that doesn't contain any data""" - pattern = r"Data for MemoryDataSet has not been saved yet\." - with pytest.raises(DataSetError, match=pattern): - MemoryDataSet().load() + pattern = r"Data for MemoryDataset has not been saved yet\." + with pytest.raises(DatasetError, match=pattern): + MemoryDataset().load() def test_saving_none(self): - """Check the error when attempting to save the data set without + """Check the error when attempting to save the dataset without providing the data""" - pattern = r"Saving `None` to a `DataSet` is not allowed" - with pytest.raises(DataSetError, match=pattern): - MemoryDataSet().save(None) + pattern = r"Saving 'None' to a 'Dataset' is not allowed" + with pytest.raises(DatasetError, match=pattern): + MemoryDataset().save(None) @pytest.mark.parametrize( "input_data,expected", [ - ("dummy_dataframe", "MemoryDataSet(data=)"), - ("dummy_numpy_array", "MemoryDataSet(data=)"), + ("dummy_dataframe", "MemoryDataset(data=)"), + ("dummy_numpy_array", "MemoryDataset(data=)"), ], indirect=["input_data"], ) - def test_str_representation(self, memory_data_set, input_data, expected): - """Test string representation of the data set""" - assert expected in str(memory_data_set) + def test_str_representation(self, memory_dataset, input_data, expected): + """Test string representation of the dataset""" + assert expected in str(memory_dataset) def test_exists(self, new_data): """Test `exists` method invocation""" - data_set = MemoryDataSet() + data_set = MemoryDataset() assert not data_set.exists() data_set.save(new_data) @@ -230,7 +202,7 @@ def test_copy_mode_deepcopy(data): def test_copy_mode_invalid_string(): """Test _copy_with_mode with invalid string""" pattern = "Invalid copy mode: alice. Possible values are: deepcopy, copy, assign." 
- with pytest.raises(DataSetError, match=re.escape(pattern)): + with pytest.raises(DatasetError, match=re.escape(pattern)): _copy_with_mode(None, copy_mode="alice") diff --git a/tests/io/test_partitioned_dataset.py b/tests/io/test_partitioned_dataset.py index fce6d46c77..b2c3c56478 100644 --- a/tests/io/test_partitioned_dataset.py +++ b/tests/io/test_partitioned_dataset.py @@ -1,31 +1,5 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
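The PartitionedDataset tests below rely on its lazy-loading contract: load() returns a dict of partition ids mapped to load callables, and save() writes one partition per dict key. A short sketch of that round trip, with an illustrative local folder:

import pandas as pd

from kedro.io import PartitionedDataset

pds = PartitionedDataset("data/partitions", "pandas.CSVDataSet", filename_suffix=".csv")
pds.save({"p01/data": pd.DataFrame({"foo": [42]})})  # written as data/partitions/p01/data.csv

partitions = pds.load()          # {partition_id: load_callable}; nothing is read yet
for partition_id, load_partition in partitions.items():
    df = load_partition()        # a partition is only read when its callable is invoked
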
import logging +import os import re from pathlib import Path @@ -37,9 +11,9 @@ from pandas.util.testing import assert_frame_equal from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet -from kedro.io import DataSetError, PartitionedDataSet +from kedro.io import DatasetError, PartitionedDataset from kedro.io.data_catalog import CREDENTIALS_KEY -from kedro.io.partitioned_data_set import KEY_PROPAGATION_WARNING +from kedro.io.partitioned_dataset import KEY_PROPAGATION_WARNING @pytest.fixture @@ -77,11 +51,11 @@ def filepath_csvs(tmp_path): ] -class FakeDataSet: # pylint: disable=too-few-public-methods +class FakeDataset: # pylint: disable=too-few-public-methods pass -class TestPartitionedDataSetLocal: +class TestPartitionedDatasetLocal: @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) @pytest.mark.parametrize( "suffix,expected_num_parts", [("", 5), (".csv", 3), ("p4", 1)] @@ -89,7 +63,7 @@ class TestPartitionedDataSetLocal: def test_load( self, dataset, local_csvs, partitioned_data_pandas, suffix, expected_num_parts ): - pds = PartitionedDataSet(str(local_csvs), dataset, filename_suffix=suffix) + pds = PartitionedDataset(str(local_csvs), dataset, filename_suffix=suffix) loaded_partitions = pds.load() assert len(loaded_partitions.keys()) == expected_num_parts @@ -102,7 +76,7 @@ def test_load( @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) @pytest.mark.parametrize("suffix", ["", ".csv"]) def test_save(self, dataset, local_csvs, suffix): - pds = PartitionedDataSet(str(local_csvs), dataset, filename_suffix=suffix) + pds = PartitionedDataset(str(local_csvs), dataset, filename_suffix=suffix) original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) part_id = "new/data" pds.save({part_id: original_data}) @@ -116,7 +90,7 @@ def test_save(self, dataset, local_csvs, suffix): @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) @pytest.mark.parametrize("suffix", ["", ".csv"]) def test_lazy_save(self, dataset, local_csvs, suffix): - pds = PartitionedDataSet(str(local_csvs), dataset, filename_suffix=suffix) + pds = PartitionedDataset(str(local_csvs), dataset, filename_suffix=suffix) def original_data(): return pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) @@ -132,7 +106,7 @@ def original_data(): def test_save_invalidates_cache(self, local_csvs, mocker): """Test that save calls invalidate partition cache""" - pds = PartitionedDataSet(str(local_csvs), "pandas.CSVDataSet") + pds = PartitionedDataset(str(local_csvs), "pandas.CSVDataSet") mocked_fs_invalidate = mocker.patch.object(pds._filesystem, "invalidate_cache") first_load = pds.load() assert pds._partition_cache.currsize == 1 @@ -153,11 +127,24 @@ def test_save_invalidates_cache(self, local_csvs, mocker): assert new_partition not in first_load assert new_partition in second_load + @pytest.mark.parametrize("overwrite,expected_num_parts", [(False, 6), (True, 1)]) + def test_overwrite(self, local_csvs, overwrite, expected_num_parts): + pds = PartitionedDataset( + str(local_csvs), "pandas.CSVDataSet", overwrite=overwrite + ) + original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) + part_id = "new/data" + pds.save({part_id: original_data}) + loaded_partitions = pds.load() + + assert part_id in loaded_partitions + assert len(loaded_partitions.keys()) == expected_num_parts + def test_release_instance_cache(self, local_csvs): """Test that cache invalidation does not affect other instances""" - ds_a = PartitionedDataSet(str(local_csvs), "pandas.CSVDataSet") + ds_a = 
PartitionedDataset(str(local_csvs), "pandas.CSVDataSet") ds_a.load() - ds_b = PartitionedDataSet(str(local_csvs), "pandas.CSVDataSet") + ds_b = PartitionedDataset(str(local_csvs), "pandas.CSVDataSet") ds_b.load() assert ds_a._partition_cache.currsize == 1 @@ -171,17 +158,17 @@ def test_release_instance_cache(self, local_csvs): @pytest.mark.parametrize("dataset", ["pandas.CSVDataSet", "pandas.ParquetDataSet"]) def test_exists(self, local_csvs, dataset): - assert PartitionedDataSet(str(local_csvs), dataset).exists() + assert PartitionedDataset(str(local_csvs), dataset).exists() empty_folder = local_csvs / "empty" / "folder" - assert not PartitionedDataSet(str(empty_folder), dataset).exists() + assert not PartitionedDataset(str(empty_folder), dataset).exists() empty_folder.mkdir(parents=True) - assert not PartitionedDataSet(str(empty_folder), dataset).exists() + assert not PartitionedDataset(str(empty_folder), dataset).exists() @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) def test_release(self, dataset, local_csvs): partition_to_remove = "p2.csv" - pds = PartitionedDataSet(str(local_csvs), dataset) + pds = PartitionedDataset(str(local_csvs), dataset) initial_load = pds.load() assert partition_to_remove in initial_load @@ -196,7 +183,7 @@ def test_release(self, dataset, local_csvs): @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) def test_describe(self, dataset): path = str(Path.cwd()) - pds = PartitionedDataSet(path, dataset) + pds = PartitionedDataset(path, dataset) assert f"path={path}" in str(pds) assert "dataset_type=CSVDataSet" in str(pds) @@ -210,7 +197,7 @@ def test_load_args(self, mocker): path = str(Path.cwd()) load_args = {"maxdepth": 42, "withdirs": True} - pds = PartitionedDataSet(path, "pandas.CSVDataSet", load_args=load_args) + pds = PartitionedDataset(path, "pandas.CSVDataSet", load_args=load_args) mocker.patch.object(pds, "_path_to_partition", return_value=fake_partition_name) assert pds.load().keys() == {fake_partition_name} @@ -225,7 +212,7 @@ def test_credentials( ): mocked_filesystem = mocker.patch("fsspec.filesystem") path = str(Path.cwd()) - pds = PartitionedDataSet(path, "pandas.CSVDataSet", credentials=credentials) + pds = PartitionedDataset(path, "pandas.CSVDataSet", credentials=credentials) assert mocked_filesystem.call_count == 2 mocked_filesystem.assert_called_with("file", **expected_pds_creds) @@ -251,7 +238,7 @@ def test_fs_args(self, mocker): mocked_filesystem = mocker.patch("fsspec.filesystem") path = str(Path.cwd()) - pds = PartitionedDataSet(path, "pandas.CSVDataSet", fs_args=fs_args) + pds = PartitionedDataset(path, "pandas.CSVDataSet", fs_args=fs_args) assert mocked_filesystem.call_count == 2 mocked_filesystem.assert_called_with("file", **fs_args) @@ -259,12 +246,12 @@ def test_fs_args(self, mocker): @pytest.mark.parametrize("dataset", ["pandas.ParquetDataSet", ParquetDataSet]) def test_invalid_dataset(self, dataset, local_csvs): - pds = PartitionedDataSet(str(local_csvs), dataset) + pds = PartitionedDataset(str(local_csvs), dataset) loaded_partitions = pds.load() for partition, df_loader in loaded_partitions.items(): pattern = r"Failed while loading data from data set ParquetDataSet(.*)" - with pytest.raises(DataSetError, match=pattern) as exc_info: + with pytest.raises(DatasetError, match=pattern) as exc_info: df_loader() error_message = str(exc_info.value) assert ( @@ -276,22 +263,22 @@ def test_invalid_dataset(self, dataset, local_csvs): @pytest.mark.parametrize( "dataset_config,error_pattern", [ - ("UndefinedDatasetType", 
"Class `UndefinedDatasetType` not found"), + ("UndefinedDatasetType", "Class 'UndefinedDatasetType' not found"), ( "missing.module.UndefinedDatasetType", - r"Class `missing\.module\.UndefinedDatasetType` not found", + r"Class 'missing\.module\.UndefinedDatasetType' not found", ), ( - FakeDataSet, - r"DataSet type `tests\.io\.test_partitioned_dataset\.FakeDataSet` " - r"is invalid\: all data set types must extend `AbstractDataSet`", + FakeDataset, + r"Dataset type 'tests\.io\.test_partitioned_dataset\.FakeDataset' " + r"is invalid\: all data set types must extend 'AbstractDataSet'", ), - ({}, "`type` is missing from DataSet catalog configuration"), + ({}, "'type' is missing from dataset catalog configuration"), ], ) def test_invalid_dataset_config(self, dataset_config, error_pattern): - with pytest.raises(DataSetError, match=error_pattern): - PartitionedDataSet(str(Path.cwd()), dataset_config) + with pytest.raises(DatasetError, match=error_pattern): + PartitionedDataset(str(Path.cwd()), dataset_config) @pytest.mark.parametrize( "dataset_config", @@ -360,10 +347,10 @@ def test_malformed_versioned_path(self, tmp_path): pds.load() def test_no_partitions(self, tmpdir): - pds = PartitionedDataSet(str(tmpdir), "pandas.CSVDataSet") + pds = PartitionedDataset(str(tmpdir), "pandas.CSVDataSet") - pattern = re.escape(f"No partitions found in `{tmpdir}`") - with pytest.raises(DataSetError, match=pattern): + pattern = re.escape(f"No partitions found in '{tmpdir}'") + with pytest.raises(DatasetError, match=pattern): pds.load() @pytest.mark.parametrize( @@ -388,16 +375,16 @@ def test_no_partitions(self, tmpdir): ) def test_filepath_arg_warning(self, pds_config, filepath_arg): pattern = ( - f"`{filepath_arg}` key must not be specified in the dataset definition as it " + f"'{filepath_arg}' key must not be specified in the dataset definition as it " f"will be overwritten by partition path" ) with pytest.warns(UserWarning, match=re.escape(pattern)): - PartitionedDataSet(**pds_config) + PartitionedDataset(**pds_config) def test_credentials_log_warning(self, caplog): """Check that the warning is logged if the dataset credentials will overwrite the top-level ones""" - pds = PartitionedDataSet( + pds = PartitionedDataset( path=str(Path.cwd()), dataset={"type": CSVDataSet, "credentials": {"secret": "dataset"}}, credentials={"secret": "global"}, @@ -412,7 +399,7 @@ def test_credentials_log_warning(self, caplog): def test_fs_args_log_warning(self, caplog): """Check that the warning is logged if the dataset filesystem arguments will overwrite the top-level ones""" - pds = PartitionedDataSet( + pds = PartitionedDataset( path=str(Path.cwd()), dataset={"type": CSVDataSet, "fs_args": {"args": "dataset"}}, fs_args={"args": "dataset"}, @@ -465,7 +452,7 @@ def test_fs_args_log_warning(self, caplog): ) def test_dataset_creds(self, pds_config, expected_ds_creds, global_creds): """Check that global credentials do not interfere dataset credentials.""" - pds = PartitionedDataSet(path=str(Path.cwd()), **pds_config) + pds = PartitionedDataset(path=str(Path.cwd()), **pds_config) assert pds._dataset_config["credentials"] == expected_ds_creds assert pds._credentials == global_creds @@ -486,6 +473,7 @@ def mocked_s3_bucket(): with mock_s3(): conn = boto3.client( "s3", + region_name="us-east-1", aws_access_key_id="fake_access_key", aws_secret_access_key="fake_secret_key", ) @@ -505,15 +493,13 @@ def mocked_csvs_in_s3(mocked_s3_bucket, partitioned_data_pandas): return f"s3://{BUCKET_NAME}/{prefix}" -class TestPartitionedDataSetS3: - 
@pytest.fixture(autouse=True) - def fake_aws_creds(self, monkeypatch): - monkeypatch.setenv("AWS_ACCESS_KEY_ID", "FAKE_ACCESS_KEY") - monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "FAKE_SECRET_KEY") +class TestPartitionedDatasetS3: + os.environ["AWS_ACCESS_KEY_ID"] = "FAKE_ACCESS_KEY" + os.environ["AWS_SECRET_ACCESS_KEY"] = "FAKE_SECRET_KEY" @pytest.mark.parametrize("dataset", S3_DATASET_DEFINITION) def test_load(self, dataset, mocked_csvs_in_s3, partitioned_data_pandas): - pds = PartitionedDataSet(mocked_csvs_in_s3, dataset) + pds = PartitionedDataset(mocked_csvs_in_s3, dataset) loaded_partitions = pds.load() assert loaded_partitions.keys() == partitioned_data_pandas.keys() @@ -526,7 +512,7 @@ def test_load_s3a(self, mocked_csvs_in_s3, partitioned_data_pandas, mocker): s3a_path = f"s3a://{path}" # any type is fine as long as it passes isinstance check # since _dataset_type is mocked later anyways - pds = PartitionedDataSet(s3a_path, "pandas.CSVDataSet") + pds = PartitionedDataset(s3a_path, "pandas.CSVDataSet") assert pds._protocol == "s3a" mocked_ds = mocker.patch.object(pds, "_dataset_type") @@ -541,9 +527,22 @@ def test_load_s3a(self, mocked_csvs_in_s3, partitioned_data_pandas, mocker): ] mocked_ds.assert_has_calls(expected, any_order=True) + @pytest.mark.parametrize( + "partition_path", ["s3_bucket/dummy.csv", "fake_bucket/dummy.csv"] + ) + def test_join_protocol_with_bucket_name_startswith_protocol( + self, mocked_csvs_in_s3, partition_path + ): + """Make sure protocol is joined correctly for the edge case when + bucket name starts with the protocol name, i.e. `s3://s3_bucket/dummy_.txt` + """ + + pds = PartitionedDataset(mocked_csvs_in_s3, "pandas.CSVDataSet") + assert pds._join_protocol(partition_path) == f"s3://{partition_path}" + @pytest.mark.parametrize("dataset", S3_DATASET_DEFINITION) def test_save(self, dataset, mocked_csvs_in_s3): - pds = PartitionedDataSet(mocked_csvs_in_s3, dataset) + pds = PartitionedDataset(mocked_csvs_in_s3, dataset) original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) part_id = "new/data.csv" pds.save({part_id: original_data}) @@ -562,7 +561,7 @@ def test_save_s3a(self, mocked_csvs_in_s3, mocker): s3a_path = f"s3a://{path}" # any type is fine as long as it passes isinstance check # since _dataset_type is mocked later anyways - pds = PartitionedDataSet(s3a_path, "pandas.CSVDataSet", filename_suffix=".csv") + pds = PartitionedDataset(s3a_path, "pandas.CSVDataSet", filename_suffix=".csv") assert pds._protocol == "s3a" mocked_ds = mocker.patch.object(pds, "_dataset_type") @@ -576,18 +575,18 @@ def test_save_s3a(self, mocked_csvs_in_s3, mocker): @pytest.mark.parametrize("dataset", ["pandas.CSVDataSet", "pandas.HDFDataSet"]) def test_exists(self, dataset, mocked_csvs_in_s3): - assert PartitionedDataSet(mocked_csvs_in_s3, dataset).exists() + assert PartitionedDataset(mocked_csvs_in_s3, dataset).exists() empty_folder = "/".join([mocked_csvs_in_s3, "empty", "folder"]) - assert not PartitionedDataSet(empty_folder, dataset).exists() + assert not PartitionedDataset(empty_folder, dataset).exists() s3fs.S3FileSystem().mkdir(empty_folder) - assert not PartitionedDataSet(empty_folder, dataset).exists() + assert not PartitionedDataset(empty_folder, dataset).exists() @pytest.mark.parametrize("dataset", S3_DATASET_DEFINITION) def test_release(self, dataset, mocked_csvs_in_s3): partition_to_remove = "p2.csv" - pds = PartitionedDataSet(mocked_csvs_in_s3, dataset) + pds = PartitionedDataset(mocked_csvs_in_s3, dataset) initial_load = pds.load() assert 
partition_to_remove in initial_load @@ -603,7 +602,7 @@ def test_release(self, dataset, mocked_csvs_in_s3): @pytest.mark.parametrize("dataset", S3_DATASET_DEFINITION) def test_describe(self, dataset): path = f"s3://{BUCKET_NAME}/foo/bar" - pds = PartitionedDataSet(path, dataset) + pds = PartitionedDataset(path, dataset) assert f"path={path}" in str(pds) assert "dataset_type=CSVDataSet" in str(pds) diff --git a/tests/io/test_transformers.py b/tests/io/test_transformers.py deleted file mode 100644 index 5c2095d46f..0000000000 --- a/tests/io/test_transformers.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
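tests/io/test_transformers.py is deleted along with the transformer API; the deprecation warning quoted near the end of the removed file points users at Dataset Hooks instead. A hedged sketch of that replacement, using the dataset hook specs from kedro.framework.hooks (the class name and messages here are illustrative, not part of this diff):

from typing import Any

from kedro.framework.hooks import hook_impl


class LoggingHooks:
    """Illustrative stand-in for a transformer: intercept dataset I/O via hooks."""

    @hook_impl
    def after_dataset_loaded(self, dataset_name: str, data: Any) -> None:
        print(f"Loaded '{dataset_name}' ({type(data).__name__})")

    @hook_impl
    def before_dataset_saved(self, dataset_name: str, data: Any) -> None:
        print(f"Saving '{dataset_name}'")

Such a class is typically registered through the HOOKS tuple in the project's settings.py.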
-import re -from typing import Any, Callable, Dict - -import pytest - -from kedro.io import AbstractDataSet, DataCatalog, DataSetNotFoundError -from kedro.io.transformers import AbstractTransformer - - -class FakeDataSet(AbstractDataSet): - def __init__(self, data): - self.log = [] - self.data = data - - def _load(self) -> Any: - self.log.append(("load", self.data)) - return self.data - - def _save(self, data: Any) -> None: - self.log.append(("save", data)) - self.data = data - - def _describe(self) -> Dict[str, Any]: - return {"data": self.data} - - -class NoopTransformer(AbstractTransformer): - pass - - -class FakeTransformer(AbstractTransformer): - def __init__(self): - self.log = [] - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - res = load() - self.log.append(("load", res)) - return res + 1 - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - self.log.append(("save", data)) - save(data + 1) - - -@pytest.fixture -def fake_data_set(): - return FakeDataSet(123) - - -@pytest.fixture -def fake_transformer(): - return FakeTransformer() - - -@pytest.fixture -def catalog(fake_data_set): - return DataCatalog({"test": fake_data_set}) - - -class TestTransformers: - def test_noop(self, fake_data_set, catalog): - catalog.add_transformer(NoopTransformer()) - - catalog.save("test", 42) - assert catalog.load("test") == 42 - assert fake_data_set.log == [("save", 42), ("load", 42)] - - def test_basic(self, fake_data_set, catalog, fake_transformer): - catalog.add_transformer(fake_transformer) - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_copy(self, fake_data_set, catalog, fake_transformer): - catalog.add_transformer(fake_transformer) - catalog = catalog.shallow_copy() - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_specific(self, fake_data_set, catalog, fake_transformer): - catalog.add_transformer(fake_transformer, "test") - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_specific_list(self, fake_data_set, catalog, fake_transformer): - catalog.add_transformer(fake_transformer, ["test"]) - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_not_found_error(self, fake_transformer): - catalog = DataCatalog() - - with pytest.raises(DataSetNotFoundError): - catalog.add_transformer(fake_transformer, "test") - - def test_not_found_error_in_constructor(self): - with pytest.raises(DataSetNotFoundError): - DataCatalog(transformers={"test": []}) - - def test_all_before_adding(self, fake_data_set, fake_transformer): - catalog = DataCatalog() - catalog.add_transformer(fake_transformer) - catalog.add("test", fake_data_set) - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_all_before_copy_and_add(self, fake_data_set, fake_transformer): - catalog = DataCatalog() - catalog.add_transformer(fake_transformer) - catalog = 
catalog.shallow_copy() - catalog.add("test", fake_data_set) - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_add_bad_transformer(self, catalog): - with pytest.raises(TypeError, match="not an instance of AbstractTransformer"): - catalog.add_transformer(object) - - def test_deprecation_warning(self, catalog, fake_transformer): - pattern = ( - "The transformer API will be deprecated in Kedro 0.18.0." - "Please use Dataset Hooks to customise the load and save methods." - "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/04_hooks.html" - ) - with pytest.warns(DeprecationWarning, match=re.escape(pattern)): - catalog.add_transformer(fake_transformer) diff --git a/tests/extras/transformers/__init__.py b/tests/ipython/__init__.py similarity index 100% rename from tests/extras/transformers/__init__.py rename to tests/ipython/__init__.py diff --git a/tests/ipython/test_ipython.py b/tests/ipython/test_ipython.py new file mode 100644 index 0000000000..0eaa25e33d --- /dev/null +++ b/tests/ipython/test_ipython.py @@ -0,0 +1,294 @@ +# pylint: disable=import-outside-toplevel +from pathlib import Path + +import pytest +from IPython.core.error import UsageError +from IPython.testing.globalipapp import get_ipython + +from kedro.framework.project import pipelines +from kedro.framework.startup import ProjectMetadata +from kedro.ipython import _resolve_project_path, load_ipython_extension, reload_kedro +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline + +PACKAGE_NAME = "fake_package_name" +PROJECT_NAME = "fake_project_name" +PROJECT_VERSION = "0.1" + + +@pytest.fixture(autouse=True) +def cleanup_pipeline(): + yield + from kedro.framework.project import pipelines # pylint: disable=reimported + + pipelines.configure() + + +@pytest.fixture(scope="module", autouse=True) # get_ipython() twice will result in None +def ipython(): + ipython = get_ipython() + load_ipython_extension(ipython) + return ipython + + +@pytest.fixture(autouse=True) +def fake_metadata(tmp_path): + metadata = ProjectMetadata( + source_dir=tmp_path / "src", # default + config_file=tmp_path / "pyproject.toml", + package_name=PACKAGE_NAME, + project_name=PROJECT_NAME, + project_version=PROJECT_VERSION, + kedro_init_version=PROJECT_VERSION, + project_path=tmp_path, + ) + return metadata + + +@pytest.fixture(autouse=True) +def mock_kedro_project(mocker, fake_metadata): + mocker.patch("kedro.ipython.bootstrap_project", return_value=fake_metadata) + mocker.patch("kedro.ipython.configure_project") + mocker.patch("kedro.ipython.KedroSession.create") + + +class TestLoadKedroObjects: + def test_ipython_load_entry_points( + self, + mocker, + fake_metadata, + caplog, + ): + mock_line_magic = mocker.MagicMock() + mock_line_magic_name = "abc" + mock_line_magic.__name__ = mock_line_magic_name + mock_line_magic.__qualname__ = mock_line_magic_name # Required by IPython + + mocker.patch("kedro.ipython.load_entry_points", return_value=[mock_line_magic]) + expected_message = f"Registered line magic '{mock_line_magic_name}'" + + reload_kedro(fake_metadata.project_path) + + log_messages = [record.getMessage() for record in caplog.records] + assert expected_message in log_messages + + def test_ipython_lazy_load_pipeline( + self, + mocker, + ): + pipelines.configure("dummy_pipeline") # Setup the pipelines + + my_pipelines = {"ds": modular_pipeline([])} + + 
def my_register_pipeline(): + return my_pipelines + + mocker.patch.object( + pipelines, + "_get_pipelines_registry_callable", + return_value=my_register_pipeline, + ) + reload_kedro() + + assert pipelines._content == {} # Check if it is lazy loaded + pipelines._load_data() # Trigger data load + assert pipelines._content == my_pipelines + + def test_ipython_load_objects( + self, + mocker, + ipython, + ): + mock_session_create = mocker.patch("kedro.ipython.KedroSession.create") + pipelines.configure("dummy_pipeline") # Setup the pipelines + + my_pipelines = {"ds": modular_pipeline([])} + + def my_register_pipeline(): + return my_pipelines + + mocker.patch.object( + pipelines, + "_get_pipelines_registry_callable", + return_value=my_register_pipeline, + ) + ipython_spy = mocker.spy(ipython, "push") + + reload_kedro() + + mock_session_create.assert_called_once_with( + PACKAGE_NAME, None, env=None, extra_params=None + ) + _, kwargs = ipython_spy.call_args_list[0] + variables = kwargs["variables"] + + assert variables["context"] == mock_session_create().load_context() + assert variables["catalog"] == mock_session_create().load_context().catalog + assert variables["session"] == mock_session_create() + assert variables["pipelines"] == my_pipelines + + def test_ipython_load_objects_with_args(self, mocker, fake_metadata, ipython): + mock_session_create = mocker.patch("kedro.ipython.KedroSession.create") + pipelines.configure("dummy_pipeline") # Setup the pipelines + + my_pipelines = {"ds": modular_pipeline([])} + + def my_register_pipeline(): + return my_pipelines + + mocker.patch.object( + pipelines, + "_get_pipelines_registry_callable", + return_value=my_register_pipeline, + ) + ipython_spy = mocker.spy(ipython, "push") + dummy_env = "env" + dummy_dict = {"key": "value"} + + reload_kedro(fake_metadata.project_path, "env", {"key": "value"}) + + mock_session_create.assert_called_once_with( + PACKAGE_NAME, + fake_metadata.project_path, + env=dummy_env, + extra_params=dummy_dict, + ) + _, kwargs = ipython_spy.call_args_list[0] + variables = kwargs["variables"] + + assert variables["context"] == mock_session_create().load_context() + assert variables["catalog"] == mock_session_create().load_context().catalog + assert variables["session"] == mock_session_create() + assert variables["pipelines"] == my_pipelines + + +class TestLoadIPythonExtension: + def test_load_ipython_extension(self, ipython): + ipython.magic("load_ext kedro.ipython") + + def test_load_extension_missing_dependency(self, mocker): + mocker.patch("kedro.ipython.reload_kedro", side_effect=ImportError) + mocker.patch( + "kedro.ipython._find_kedro_project", + return_value=mocker.Mock(), + ) + mocker.patch("IPython.core.magic.register_line_magic") + mocker.patch("IPython.core.magic_arguments.magic_arguments") + mocker.patch("IPython.core.magic_arguments.argument") + mock_ipython = mocker.patch("IPython.get_ipython") + + with pytest.raises(ImportError): + load_ipython_extension(mocker.Mock()) + + assert not mock_ipython().called + assert not mock_ipython().push.called + + def test_load_extension_not_in_kedro_project(self, mocker, caplog): + mocker.patch("kedro.ipython._find_kedro_project", return_value=None) + mocker.patch("IPython.core.magic.register_line_magic") + mocker.patch("IPython.core.magic_arguments.magic_arguments") + mocker.patch("IPython.core.magic_arguments.argument") + mock_ipython = mocker.patch("IPython.get_ipython") + + load_ipython_extension(mocker.Mock()) + + assert not mock_ipython().called + assert not 
mock_ipython().push.called + + log_messages = [record.getMessage() for record in caplog.records] + expected_message = ( + "Kedro extension was registered but couldn't find a Kedro project. " + "Make sure you run '%reload_kedro '." + ) + assert expected_message in log_messages + + def test_load_extension_register_line_magic(self, mocker, ipython): + mocker.patch("kedro.ipython._find_kedro_project") + mock_reload_kedro = mocker.patch("kedro.ipython.reload_kedro") + load_ipython_extension(ipython) + mock_reload_kedro.assert_called_once() + + # Calling the line magic to check if the line magic is available + ipython.magic("reload_kedro") + assert mock_reload_kedro.call_count == 2 + + @pytest.mark.parametrize( + "args", + [ + "", + ".", + ". --env=base", + "--env=base", + "-e base", + ". --env=base --params=key:val", + ], + ) + def test_line_magic_with_valid_arguments(self, mocker, args, ipython): + mocker.patch("kedro.ipython._find_kedro_project") + mocker.patch("kedro.ipython.reload_kedro") + + ipython.magic(f"reload_kedro {args}") + + def test_line_magic_with_invalid_arguments(self, mocker, ipython): + mocker.patch("kedro.ipython._find_kedro_project") + mocker.patch("kedro.ipython.reload_kedro") + load_ipython_extension(ipython) + + with pytest.raises( + UsageError, match=r"unrecognized arguments: --invalid_arg=dummy" + ): + ipython.magic("reload_kedro --invalid_arg=dummy") + + +class TestProjectPathResolution: + def test_only_path_specified(self): + result = _resolve_project_path(path="/test") + expected = Path("/test").resolve() + assert result == expected + + def test_only_local_namespace_specified(self): + # pylint: disable=too-few-public-methods + class MockKedroContext: + # A dummy stand-in for KedroContext sufficient for this test + _project_path = Path("/test").resolve() + + result = _resolve_project_path(local_namespace={"context": MockKedroContext()}) + expected = Path("/test").resolve() + assert result == expected + + def test_no_path_no_local_namespace_specified(self, mocker): + mocker.patch( + "kedro.ipython._find_kedro_project", return_value=Path("/test").resolve() + ) + result = _resolve_project_path() + expected = Path("/test").resolve() + assert result == expected + + def test_project_path_unresolvable(self, mocker): + mocker.patch("kedro.ipython._find_kedro_project", return_value=None) + result = _resolve_project_path() + expected = None + assert result == expected + + def test_project_path_unresolvable_warning(self, mocker, caplog, ipython): + mocker.patch("kedro.ipython._find_kedro_project", return_value=None) + ipython.magic("reload_ext kedro.ipython") + log_messages = [record.getMessage() for record in caplog.records] + expected_message = ( + "Kedro extension was registered but couldn't find a Kedro project. " + "Make sure you run '%reload_kedro '." + ) + assert expected_message in log_messages + + def test_project_path_update(self, caplog): + # pylint: disable=too-few-public-methods + class MockKedroContext: + # A dummy stand-in for KedroContext sufficient for this test + _project_path = Path("/test").resolve() + + local_namespace = {"context": MockKedroContext()} + updated_path = Path("/updated_path").resolve() + _resolve_project_path(path=updated_path, local_namespace=local_namespace) + + log_messages = [record.getMessage() for record in caplog.records] + expected_message = f"Updating path to Kedro project: {updated_path}..." 
+ assert expected_message in log_messages diff --git a/tests/pipeline/test_decorators.py b/tests/pipeline/test_decorators.py deleted file mode 100644 index 6b6f89c30c..0000000000 --- a/tests/pipeline/test_decorators.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from functools import partial -from time import sleep - -from kedro.io import DataCatalog -from kedro.pipeline import Pipeline, node -from kedro.pipeline.decorators import log_time -from kedro.runner import SequentialRunner - - -def sleeping_identity(inp): - sleep(0.1) - return inp - - -def identity(arg): - return arg - - -def test_log_time(caplog): - caplog.clear() - func = log_time(sleeping_identity) - res = func(1) - - logger_name, severity, message = caplog.record_tuples[0] - assert res == 1 - assert logger_name == "kedro.pipeline.decorators" - assert severity == logging.INFO - expected = "Running '%s.%s' took" % ( - sleeping_identity.__module__, - sleeping_identity.__qualname__, - ) - assert expected in message - - -def test_log_time_no_module(caplog): - """When func module is not defined, function full name is not logged.""" - - def no_module(arg): - return sleeping_identity(arg) - - no_module.__module__ = None - - caplog.clear() - func = log_time(no_module) - res = func(1) - - logger_name, severity, message = caplog.record_tuples[0] - assert res == 1 - assert logger_name == "kedro.pipeline.decorators" - assert severity == logging.INFO - expected = f"Running {no_module.__qualname__!r} took" - assert expected in message - - -def test_log_time_with_partial(recwarn): - pipeline = Pipeline( - [node(partial(identity, 1), None, "output", name="identity1")] - ).decorate(log_time) - catalog = DataCatalog({}, dict(number=1)) - result = SequentialRunner().run(pipeline, catalog) - assert result["output"] == 1 - warning = recwarn.pop(UserWarning) - assert ( - "The node producing outputs `['output']` is made from a " - "`partial` function. 
Partial functions do not have a " - "`__name__` attribute" in str(warning.message) - ) diff --git a/tests/pipeline/test_modular_pipeline.py b/tests/pipeline/test_modular_pipeline.py index 584dfe9855..8e4f06330f 100644 --- a/tests/pipeline/test_modular_pipeline.py +++ b/tests/pipeline/test_modular_pipeline.py @@ -1,34 +1,8 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. import pytest -from kedro.pipeline import Pipeline, node, pipeline +from kedro.pipeline import node, pipeline from kedro.pipeline.modular_pipeline import ModularPipelineError +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline # Different dummy func based on the number of arguments @@ -54,7 +28,7 @@ def test_transform_dataset_names(self): """ Rename some datasets, test string, list and dict formats. """ - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(identity, "A", "B", name="node1"), node(biconcat, ["C", "D"], ["E", "F"], name="node2"), @@ -85,7 +59,7 @@ def test_prefix_dataset_names(self): """ Simple prefixing for dataset of all formats: str, list and dict """ - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(identity, "A", "B", name="node1"), node(biconcat, ["C", "D"], ["E", "F"], name="node2"), @@ -110,7 +84,7 @@ def test_prefixing_and_renaming(self): Prefixing and renaming at the same time. Explicitly renamed datasets should not be prefixed anymore. """ - raw_pipeline = Pipeline([node(biconcat, ["C", "D"], ["E", "F"])]) + raw_pipeline = modular_pipeline([node(biconcat, ["C", "D"], ["E", "F"])]) resulting_pipeline = pipeline( raw_pipeline, namespace="PREFIX", @@ -125,7 +99,7 @@ def test_prefixing_and_renaming(self): [("A", "D"), (["A"], ["D"]), ({"A"}, {"D"}), ({"A": "A"}, {"D": "D"})], ) def test_prefix_exclude_free_inputs(self, inputs, outputs): - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(identity, "A", "B", name="node1"), node(identity, "B", "C", name="node2"), @@ -149,7 +123,7 @@ def test_transform_params_prefix_and_parameters(self): """ Test that transform should prefix all parameters by default. 
""" - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(identity, "parameters", "params:B", name="node1"), node(biconcat, ["params:C", "D"], ["parameters", "F"], name="node2"), @@ -174,7 +148,9 @@ def test_transform_params_prefix_and_parameters(self): assert nodes[2].name == "PREFIX.node3" def test_dataset_transcoding_mapping_base_name(self): - raw_pipeline = Pipeline([node(biconcat, ["C@pandas", "D"], ["E@spark", "F"])]) + raw_pipeline = modular_pipeline( + [node(biconcat, ["C@pandas", "D"], ["E@spark", "F"])] + ) resulting_pipeline = pipeline( raw_pipeline, namespace="PREFIX", inputs={"C": "C_new"} ) @@ -183,7 +159,7 @@ def test_dataset_transcoding_mapping_base_name(self): assert resulting_pipeline.nodes[0]._outputs == ["PREFIX.E@spark", "PREFIX.F"] def test_dataset_transcoding_mapping_full_dataset(self): - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(biconcat, ["A@pandas", "B"], "C"), node(biconcat, ["A@spark", "C"], "CC"), @@ -200,7 +176,7 @@ def test_dataset_transcoding_mapping_full_dataset(self): assert resulting_pipeline.nodes[1]._outputs == "PREFIX.CC" def test_empty_input(self): - raw_pipeline = Pipeline([node(constant_output, None, ["A", "B"])]) + raw_pipeline = modular_pipeline([node(constant_output, None, ["A", "B"])]) resulting_pipeline = pipeline( raw_pipeline, namespace="PREFIX", outputs={"A": "A_new"} @@ -209,7 +185,7 @@ def test_empty_input(self): assert resulting_pipeline.nodes[0]._outputs == ["A_new", "PREFIX.B"] def test_empty_output(self): - raw_pipeline = Pipeline([node(biconcat, ["A", "B"], None)]) + raw_pipeline = modular_pipeline([node(biconcat, ["A", "B"], None)]) resulting_pipeline = pipeline( raw_pipeline, namespace="PREFIX", inputs={"A": "A_new"} @@ -243,8 +219,8 @@ def test_empty_output(self): ) def test_missing_dataset_name( self, func, inputs, outputs, inputs_map, outputs_map, expected_missing - ): # pylint: disable=too-many-arguments - raw_pipeline = Pipeline([node(func, inputs, outputs)]) + ): # noqa: too-many-arguments + raw_pipeline = modular_pipeline([node(func, inputs, outputs)]) with pytest.raises(ModularPipelineError, match=r"Failed to map datasets") as e: pipeline( @@ -257,17 +233,17 @@ def test_node_properties_preserved(self): Check that we don't loose any valuable properties on node cloning. Also an explicitly defined name should get prefixed. 
""" - raw_pipeline = Pipeline([node(identity, "A", "B", name="node1", tags=["tag1"])]) - raw_pipeline = raw_pipeline.decorate(lambda: None) + raw_pipeline = modular_pipeline( + [node(identity, "A", "B", name="node1", tags=["tag1"])] + ) resulting_pipeline = pipeline(raw_pipeline, namespace="PREFIX") assert resulting_pipeline.nodes[0].name == "PREFIX.node1" assert resulting_pipeline.nodes[0].tags == {"tag1"} - assert len(resulting_pipeline.nodes[0]._decorators) == 1 def test_default_node_name_is_namespaced(self): """Check that auto-generated node names are also namespaced""" - raw_pipeline = Pipeline([node(identity, "A", "B")]) + raw_pipeline = modular_pipeline([node(identity, "A", "B")]) first_layer_nested_pipe = pipeline(raw_pipeline, namespace="PREFIX") resulting_node = first_layer_nested_pipe.nodes[0] @@ -283,7 +259,7 @@ def test_default_node_name_is_namespaced(self): def test_expose_intermediate_output(self): """Check that we don't namespace an intermediary dataset, anywhere it is used - either input or output""" - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(identity, "A", "B", name="node1"), node(identity, "B", "C", name="node2"), @@ -308,7 +284,9 @@ def test_expose_intermediate_output(self): assert actual_nodes[3]._outputs == "ACTUAL.X" def test_parameters_left_intact_when_defined_as_str(self): - raw_pipeline = Pipeline([node(biconcat, ["A", "params:x"], "AA", name="node1")]) + raw_pipeline = modular_pipeline( + [node(biconcat, ["A", "params:x"], "AA", name="node1")] + ) resulting_pipeline = pipeline( raw_pipeline, outputs={"AA": "B"}, parameters="x", namespace="PREFIX" ) @@ -318,10 +296,10 @@ def test_parameters_left_intact_when_defined_as_str(self): assert actual_nodes[0]._outputs == "B" @pytest.mark.parametrize( - "parameters", ["params:x", set(["params:x"]), {"params:x": "params:x"}] + "parameters", ["params:x", {"params:x"}, {"params:x": "params:x"}] ) def test_parameters_left_intact_when_defined_as_(self, parameters): - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [node(triconcat, ["A", "params:x", "params:y"], "AA", name="node1")] ) resulting_pipeline = pipeline( @@ -337,7 +315,7 @@ def test_parameters_left_intact_when_defined_as_(self, parameters): assert actual_nodes[0]._outputs == "B" def test_parameters_updated_with_dict(self): - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(biconcat, ["A", "params:x"], "AA", name="node1"), node(biconcat, ["AA", "params:y"], "B", name="node2"), @@ -362,13 +340,13 @@ def test_parameters_updated_with_dict(self): assert actual_nodes[2]._outputs == "ACTUAL.BB" def test_parameters_defined_with_params_prefix(self): - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [node(triconcat, ["A", "params:x", "params:y"], "AA", name="node1")] ) resulting_pipeline = pipeline( raw_pipeline, outputs={"AA": "B"}, - parameters=set(["params:x"]), + parameters={"params:x"}, namespace="PREFIX", ) actual_nodes = resulting_pipeline.nodes @@ -378,14 +356,14 @@ def test_parameters_defined_with_params_prefix(self): assert actual_nodes[0]._outputs == "B" def test_parameters_specified_under_inputs(self): - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(biconcat, ["A", "params:alpha"], "AA", name="node1"), node(biconcat, ["AA", "parameters"], "BB", name="node2"), ] ) - pattern = r"Parameters should be specified in the `parameters` argument" + pattern = r"Parameters should be specified in the 'parameters' argument" with pytest.raises(ModularPipelineError, match=pattern): 
pipeline(raw_pipeline, inputs={"params:alpha": "params:beta"}) @@ -393,7 +371,7 @@ def test_parameters_specified_under_inputs(self): pipeline(raw_pipeline, inputs={"parameters": "some_yaml_dataset"}) def test_non_existent_parameters_mapped(self): - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(biconcat, ["A", "params:alpha"], "AA", name="node1"), node(biconcat, ["AA", "CC"], "BB", name="node2"), @@ -409,7 +387,7 @@ def test_non_existent_parameters_mapped(self): pipeline(raw_pipeline, parameters={"parameters": "some_yaml_dataset"}) def test_bad_inputs_mapping(self): - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(biconcat, ["A", "params:alpha"], "AA", name="node1"), node(biconcat, ["AA", "parameters"], "BB", name="node2"), @@ -421,7 +399,7 @@ def test_bad_inputs_mapping(self): pipeline(raw_pipeline, inputs={"AA": "CC"}) def test_bad_outputs_mapping(self): - raw_pipeline = Pipeline( + raw_pipeline = modular_pipeline( [ node(biconcat, ["A", "params:alpha"], "AA", name="node1"), node(biconcat, ["AA", "parameters"], "BB", name="node2"), @@ -431,3 +409,17 @@ def test_bad_outputs_mapping(self): pattern = "Outputs can't contain free inputs to the pipeline" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, outputs={"A": "C"}) + + def test_pipeline_always_copies(self): + original_pipeline = pipeline([node(constant_output, None, "A")]) + new_pipeline = pipeline(original_pipeline) + assert new_pipeline.nodes == original_pipeline.nodes + assert new_pipeline is not original_pipeline + + def test_pipeline_tags(self): + tagged_pipeline = pipeline( + [node(constant_output, None, "A"), node(constant_output, None, "B")], + tags="tag", + ) + + assert all(n.tags == {"tag"} for n in tagged_pipeline.nodes) diff --git a/tests/pipeline/test_node.py b/tests/pipeline/test_node.py index 00a333ab74..87920d0dbd 100644 --- a/tests/pipeline/test_node.py +++ b/tests/pipeline/test_node.py @@ -1,30 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
import re from functools import partial, update_wrapper, wraps from typing import Callable @@ -58,11 +31,11 @@ def simple_tuple_node_list(): (biconcat, ["A", "B"], "C"), (identity, "C", ["D", "E"]), (biconcat, ["H", "I"], ["J", "K"]), - (identity, "J", dict(result="K")), - (biconcat, ["J", "K"], dict(result="L")), - (identity, dict(input1="J"), "L"), - (identity, dict(input1="J"), ["L", "M"]), - (identity, dict(input1="J"), dict(result="K")), + (identity, "J", {"result": "K"}), + (biconcat, ["J", "K"], {"result": "L"}), + (identity, {"input1": "J"}, "L"), + (identity, {"input1": "J"}, ["L", "M"]), + (identity, {"input1": "J"}, {"result": "K"}), (constant_output, None, "M"), (biconcat, ["N", "O"], None), (lambda x: None, "F", "G"), @@ -94,7 +67,7 @@ def test_call(self): biconcat, inputs=["input1", "input2"], outputs="output", name="myname" ) actual = dummy_node(input1="in1", input2="in2") - expected = dummy_node.run(dict(input1="in1", input2="in2")) + expected = dummy_node.run({"input1": "in1", "input2": "in2"}) assert actual == expected def test_call_with_non_keyword_arguments(self): @@ -107,14 +80,14 @@ def test_call_with_non_keyword_arguments(self): def test_run_with_duplicate_inputs_list(self): dummy_node = node(func=biconcat, inputs=["input1", "input1"], outputs="output") - actual = dummy_node.run(dict(input1="in1")) + actual = dummy_node.run({"input1": "in1"}) assert actual == {"output": "in1in1"} def test_run_with_duplicate_inputs_dict(self): dummy_node = node( func=biconcat, inputs={"input1": "in1", "input2": "in1"}, outputs="output" ) - actual = dummy_node.run(dict(in1="hello")) + actual = dummy_node.run({"in1": "hello"}) assert actual == {"output": "hellohello"} def test_no_input(self): @@ -268,11 +241,11 @@ def no_input_or_output_node(): def input_same_as_output_node(): - return biconcat, ["A", "B"], dict(a="A") + return biconcat, ["A", "B"], {"a": "A"} def duplicate_output_dict_node(): - return identity, "A", dict(a="A", b="A") + return identity, "A", {"a": "A", "b": "A"} def duplicate_output_list_node(): @@ -282,10 +255,10 @@ def duplicate_output_list_node(): @pytest.mark.parametrize( "func, expected", [ - (bad_input_type_node, r"`inputs` type must be one of "), - (bad_output_type_node, r"`outputs` type must be one of "), + (bad_input_type_node, r"'inputs' type must be one of "), + (bad_output_type_node, r"'outputs' type must be one of "), (bad_function_type_node, r"first argument must be a function"), - (no_input_or_output_node, r"it must have some `inputs` or `outputs`"), + (no_input_or_output_node, r"it must have some 'inputs' or 'outputs'"), ( input_same_as_output_node, r"A node cannot have the same inputs and outputs: {\'A\'}", @@ -293,13 +266,13 @@ def duplicate_output_list_node(): ( duplicate_output_dict_node, r"Failed to create node identity" - r"\(\[A\]\) -> \[A,A\] due to " + r"\(\[A\]\) -> \[A;A\] due to " r"duplicate output\(s\) {\'A\'}.", ), ( duplicate_output_list_node, r"Failed to create node identity" - r"\(\[A\]\) -> \[A,A\] due to " + r"\(\[A\]\) -> \[A;A\] due to " r"duplicate output\(s\) {\'A\'}.", ), ], @@ -327,7 +300,7 @@ def dummy_func_args(**kwargs): return dummy_func_args, "A", "B" -lambda_identity = lambda input1: input1 # noqa: disable=E731 +lambda_identity = lambda input1: input1 # noqa: disable=E731 # pylint: disable=C3001 def lambda_inconsistent_input_size(): @@ -374,57 +347,17 @@ def test_bad_input(func, expected): def apply_f(func: Callable) -> Callable: @wraps(func) def with_f(*args, **kwargs): - return func(*[f"f({a})" for a in args], **kwargs) 
+ return func(*(f"f({a})" for a in args), **kwargs) # pragma: no cover return with_f -def apply_g(func: Callable) -> Callable: - @wraps(func) - def with_g(*args, **kwargs): - return func(*[f"g({a})" for a in args], **kwargs) - - return with_g - - -def apply_h(func: Callable) -> Callable: - @wraps(func) - def with_h(*args, **kwargs): - return func(*[f"h({a})" for a in args], **kwargs) - - return with_h - - -def apply_ij(func: Callable) -> Callable: - @wraps(func) - def with_ij(*args, **kwargs): - return func(*[f"ij({a})" for a in args], **kwargs) - - return with_ij - - @apply_f def decorated_identity(value): - return value - - -class TestTagDecorator: - def test_apply_decorators(self): - old_node = node(apply_g(decorated_identity), "input", "output", name="node") - pattern = ( - "The node's `decorate` API will be deprecated in Kedro 0.18.0." - "Please use a node's Hooks to extend the node's behaviour in a pipeline." - "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/04_hooks.html" - ) - with pytest.warns(DeprecationWarning, match=re.escape(pattern)): - new_node = old_node.decorate(apply_h, apply_ij) - result = new_node.run(dict(input=1)) + return value # pragma: no cover - assert old_node.name == new_node.name - assert "output" in result - assert result["output"] == "f(g(ij(h(1))))" +class TestTag: def test_tag_nodes(self): tagged_node = node(identity, "input", "output", tags=["hello"]).tag(["world"]) assert "hello" in tagged_node.tags @@ -435,15 +368,6 @@ def test_tag_nodes_single_tag(self): tagged_node = node(identity, "input", "output", tags="hello").tag("world") assert "hello" in tagged_node.tags assert "world" in tagged_node.tags - assert len(tagged_node.tags) == 2 - - def test_tag_and_decorate(self): - tagged_node = node(identity, "input", "output", tags=["hello"]) - tagged_node = tagged_node.decorate(apply_f) - tagged_node = tagged_node.tag(["world"]) - assert "hello" in tagged_node.tags - assert "world" in tagged_node.tags - assert tagged_node.run(dict(input=1))["output"] == "f(1)" class TestNames: @@ -502,7 +426,7 @@ def test_updated_partial(self): def test_updated_partial_dict_inputs(self): n = node( update_wrapper(partial(biconcat, input1=["in1"]), biconcat), - dict(input2="in2"), + {"input2": "in2"}, ["out"], ) assert str(n) == "biconcat([in2]) -> [out]" diff --git a/tests/pipeline/test_node_run.py b/tests/pipeline/test_node_run.py index ba473b95cd..40289890ef 100644 --- a/tests/pipeline/test_node_run.py +++ b/tests/pipeline/test_node_run.py @@ -1,36 +1,8 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. 
The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - # pylint: disable=unused-argument import pytest -from kedro.io import LambdaDataSet +from kedro.io import LambdaDataset from kedro.pipeline import node @@ -38,7 +10,7 @@ def mocked_dataset(mocker): load = mocker.Mock(return_value=42) save = mocker.Mock() - return LambdaDataSet(load, save) + return LambdaDataset(load, save) def one_in_one_out(arg): @@ -46,7 +18,7 @@ def one_in_one_out(arg): def one_in_dict_out(arg): - return dict(ret=arg) + return {"ret": arg} def two_in_first_out(arg1, arg2): @@ -56,9 +28,9 @@ def two_in_first_out(arg1, arg2): @pytest.fixture def valid_nodes_with_inputs(): return [ - (node(one_in_one_out, "ds1", "dsOut"), dict(ds1=42)), - (node(one_in_dict_out, dict(arg="ds1"), dict(ret="dsOut")), dict(ds1=42)), - (node(two_in_first_out, ["ds1", "ds2"], "dsOut"), dict(ds1=42, ds2=58)), + (node(one_in_one_out, "ds1", "dsOut"), {"ds1": 42}), + (node(one_in_dict_out, {"arg": "ds1"}, {"ret": "dsOut"}), {"ds1": 42}), + (node(two_in_first_out, ["ds1", "ds2"], "dsOut"), {"ds1": 42, "ds2": 58}), ] @@ -72,9 +44,9 @@ def test_valid_nodes(valid_nodes_with_inputs): def test_run_got_dataframe(mocked_dataset): """Check an exception when non-dictionary (class object) is passed.""" pattern = r"Node.run\(\) expects a dictionary or None, " - pattern += r"but got instead" + pattern += r"but got instead" with pytest.raises(ValueError, match=pattern): - node(one_in_one_out, dict(arg="ds1"), "A").run(mocked_dataset) + node(one_in_one_out, {"arg": "ds1"}, "A").run(mocked_dataset) class TestNodeRunInvalidInput: @@ -86,15 +58,15 @@ def test_unresolved(self): def test_no_inputs_node_error(self, mocked_dataset): """Pass one input when none is expected.""" with pytest.raises(ValueError, match=r"expected no inputs"): - node(lambda: 1, None, "A").run(dict(unexpected=mocked_dataset)) + node(lambda: 1, None, "A").run({"unexpected": mocked_dataset}) def test_one_input_error(self, mocked_dataset): """Pass a different input.""" pattern = r"expected one input named 'ds1', but got the " pattern += r"following 1 input\(s\) instead: \['arg'\]" with pytest.raises(ValueError, match=pattern): - node(one_in_dict_out, "ds1", dict(ret="B", ans="C")).run( - dict(arg=mocked_dataset) + node(one_in_dict_out, "ds1", {"ret": "B", "ans": "C"}).run( + {"arg": mocked_dataset} ) def test_run_diff_size_lists(self, mocked_dataset): @@ -102,68 +74,68 @@ def test_run_diff_size_lists(self, mocked_dataset): pattern = r"expected 2 input\(s\) \['ds1', 'ds2'\], but " pattern += r"got the following 1 input\(s\) instead." with pytest.raises(ValueError, match=pattern): - node(two_in_first_out, ["ds1", "ds2"], "A").run(dict(ds1=mocked_dataset)) + node(two_in_first_out, ["ds1", "ds2"], "A").run({"ds1": mocked_dataset}) def test_run_diff_size_list_dict(self, mocked_dataset): """Pass two dict inputs when one (list) are expected.""" pattern = r"expected 1 input\(s\) \['ds1'\], but got the " pattern += r"following 2 input\(s\) instead: \['ds1', 'ds2'\]\." 
with pytest.raises(ValueError, match=pattern): - node(one_in_one_out, ["ds1"], "A").run(dict(ds1=mocked_dataset, ds2=2)) + node(one_in_one_out, ["ds1"], "A").run({"ds1": mocked_dataset, "ds2": 2}) def test_run_list_dict_unavailable(self, mocked_dataset): """Pass one dict which is different from expected.""" pattern = r"expected 1 input\(s\) \['ds1'\], but got the " pattern += r"following 1 input\(s\) instead: \['ds2'\]\." with pytest.raises(ValueError, match=pattern): - node(one_in_one_out, ["ds1"], "A").run(dict(ds2=mocked_dataset)) + node(one_in_one_out, ["ds1"], "A").run({"ds2": mocked_dataset}) def test_run_dict_unavailable(self, mocked_dataset): """Pass one dict which is different from expected.""" pattern = r"expected 1 input\(s\) \['ds1'\], but got the " pattern += r"following 1 input\(s\) instead: \['ds2'\]\." with pytest.raises(ValueError, match=pattern): - node(one_in_one_out, dict(arg="ds1"), "A").run(dict(ds2=mocked_dataset)) + node(one_in_one_out, {"arg": "ds1"}, "A").run({"ds2": mocked_dataset}) def test_run_dict_diff_size(self, mocked_dataset): """Pass two dict inputs when one is expected.""" pattern = r"expected 1 input\(s\) \['ds1'\], but got the " pattern += r"following 2 input\(s\) instead: \['ds1', 'ds2'\]\." with pytest.raises(ValueError, match=pattern): - node(one_in_one_out, dict(arg="ds1"), "A").run( - dict(ds1=mocked_dataset, ds2=2) + node(one_in_one_out, {"arg": "ds1"}, "A").run( + {"ds1": mocked_dataset, "ds2": 2} ) class TestNodeRunInvalidOutput: def test_miss_matching_output_types(self, mocked_dataset): - pattern = r"The node output is a dictionary, whereas the function " - pattern += r"output is not\." + pattern = "The node output is a dictionary, whereas the function " + pattern += "output is ." with pytest.raises(ValueError, match=pattern): - node(one_in_one_out, "ds1", dict(a="ds")).run(dict(ds1=mocked_dataset)) + node(one_in_one_out, "ds1", {"a": "ds"}).run({"ds1": mocked_dataset}) def test_miss_matching_output_keys(self, mocked_dataset): pattern = r"The node's output keys {'ret'} do not match " pattern += r"with the returned output's keys" with pytest.raises(ValueError, match=pattern): - node(one_in_dict_out, "ds1", dict(ret="B", ans="C")).run( - dict(ds1=mocked_dataset) + node(one_in_dict_out, "ds1", {"ret": "B", "ans": "C"}).run( + {"ds1": mocked_dataset} ) def test_node_not_list_output(self, mocked_dataset): pattern = r"The node definition contains a list of outputs " pattern += r"\['B', 'C'\], whereas the node function returned " - pattern += r"a `LambdaDataSet`" + pattern += r"a 'LambdaDataset'" with pytest.raises(ValueError, match=pattern): - node(one_in_one_out, "ds1", ["B", "C"]).run(dict(ds1=mocked_dataset)) + node(one_in_one_out, "ds1", ["B", "C"]).run({"ds1": mocked_dataset}) def test_node_wrong_num_of_outputs(self, mocker, mocked_dataset): def one_in_two_out(arg): load = mocker.Mock(return_value=42) save = mocker.Mock() - return [LambdaDataSet(load, save), LambdaDataSet(load, save)] + return [LambdaDataset(load, save), LambdaDataset(load, save)] pattern = r"The node function returned 2 output\(s\), whereas " pattern += r"the node definition contains 3 output\(s\)\." 
with pytest.raises(ValueError, match=pattern): - node(one_in_two_out, "ds1", ["A", "B", "C"]).run(dict(ds1=mocked_dataset)) + node(one_in_two_out, "ds1", ["A", "B", "C"]).run({"ds1": mocked_dataset}) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 60fc6a9998..231599bcee 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1,40 +1,11 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
import re -from functools import wraps from itertools import chain -from typing import Callable import pytest import kedro -from kedro.io import DataCatalog -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.pipeline.pipeline import ( CircularDependencyError, ConfirmNotUniqueError, @@ -42,7 +13,6 @@ _strip_transcoding, _transcode_split, ) -from kedro.runner import SequentialRunner class TestTranscodeHelpers: @@ -141,26 +111,26 @@ def pipeline_with_dicts(): "nodes": [ node(triconcat, ["H", "I", "M"], "N", name="node1"), node(identity, "H", "I", name="node2"), - node(identity, "F", dict(M="M", N="G"), name="node3"), - node(identity, "E", dict(O="F", P="H"), name="node4"), # NOQA - node(identity, dict(input1="D"), None, name="node5"), + node(identity, "F", {"M": "M", "N": "G"}, name="node3"), + node(identity, "E", {"O": "F", "P": "H"}, name="node4"), # NOQA + node(identity, {"input1": "D"}, None, name="node5"), node(identity, "C", "D", name="node6", tags=["foo"]), - node(identity, "B", dict(P="C", Q="E"), name="node7", tags=["foo"]), - node(identity, "A", dict(R="B", S="L"), name="node8"), + node(identity, "B", {"P": "C", "Q": "E"}, name="node7", tags=["foo"]), + node(identity, "A", {"R": "B", "S": "L"}, name="node8"), node(constant_output, None, "A", name="node9"), ], "expected": [ {node(constant_output, None, "A", name="node9")}, - {node(identity, "A", dict(R="B", S="L"), name="node8")}, - {node(identity, "B", dict(P="C", Q="E"), name="node7", tags=["foo"])}, + {node(identity, "A", {"R": "B", "S": "L"}, name="node8")}, + {node(identity, "B", {"P": "C", "Q": "E"}, name="node7", tags=["foo"])}, { node(identity, "C", "D", name="node6", tags=["foo"]), - node(identity, "E", dict(O="F", P="H"), name="node4"), # NOQA + node(identity, "E", {"O": "F", "P": "H"}, name="node4"), # NOQA }, { - node(identity, dict(input1="D"), None, name="node5"), + node(identity, {"input1": "D"}, None, name="node5"), node(identity, "H", "I", name="node2"), - node(identity, "F", dict(M="M", N="G"), name="node3"), + node(identity, "F", {"M": "M", "N": "G"}, name="node3"), }, {node(triconcat, ["H", "I", "M"], "N", name="node1")}, ], @@ -249,7 +219,7 @@ def str_node_inputs_list(): @pytest.fixture def complex_pipeline(pipeline_list_with_lists): nodes = pipeline_list_with_lists["nodes"] - pipeline = Pipeline(nodes) + pipeline = modular_pipeline(nodes) return pipeline @@ -271,7 +241,7 @@ def input_data(request): class TestValidPipeline: def test_nodes(self, str_node_inputs_list): nodes = str_node_inputs_list["nodes"] - pipeline = Pipeline(nodes) + pipeline = modular_pipeline(nodes) assert set(pipeline.nodes) == set(nodes) @@ -279,19 +249,20 @@ def test_grouped_nodes(self, input_data): """Check if grouped_nodes func groups the nodes correctly""" nodes_input = input_data["nodes"] expected = input_data["expected"] - pipeline = Pipeline(nodes_input) + pipeline = modular_pipeline(nodes_input) grouped = pipeline.grouped_nodes # Flatten a list of grouped nodes assert pipeline.nodes == list(chain.from_iterable(grouped)) - # Check each grouped node matches with expected group - assert all(g == e for g, e in zip(grouped, expected)) + # Check each grouped node matches with the expected group, the order is + # non-deterministic, so we are only checking they have the same set of nodes. 
+ assert all(set(g) == e for g, e in zip(grouped, expected)) def test_free_input(self, input_data): nodes = input_data["nodes"] inputs = input_data["free_inputs"] - pipeline = Pipeline(nodes) + pipeline = modular_pipeline(nodes) assert pipeline.inputs() == set(inputs) @@ -299,16 +270,16 @@ def test_outputs(self, input_data): nodes = input_data["nodes"] outputs = input_data["outputs"] - pipeline = Pipeline(nodes) + pipeline = modular_pipeline(nodes) assert pipeline.outputs() == set(outputs) def test_empty_case(self): """Empty pipeline is possible""" - Pipeline([]) + modular_pipeline([]) def test_initialized_with_tags(self): - pipeline = Pipeline( + pipeline = modular_pipeline( [node(identity, "A", "B", tags=["node1", "p1"]), node(identity, "B", "C")], tags=["p1", "p2"], ) @@ -352,7 +323,7 @@ def non_unique_node_outputs(): node(identity, "A", ["B", "C"], name="node1"), node(identity, "C", ["D", "E", "F"], name="node2"), # D, E non-unique - node(identity, "B", dict(out1="D", out2="E"), name="node3"), + node(identity, "B", {"out1": "D", "out2": "E"}, name="node3"), node(identity, "D", ["E"], name="node4"), # E non-unique ] @@ -361,15 +332,15 @@ class TestInvalidPipeline: def test_circle_case(self, pipeline_with_circle): pattern = "Circular dependencies" with pytest.raises(CircularDependencyError, match=pattern): - Pipeline(pipeline_with_circle) + modular_pipeline(pipeline_with_circle) def test_unique_outputs(self, non_unique_node_outputs): with pytest.raises(OutputNotUniqueError, match=r"\['D', 'E'\]"): - Pipeline(non_unique_node_outputs) + modular_pipeline(non_unique_node_outputs) def test_none_case(self): with pytest.raises(ValueError, match="is None"): - Pipeline(None) + modular_pipeline(None) def test_duplicate_free_nodes(self): pattern = ( @@ -377,21 +348,21 @@ def test_duplicate_free_nodes(self): "names appear more than once:\n\nFree nodes:\n - same_name" ) with pytest.raises(ValueError, match=re.escape(pattern)): - Pipeline( + modular_pipeline( [ node(identity, "in1", "out1", name="same_name"), node(identity, "in2", "out2", name="same_name"), ] ) - pipeline = Pipeline([node(identity, "in1", "out1", name="same_name")]) + pipeline = modular_pipeline([node(identity, "in1", "out1", name="same_name")]) another_node = node(identity, "in2", "out2", name="same_name") with pytest.raises(ValueError, match=re.escape(pattern)): # 'pipeline' passes the check, 'another_node' doesn't - Pipeline([pipeline, another_node]) + modular_pipeline([pipeline, another_node]) def test_duplicate_nodes_in_pipelines(self): - pipeline = Pipeline( + pipeline = modular_pipeline( [node(biconcat, ["input", "input1"], ["output", "output1"], name="node")] ) pattern = ( @@ -400,26 +371,33 @@ def test_duplicate_nodes_in_pipelines(self): ) with pytest.raises(ValueError, match=pattern): # the first 'pipeline' passes the check, the second doesn't - Pipeline([pipeline, pipeline]) + modular_pipeline([pipeline, pipeline]) another_node = node(identity, "in1", "out1", name="node") with pytest.raises(ValueError, match=pattern): # 'another_node' passes the check, 'pipeline' doesn't - Pipeline([another_node, pipeline]) + modular_pipeline([another_node, pipeline]) - def test_bad_combine(self): + def test_bad_combine_node(self): """Node cannot be combined to pipeline.""" fred = node(identity, "input", "output") - pipeline = Pipeline([fred]) + pipeline = modular_pipeline([fred]) with pytest.raises(TypeError): pipeline + fred # pylint: disable=pointless-statement + def test_bad_combine_int(self): + """int cannot be combined to pipeline, 
tests __radd__""" + fred = node(identity, "input", "output") + pipeline = modular_pipeline([fred]) + with pytest.raises(TypeError): + _ = 1 + pipeline + def test_conflicting_names(self): """Node names must be unique.""" - pipeline1 = Pipeline( + pipeline1 = modular_pipeline( [node(biconcat, ["input", "input1"], ["output1"], name="a")] ) - new_pipeline = Pipeline( + new_pipeline = modular_pipeline( [node(biconcat, ["input", "input1"], ["output2"], name="a")] ) pattern = ( @@ -431,10 +409,10 @@ def test_conflicting_names(self): def test_conflicting_outputs(self): """Node outputs must be unique.""" - pipeline1 = Pipeline( + pipeline1 = modular_pipeline( [node(biconcat, ["input", "input1"], ["output", "output1"], name="a")] ) - new_pipeline = Pipeline( + new_pipeline = modular_pipeline( [node(biconcat, ["input", "input2"], ["output", "output2"], name="b")] ) with pytest.raises(OutputNotUniqueError, match=r"\['output'\]"): @@ -442,8 +420,10 @@ def test_conflicting_outputs(self): def test_duplicate_node_confirms(self): """Test that non-unique dataset confirms break pipeline concatenation""" - pipeline1 = Pipeline([node(identity, "input1", "output1", confirms="other")]) - pipeline2 = Pipeline( + pipeline1 = modular_pipeline( + [node(identity, "input1", "output1", confirms="other")] + ) + pipeline2 = modular_pipeline( [node(identity, "input2", "output2", confirms=["other", "output2"])] ) with pytest.raises(ConfirmNotUniqueError, match=r"\['other'\]"): @@ -451,24 +431,42 @@ def test_duplicate_node_confirms(self): class TestPipelineOperators: - def test_combine(self): - pipeline1 = Pipeline([node(biconcat, ["input", "input1"], "output1", name="a")]) - pipeline2 = Pipeline([node(biconcat, ["input", "input2"], "output2", name="b")]) + def test_combine_add(self): + pipeline1 = modular_pipeline( + [node(biconcat, ["input", "input1"], "output1", name="a")] + ) + pipeline2 = modular_pipeline( + [node(biconcat, ["input", "input2"], "output2", name="b")] + ) new_pipeline = pipeline1 + pipeline2 assert new_pipeline.inputs() == {"input", "input1", "input2"} assert new_pipeline.outputs() == {"output1", "output2"} assert {n.name for n in new_pipeline.nodes} == {"a", "b"} + def test_combine_sum(self): + pipeline1 = modular_pipeline( + [node(biconcat, ["input", "input1"], "output1", name="a")] + ) + pipeline2 = modular_pipeline( + [node(biconcat, ["input", "input2"], "output2", name="b")] + ) + new_pipeline = sum([pipeline1, pipeline2]) + assert new_pipeline.inputs() == {"input", "input1", "input2"} + assert new_pipeline.outputs() == {"output1", "output2"} + assert {n.name for n in new_pipeline.nodes} == {"a", "b"} + def test_remove(self): """Create a pipeline of 3 nodes and remove one of them""" - pipeline1 = Pipeline( + pipeline1 = modular_pipeline( [ node(biconcat, ["input", "input1"], "output1", name="a"), node(biconcat, ["input", "input2"], "output2", name="b"), node(biconcat, ["input", "input3"], "output3", name="c"), ] ) - pipeline2 = Pipeline([node(biconcat, ["input", "input2"], "output2", name="b")]) + pipeline2 = modular_pipeline( + [node(biconcat, ["input", "input2"], "output2", name="b")] + ) new_pipeline = pipeline1 - pipeline2 assert new_pipeline.inputs() == {"input", "input1", "input3"} assert new_pipeline.outputs() == {"output1", "output3"} @@ -478,14 +476,14 @@ def test_remove_with_partial_intersection(self): """Create a pipeline of 3 nodes and remove one of them using a pipeline that contains a partial match. 
""" - pipeline1 = Pipeline( + pipeline1 = modular_pipeline( [ node(biconcat, ["input", "input1"], "output1", name="a"), node(biconcat, ["input", "input2"], "output2", name="b"), node(biconcat, ["input", "input3"], "output3", name="c"), ] ) - pipeline2 = Pipeline( + pipeline2 = modular_pipeline( [ node(biconcat, ["input", "input2"], "output2", name="b"), node(biconcat, ["input", "input4"], "output4", name="d"), @@ -498,8 +496,10 @@ def test_remove_with_partial_intersection(self): def test_remove_empty_from_pipeline(self): """Remove an empty pipeline""" - pipeline1 = Pipeline([node(biconcat, ["input", "input1"], "output1", name="a")]) - pipeline2 = Pipeline([]) + pipeline1 = modular_pipeline( + [node(biconcat, ["input", "input1"], "output1", name="a")] + ) + pipeline2 = modular_pipeline([]) new_pipeline = pipeline1 - pipeline2 assert new_pipeline.inputs() == pipeline1.inputs() assert new_pipeline.outputs() == pipeline1.outputs() @@ -507,8 +507,10 @@ def test_remove_empty_from_pipeline(self): def test_remove_from_empty_pipeline(self): """Remove node from an empty pipeline""" - pipeline1 = Pipeline([node(biconcat, ["input", "input1"], "output1", name="a")]) - pipeline2 = Pipeline([]) + pipeline1 = modular_pipeline( + [node(biconcat, ["input", "input1"], "output1", name="a")] + ) + pipeline2 = modular_pipeline([]) new_pipeline = pipeline2 - pipeline1 assert new_pipeline.inputs() == pipeline2.inputs() assert new_pipeline.outputs() == pipeline2.outputs() @@ -516,25 +518,29 @@ def test_remove_from_empty_pipeline(self): def test_remove_all_nodes(self): """Remove an entire pipeline""" - pipeline1 = Pipeline([node(biconcat, ["input", "input1"], "output1", name="a")]) - pipeline2 = Pipeline([node(biconcat, ["input", "input1"], "output1", name="a")]) + pipeline1 = modular_pipeline( + [node(biconcat, ["input", "input1"], "output1", name="a")] + ) + pipeline2 = modular_pipeline( + [node(biconcat, ["input", "input1"], "output1", name="a")] + ) new_pipeline = pipeline1 - pipeline2 assert new_pipeline.inputs() == set() assert new_pipeline.outputs() == set() assert not new_pipeline.nodes def test_invalid_remove(self): - p = Pipeline([]) + p = modular_pipeline([]) pattern = r"unsupported operand type\(s\) for -: 'Pipeline' and 'str'" with pytest.raises(TypeError, match=pattern): p - "hello" # pylint: disable=pointless-statement def test_combine_same_node(self): """Multiple (identical) pipelines are possible""" - pipeline1 = Pipeline( + pipeline1 = modular_pipeline( [node(biconcat, ["input", "input1"], ["output"], name="a")] ) - pipeline2 = Pipeline( + pipeline2 = modular_pipeline( [node(biconcat, ["input", "input1"], ["output"], name="a")] ) new_pipeline = pipeline1 + pipeline2 @@ -543,60 +549,68 @@ def test_combine_same_node(self): assert {n.name for n in new_pipeline.nodes} == {"a"} def test_intersection(self): - pipeline1 = Pipeline( + pipeline1 = modular_pipeline( [ node(biconcat, ["input", "input1"], "output1", name="a"), node(biconcat, ["input", "input2"], "output2", name="b"), ] ) - pipeline2 = Pipeline([node(biconcat, ["input", "input2"], "output2", name="b")]) + pipeline2 = modular_pipeline( + [node(biconcat, ["input", "input2"], "output2", name="b")] + ) new_pipeline = pipeline1 & pipeline2 assert new_pipeline.inputs() == {"input", "input2"} assert new_pipeline.outputs() == {"output2"} assert {n.name for n in new_pipeline.nodes} == {"b"} def test_invalid_intersection(self): - p = Pipeline([]) + p = modular_pipeline([]) pattern = r"unsupported operand type\(s\) for &: 'Pipeline' and 'str'" with 
pytest.raises(TypeError, match=pattern): p & "hello" # pylint: disable=pointless-statement def test_union(self): - pipeline1 = Pipeline( + pipeline1 = modular_pipeline( [ node(biconcat, ["input", "input1"], "output1", name="a"), node(biconcat, ["input", "input2"], "output2", name="b"), ] ) - pipeline2 = Pipeline([node(biconcat, ["input", "input2"], "output2", name="b")]) + pipeline2 = modular_pipeline( + [node(biconcat, ["input", "input2"], "output2", name="b")] + ) new_pipeline = pipeline1 | pipeline2 assert new_pipeline.inputs() == {"input", "input1", "input2"} assert new_pipeline.outputs() == {"output1", "output2"} assert {n.name for n in new_pipeline.nodes} == {"a", "b"} def test_invalid_union(self): - p = Pipeline([]) + p = modular_pipeline([]) pattern = r"unsupported operand type\(s\) for |: 'Pipeline' and 'str'" with pytest.raises(TypeError, match=pattern): p | "hello" # pylint: disable=pointless-statement def test_node_unique_confirms(self): """Test that unique dataset confirms don't break pipeline concatenation""" - pipeline1 = Pipeline([node(identity, "input1", "output1", confirms="output1")]) - pipeline2 = Pipeline([node(identity, "input2", "output2", confirms="other")]) - pipeline3 = Pipeline([node(identity, "input3", "output3")]) + pipeline1 = modular_pipeline( + [node(identity, "input1", "output1", confirms="output1")] + ) + pipeline2 = modular_pipeline( + [node(identity, "input2", "output2", confirms="other")] + ) + pipeline3 = modular_pipeline([node(identity, "input3", "output3")]) combined = pipeline1 + pipeline2 + pipeline3 assert len(combined.nodes) == 3 def test_connected_pipeline(self, disjoint_pipeline): """Connect two separate pipelines.""" nodes = disjoint_pipeline["nodes"] - subpipeline = Pipeline(nodes, tags=["subpipeline"]) + subpipeline = modular_pipeline(nodes, tags=["subpipeline"]) assert len(subpipeline.inputs()) == 2 assert len(subpipeline.outputs()) == 2 - pipeline = Pipeline( + pipeline = modular_pipeline( [node(identity, "C", "D", name="connecting_node"), subpipeline], tags="main" ) @@ -607,7 +621,7 @@ def test_connected_pipeline(self, disjoint_pipeline): class TestPipelineDescribe: def test_names_only(self, str_node_inputs_list): - pipeline = Pipeline(str_node_inputs_list["nodes"]) + pipeline = modular_pipeline(str_node_inputs_list["nodes"]) description = pipeline.describe() desc = description.split("\n") @@ -627,7 +641,7 @@ def test_names_only(self, str_node_inputs_list): assert res == example def test_full(self, str_node_inputs_list): - pipeline = Pipeline(str_node_inputs_list["nodes"]) + pipeline = modular_pipeline(str_node_inputs_list["nodes"]) description = pipeline.describe(names_only=False) desc = description.split("\n") @@ -635,7 +649,7 @@ def test_full(self, str_node_inputs_list): "#### Pipeline execution order ####", "Inputs: input1, input2", "", - "node1: biconcat([input1,input2]) -> [input3]", + "node1: biconcat([input1;input2]) -> [input3]", "node2: identity([input3]) -> [input4]", "", "Outputs: input4", @@ -647,57 +661,6 @@ def test_full(self, str_node_inputs_list): assert res == example -def apply_f(func: Callable) -> Callable: - @wraps(func) - def with_f(*args, **kwargs): - return func(*[f"f({a})" for a in args], **kwargs) - - return with_f - - -def apply_g(func: Callable) -> Callable: - @wraps(func) - def with_g(*args, **kwargs): - return func(*[f"g({a})" for a in args], **kwargs) - - return with_g - - -class TestPipelineDecorator: - def test_apply(self): - nodes = sorted( - [ - node(identity, "number", "output1", name="identity1"), - 
node(identity, "output1", "output2", name="biconcat"), - node(identity, "output2", "output", name="identity3"), - ], - key=lambda x: x.name, - ) - pattern = ( - "The pipeline's `decorate` API will be deprecated in Kedro 0.18.0." - "Please use a node's Hooks to extend the node's behaviour in a pipeline." - "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/04_hooks.html" - ) - with pytest.warns(DeprecationWarning, match=re.escape(pattern)): - pipeline = Pipeline(nodes).decorate(apply_f, apply_g) - catalog = DataCatalog({}, dict(number=1)) - result = SequentialRunner().run(pipeline, catalog) - decorated_nodes = sorted(pipeline.nodes, key=lambda x: x.name) - - assert result["output"] == "g(f(g(f(g(f(1))))))" - assert len(pipeline.nodes) == 3 - assert all(n1.name == n2.name for n1, n2 in zip(nodes, decorated_nodes)) - - def test_empty_apply(self): - """Applying no decorators is valid.""" - identity_node = node(identity, "number", "output", name="identity") - pipeline = Pipeline([identity_node]).decorate() - catalog = DataCatalog({}, dict(number=1)) - result = SequentialRunner().run(pipeline, catalog) - assert result["output"] == 1 - - @pytest.fixture def nodes_with_tags(): return [ @@ -712,13 +675,13 @@ def nodes_with_tags(): class TestPipelineTags: def test_tag_existing_pipeline(self, branchless_pipeline): - pipeline = Pipeline(branchless_pipeline["nodes"]) + pipeline = modular_pipeline(branchless_pipeline["nodes"]) pipeline = pipeline.tag(["new_tag"]) assert all("new_tag" in n.tags for n in pipeline.nodes) def test_pipeline_single_tag(self, branchless_pipeline): - p1 = Pipeline(branchless_pipeline["nodes"], tags="single_tag") - p2 = Pipeline(branchless_pipeline["nodes"]).tag("single_tag") + p1 = modular_pipeline(branchless_pipeline["nodes"], tags="single_tag") + p2 = modular_pipeline(branchless_pipeline["nodes"]).tag("single_tag") for pipeline in (p1, p2): assert all("single_tag" in n.tags for n in pipeline.nodes) @@ -726,7 +689,7 @@ def test_pipeline_single_tag(self, branchless_pipeline): @pytest.fixture def pipeline_with_namespaces(): - return Pipeline( + return modular_pipeline( [ node(identity, "A", "B", name="node1", namespace="katie"), node(identity, "B", "C", name="node2", namespace="lisa"), @@ -804,7 +767,7 @@ class TestPipelineFilterHelpers: ], ) def test_only_nodes_with_tags(self, tags, expected_nodes, nodes_with_tags): - pipeline = Pipeline(nodes_with_tags) + pipeline = modular_pipeline(nodes_with_tags) def get_nodes_with_tags(*tags): p = pipeline.only_nodes_with_tags(*tags) @@ -842,7 +805,7 @@ def test_to_nodes_unknown(self, complex_pipeline): "target_node_names", [["node2", "node3", "node4", "node8"], ["node1"]] ) def test_only_nodes(self, target_node_names, pipeline_list_with_lists): - full = Pipeline(pipeline_list_with_lists["nodes"]) + full = modular_pipeline(pipeline_list_with_lists["nodes"]) partial = full.only_nodes(*target_node_names) target_list = list(target_node_names) names = map(lambda node_: node_.name, partial.nodes) @@ -853,10 +816,66 @@ def test_only_nodes(self, target_node_names, pipeline_list_with_lists): ) def test_only_nodes_unknown(self, pipeline_list_with_lists, target_node_names): pattern = r"Pipeline does not contain nodes" - full = Pipeline(pipeline_list_with_lists["nodes"]) + full = modular_pipeline(pipeline_list_with_lists["nodes"]) with pytest.raises(ValueError, match=pattern): full.only_nodes(*target_node_names) + @pytest.mark.parametrize( + "non_namespaced_node_name", + ["node1", "node2", "node3", "node4", 
"node5", "node6"], + ) + def test_only_nodes_with_namespacing( + self, pipeline_with_namespaces, non_namespaced_node_name + ): + # Tests that error message will supply correct namespaces. + # Example of expected error: + # Pipeline does not contain nodes named ['node1']. Did you mean: ['katie.node1']? + pattern = ( + rf"Pipeline does not contain nodes named \['{non_namespaced_node_name}'\]\. " + rf"Did you mean: \['.*\.{non_namespaced_node_name}'\]\?" + ) + with pytest.raises(ValueError, match=pattern): + pipeline_with_namespaces.only_nodes(non_namespaced_node_name) + + @pytest.mark.parametrize( + "non_namespaced_node_names", + [("node1", "node2"), ("node3", "node4"), ("node5", "node6")], + ) + def test_only_nodes_with_namespacing_multiple_args( + self, pipeline_with_namespaces, non_namespaced_node_names + ): + # Tests that error message will contain suggestions for all provided arguments. + # Example of expected error message: + # "Pipeline does not contain nodes named ['node1', 'node2']. + # Did you mean: ['katie.node1', 'lisa.node2']?" + pattern = ( + rf"(('.*\.{non_namespaced_node_names[0]}')+.*" + rf"('.*\.{non_namespaced_node_names[1]}')+)" + rf"|" # use OR operator because ordering is unspecified + rf"(('.*\.{non_namespaced_node_names[1]}')+.*" + rf"('.*\.{non_namespaced_node_names[0]}')+)" + ) + with pytest.raises(ValueError, match=pattern): + pipeline_with_namespaces.only_nodes(*non_namespaced_node_names) + + @pytest.mark.parametrize( + "non_namespaced_node_names", + [("node1", "invalid_node"), ("invalid_node", "node2")], + ) + def test_only_nodes_with_namespacing_and_invalid_args( + self, pipeline_with_namespaces, non_namespaced_node_names + ): + # Tests error message will still contain namespace suggestions for correct arguments. + # regex is not specific to node names due to unspecified order + # Example of expected error message: + # "Pipeline does not contain nodes named ['node1', 'invalid_node']. + # Did you mean: ['katie.node1']?" + pattern = ( + r"Pipeline does not contain nodes named \[.*\]\. Did you mean: \[.*\]\?" 
+ ) + with pytest.raises(ValueError, match=pattern): + pipeline_with_namespaces.only_nodes(*non_namespaced_node_names) + def test_from_inputs(self, complex_pipeline): """F and H are inputs of node1, node2 and node3.""" new_pipeline = complex_pipeline.from_inputs("F", "H") @@ -905,7 +924,7 @@ def test_only_nodes_with_namespace( @pytest.mark.parametrize("namespace", ["katie", None]) def test_only_nodes_with_namespace_unknown(self, namespace): - pipeline = Pipeline([node(identity, "A", "B", namespace=namespace)]) + pipeline = modular_pipeline([node(identity, "A", "B", namespace=namespace)]) pattern = r"Pipeline does not contain nodes" with pytest.raises(ValueError, match=pattern): pipeline.only_nodes_with_namespace("non_existent") @@ -941,7 +960,7 @@ def test_only_nodes_with_outputs_unknown(self, complex_pipeline): def test_pipeline_to_json(input_data): nodes = input_data["nodes"] - json_rep = Pipeline(nodes).to_json() + json_rep = modular_pipeline(nodes).to_json() for pipeline_node in nodes: assert pipeline_node.name in json_rep assert all(node_input in json_rep for node_input in pipeline_node.inputs) diff --git a/tests/pipeline/test_pipeline_from_missing.py b/tests/pipeline/test_pipeline_from_missing.py index 5876d4e559..f399e70c06 100644 --- a/tests/pipeline/test_pipeline_from_missing.py +++ b/tests/pipeline/test_pipeline_from_missing.py @@ -1,37 +1,11 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
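+# Tests building a pipeline from missing outputs via SequentialRunner.run_only_missing, which takes an explicit hook manager.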
- from unittest import mock import pytest -from kedro.io import DataCatalog, LambdaDataSet -from kedro.pipeline import Pipeline, node +from kedro.framework.hooks import _create_hook_manager +from kedro.io import DataCatalog, LambdaDataset +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import SequentialRunner @@ -48,6 +22,11 @@ def biconcat(input1: str, input2: str): return input1 + input2 # pragma: no cover +@pytest.fixture +def hook_manager(): + return _create_hook_manager() + + @pytest.fixture def branched_pipeline(): # #### Pipeline execution order #### @@ -77,7 +56,7 @@ def branched_pipeline(): # r-out # # ################################## - return Pipeline( + return modular_pipeline( [ node(identity, "A", "B", name="left_in"), node(constant_output, None, "C", name="right_in"), @@ -91,19 +70,19 @@ def branched_pipeline(): def _make_catalog( existent=None, non_existent=None, no_exists_method=None, feed_dict=None ): - """Creates a catalog of existent and non-existent DataSets.""" + """Creates a catalog of existent and non-existent Datasets.""" existent = [] if existent is None else existent non_existent = [] if non_existent is None else non_existent no_exists_method = [] if no_exists_method is None else no_exists_method catalog = DataCatalog(feed_dict=feed_dict) for source in existent: - catalog.add(source, LambdaDataSet(None, None, lambda: True)) + catalog.add(source, LambdaDataset(None, None, lambda: True)) for source in non_existent: - catalog.add(source, LambdaDataSet(None, None, lambda: False)) - # Some LambdaDataSet do not have exists() method + catalog.add(source, LambdaDataset(None, None, lambda: False)) + # Some LambdaDataset do not have exists() method for source in no_exists_method: - catalog.add(source, LambdaDataSet(None, None)) + catalog.add(source, LambdaDataset(None, None)) return catalog @@ -115,135 +94,135 @@ def _pipeline_contains(pipe, nodes): return set(nodes) == {n.name for n in pipe.nodes} -def _from_missing(pipeline, catalog): +def _from_missing(pipeline, catalog, hook_manager): """Create a new pipeline based on missing outputs.""" name = "kedro.runner.runner.AbstractRunner.run" with mock.patch(name) as run: - SequentialRunner().run_only_missing(pipeline, catalog) + SequentialRunner().run_only_missing(pipeline, catalog, hook_manager) _, args, _ = run.mock_calls[0] new_pipeline = args[0] return new_pipeline class TestPipelineMissing: - def test_all_missing(self, branched_pipeline): + def test_all_missing(self, branched_pipeline, hook_manager): catalog = _make_catalog(non_existent=["A", "B", "C", "D", "E", "F"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipelines_equal(branched_pipeline, new_pipeline) - def test_none_missing(self, branched_pipeline): + def test_none_missing(self, branched_pipeline, hook_manager): catalog = _make_catalog(existent=["A", "B", "C", "D", "E", "F"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains(new_pipeline, []) - def test_none_missing_feeddict_only(self, branched_pipeline): + def test_none_missing_feeddict_only(self, branched_pipeline, hook_manager): feed_dict = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6} catalog = _make_catalog(feed_dict=feed_dict) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = 
_from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains(new_pipeline, []) - def test_first_missing(self, branched_pipeline): + def test_first_missing(self, branched_pipeline, hook_manager): """combine from B and C is missing.""" catalog = _make_catalog(non_existent=["B", "C"], existent=["A", "D", "E", "F"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipelines_equal(branched_pipeline, new_pipeline) - def test_only_left_missing(self, branched_pipeline): + def test_only_left_missing(self, branched_pipeline, hook_manager): catalog = _make_catalog(non_existent=["B"], existent=["A", "C", "D", "E", "F"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains( new_pipeline, ["left_in", "combine", "split", "right_out"] ) - def test_last_missing(self, branched_pipeline): + def test_last_missing(self, branched_pipeline, hook_manager): """r-out from F is missing.""" catalog = _make_catalog(non_existent=["F"], existent=["A", "B", "C", "D", "E"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains(new_pipeline, ["split", "right_out"]) - def test_missing_and_no_exists(self, branched_pipeline, caplog): + def test_missing_and_no_exists(self, branched_pipeline, caplog, hook_manager): """If F doesn't have exists(), F is treated as missing.""" catalog = _make_catalog( existent=["A", "B", "C", "D", "E"], no_exists_method=["F"] ) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains(new_pipeline, ["split", "right_out"]) log_record = caplog.records[0] assert log_record.levelname == "WARNING" assert ( - "`exists()` not implemented for `LambdaDataSet`" in log_record.getMessage() + "'exists()' not implemented for 'LambdaDataset'" in log_record.getMessage() ) - def test_all_no_exists_method(self, branched_pipeline, caplog): + def test_all_no_exists_method(self, branched_pipeline, caplog, hook_manager): catalog = _make_catalog(no_exists_method=["A", "B", "C", "D", "E", "F"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipelines_equal(branched_pipeline, new_pipeline) log_msgs = [record.getMessage() for record in caplog.records] expected_msg = ( - "`exists()` not implemented for `LambdaDataSet`. " + "'exists()' not implemented for 'LambdaDataset'. " "Assuming output does not exist." ) assert expected_msg in log_msgs - def test_catalog_and_feed_dict(self, branched_pipeline): + def test_catalog_and_feed_dict(self, branched_pipeline, hook_manager): """Mix of feed_dict and non-existent F.""" catalog = _make_catalog(non_existent=["F"], existent=["D", "E"]) catalog.add_feed_dict({"A": 1, "B": 2, "C": 3}) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains(new_pipeline, ["split", "right_out"]) class TestPipelineUnregistered: - def test_propagate_up(self, branched_pipeline): + def test_propagate_up(self, branched_pipeline, hook_manager): """If a node needs to be rerun and requires unregistered (node-to-node) inputs, all necessary upstream nodes should be added. 
""" catalog = _make_catalog(existent=["A"], non_existent=["E"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains( new_pipeline, ["left_in", "right_in", "combine", "split"] ) - def test_propagate_down_then_up(self, branched_pipeline): + def test_propagate_down_then_up(self, branched_pipeline, hook_manager): """Unregistered (node-to-node) inputs for downstream nodes should be included, too. """ catalog = _make_catalog(existent=["A", "D", "E"], non_existent=["C"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipelines_equal(branched_pipeline, new_pipeline) - def test_ignore_unneccessary_unreg(self, branched_pipeline): + def test_ignore_unneccessary_unreg(self, branched_pipeline, hook_manager): """Unregistered (node-to-node) data sources should not trigger reruns, unless necessary to recreate registered data sources. """ catalog = _make_catalog(existent=["A", "E", "F"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains(new_pipeline, []) - def test_partial_propagation(self, branched_pipeline): + def test_partial_propagation(self, branched_pipeline, hook_manager): """Unregistered (node-to-node) data sources should not trigger reruns, unless necessary to recreate registered data sources. """ catalog = _make_catalog(existent=["A", "D"], no_exists_method=["F"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains(new_pipeline, ["split", "right_out"]) - def test_partial_non_existent_propagation(self, branched_pipeline): + def test_partial_non_existent_propagation(self, branched_pipeline, hook_manager): """A non existent data set whose node has one unregistered input and one existent input should be recalculated correctly. """ catalog = _make_catalog(existent=["A", "C", "E", "F"], non_existent=["D"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains( new_pipeline, ["left_in", "combine", "split", "right_out"] ) - def test_free_output(self, branched_pipeline): + def test_free_output(self, branched_pipeline, hook_manager): """Free outputs are the only unregistered data sources that should trigger runs. """ catalog = _make_catalog(existent=["A", "B", "C", "F"]) - new_pipeline = _from_missing(branched_pipeline, catalog) + new_pipeline = _from_missing(branched_pipeline, catalog, hook_manager) assert _pipeline_contains(new_pipeline, ["combine", "split"]) diff --git a/tests/pipeline/test_pipeline_integration.py b/tests/pipeline/test_pipeline_integration.py index 2625fd3687..dda444c5a8 100644 --- a/tests/pipeline/test_pipeline_integration.py +++ b/tests/pipeline/test_pipeline_integration.py @@ -1,33 +1,6 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - from kedro.io import DataCatalog -from kedro.pipeline import Pipeline, node, pipeline +from kedro.pipeline import node, pipeline +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import SequentialRunner @@ -49,11 +22,11 @@ def test_connect_existing_pipelines(self): Two pipelines exist, the dataset names do not match. We `transform` them to work together. """ - cook_pipeline = Pipeline( + cook_pipeline = modular_pipeline( [node(defrost, "frozen_meat", "meat"), node(grill, "meat", "grilled_meat")] ) - lunch_pipeline = Pipeline([node(eat, "food", "output")]) + lunch_pipeline = modular_pipeline([node(eat, "food", "output")]) pipeline1 = ( pipeline(cook_pipeline, outputs={"grilled_meat": "food"}) + lunch_pipeline @@ -77,14 +50,16 @@ def test_reuse_same_pipeline(self): Normally dataset and node names would conflict, so we need to `transform` the pipelines. """ - cook_pipeline = Pipeline( + cook_pipeline = modular_pipeline( [ node(defrost, "frozen_meat", "meat", name="defrost_node"), node(grill, "meat", "grilled_meat", name="grill_node"), ] ) - breakfast_pipeline = Pipeline([node(eat, "breakfast_food", "breakfast_output")]) - lunch_pipeline = Pipeline([node(eat, "lunch_food", "lunch_output")]) + breakfast_pipeline = modular_pipeline( + [node(eat, "breakfast_food", "breakfast_output")] + ) + lunch_pipeline = modular_pipeline([node(eat, "lunch_food", "lunch_output")]) # We are using two different mechanisms here for breakfast and lunch, # renaming and prefixing pipelines differently. diff --git a/tests/pipeline/test_pipeline_with_transcoding.py b/tests/pipeline/test_pipeline_with_transcoding.py index 5bf73edca6..a5c15f9e68 100644 --- a/tests/pipeline/test_pipeline_with_transcoding.py +++ b/tests/pipeline/test_pipeline_with_transcoding.py @@ -1,36 +1,10 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. from itertools import chain import pytest import kedro -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.pipeline.pipeline import OutputNotUniqueError, _strip_transcoding @@ -109,7 +83,7 @@ def pipeline_with_duplicate_transcoded_inputs(): @pytest.fixture def complex_pipeline(): - pipeline = Pipeline( + pipeline = modular_pipeline( [ node(triconcat, ["H@node1", "I", "M"], "N", name="node1"), node(identity, "H@node2", "I", name="node2"), @@ -142,19 +116,19 @@ def test_grouped_nodes(self, input_data): """Check if grouped_nodes func groups the nodes correctly""" nodes_input = input_data["nodes"] expected = input_data["expected"] - pipeline = Pipeline(nodes_input) + pipeline = modular_pipeline(nodes_input) grouped = pipeline.grouped_nodes # Flatten a list of grouped nodes assert pipeline.nodes == list(chain.from_iterable(grouped)) # Check each grouped node matches with expected group - assert all(g == e for g, e in zip(grouped, expected)) + assert all(set(g) == e for g, e in zip(grouped, expected)) def test_free_input(self, input_data): nodes = input_data["nodes"] inputs = input_data["free_inputs"] - pipeline = Pipeline(nodes) + pipeline = modular_pipeline(nodes) assert pipeline.inputs() == set(inputs) @@ -162,13 +136,13 @@ def test_outputs(self, input_data): nodes = input_data["nodes"] outputs = input_data["outputs"] - pipeline = Pipeline(nodes) + pipeline = modular_pipeline(nodes) assert pipeline.outputs() == set(outputs) def test_pipeline_to_json(self, input_data): nodes = input_data["nodes"] - json_rep = Pipeline(nodes).to_json() + json_rep = modular_pipeline(nodes).to_json() for pipeline_node in nodes: assert pipeline_node.name in json_rep assert all(node_input in json_rep for node_input in pipeline_node.inputs) @@ -185,7 +159,7 @@ def test_transcoded_inputs_outputs(self): pattern = "The following datasets are used with transcoding, " pattern += "but were referenced without the separator: B." 
with pytest.raises(ValueError, match=pattern): - Pipeline( + modular_pipeline( [ node(identity, "A", "B", name="node1"), node(identity, "B@pandas", "C", name="node2"), @@ -196,7 +170,7 @@ def test_transcoded_inputs_outputs(self): def test_duplicates_in_transcoded_outputs(self): with pytest.raises(OutputNotUniqueError, match="['B']"): - Pipeline( + modular_pipeline( [ node(identity, "A", "B@pandas", name="node1"), node(identity, "A", "B@spark", name="node2"), diff --git a/tests/runner/conftest.py b/tests/runner/conftest.py new file mode 100644 index 0000000000..85ef9b9aa5 --- /dev/null +++ b/tests/runner/conftest.py @@ -0,0 +1,168 @@ +from random import random + +import pandas as pd +import pytest + +from kedro.io import DataCatalog, LambdaDataSet, MemoryDataSet +from kedro.pipeline import node, pipeline + + +def source(): + return "stuff" + + +def identity(arg): + return arg + + +def sink(arg): # pylint: disable=unused-argument + pass + + +def fan_in(*args): + return args + + +def exception_fn(*args): + raise Exception("test exception") # pylint: disable=broad-exception-raised + + +def return_none(arg): + arg = None + return arg + + +def return_not_serialisable(arg): # pylint: disable=unused-argument + return lambda x: x + + +def multi_input_list_output(arg1, arg2): + return [arg1, arg2] + + +@pytest.fixture +def conflicting_feed_dict(pandas_df_feed_dict): + ds1 = MemoryDataSet({"data": 0}) + ds3 = pandas_df_feed_dict["ds3"] + return {"ds1": ds1, "ds3": ds3} + + +@pytest.fixture +def pandas_df_feed_dict(): + pandas_df = pd.DataFrame({"Name": ["Alex", "Bob"], "Age": [15, 25]}) + return {"ds3": pandas_df} + + +@pytest.fixture +def catalog(): + return DataCatalog() + + +@pytest.fixture +def memory_catalog(): + ds1 = MemoryDataSet({"data": 42}) + ds2 = MemoryDataSet([1, 2, 3, 4, 5]) + return DataCatalog({"ds1": ds1, "ds2": ds2}) + + +@pytest.fixture +def persistent_dataset_catalog(): + def _load(): + return 0 + + # pylint: disable=unused-argument + def _save(arg): + pass + + persistent_dataset = LambdaDataSet(load=_load, save=_save) + return DataCatalog( + { + "ds0_A": persistent_dataset, + "ds0_B": persistent_dataset, + "ds2_A": persistent_dataset, + "ds2_B": persistent_dataset, + } + ) + + +@pytest.fixture +def fan_out_fan_in(): + return pipeline( + [ + node(identity, "A", "B"), + node(identity, "B", "C"), + node(identity, "B", "D"), + node(identity, "B", "E"), + node(fan_in, ["C", "D", "E"], "Z"), + ] + ) + + +@pytest.fixture +def branchless_no_input_pipeline(): + """The pipeline runs in the order A->B->C->D->E.""" + return pipeline( + [ + node(identity, "D", "E", name="node1"), + node(identity, "C", "D", name="node2"), + node(identity, "A", "B", name="node3"), + node(identity, "B", "C", name="node4"), + node(random, None, "A", name="node5"), + ] + ) + + +@pytest.fixture +def branchless_pipeline(): + return pipeline( + [ + node(identity, "ds1", "ds2", name="node1"), + node(identity, "ds2", "ds3", name="node2"), + ] + ) + + +@pytest.fixture +def saving_result_pipeline(): + return pipeline([node(identity, "ds", "dsX")]) + + +@pytest.fixture +def saving_none_pipeline(): + return pipeline( + [node(random, None, "A"), node(return_none, "A", "B"), node(identity, "B", "C")] + ) + + +@pytest.fixture +def unfinished_outputs_pipeline(): + return pipeline( + [ + node(identity, {"arg": "ds4"}, "ds8", name="node1"), + node(sink, "ds7", None, name="node2"), + node(multi_input_list_output, ["ds3", "ds4"], ["ds6", "ds7"], name="node3"), + node(identity, "ds2", "ds5", name="node4"), + node(identity, "ds1", 
"ds4", name="node5"), + ] + ) # Outputs: ['ds8', 'ds5', 'ds6'] == ['ds1', 'ds2', 'ds3'] + + +@pytest.fixture +def two_branches_crossed_pipeline(): + """A ``Pipeline`` with an X-shape (two branches with one common node)""" + return pipeline( + [ + node(identity, "ds0_A", "ds1_A", name="node1_A"), + node(identity, "ds0_B", "ds1_B", name="node1_B"), + node( + multi_input_list_output, + ["ds1_A", "ds1_B"], + ["ds2_A", "ds2_B"], + name="node2", + ), + node(identity, "ds2_A", "ds3_A", name="node3_A"), + node(identity, "ds2_B", "ds3_B", name="node3_B"), + node(identity, "ds3_A", "ds4_A", name="node4_A"), + node(identity, "ds3_B", "ds4_B", name="node4_B"), + ] + ) diff --git a/tests/runner/test_parallel_runner.py b/tests/runner/test_parallel_runner.py index f6b76429f0..a74cff8d53 100644 --- a/tests/runner/test_parallel_runner.py +++ b/tests/runner/test_parallel_runner.py @@ -1,100 +1,43 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
+from __future__ import annotations +import importlib import sys from concurrent.futures.process import ProcessPoolExecutor -from typing import Any, Dict +from typing import Any import pytest +from kedro.framework.hooks import _create_hook_manager from kedro.io import ( AbstractDataSet, DataCatalog, - DataSetError, - LambdaDataSet, - MemoryDataSet, + DatasetError, + LambdaDataset, + MemoryDataset, ) -from kedro.pipeline import Pipeline, node -from kedro.pipeline.decorators import log_time +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner from kedro.runner.parallel_runner import ( _MAX_WINDOWS_WORKERS, ParallelRunnerManager, _run_node_synchronization, - _SharedMemoryDataSet, + _SharedMemoryDataset, +) +from tests.runner.conftest import ( + exception_fn, + identity, + return_none, + return_not_serialisable, + sink, + source, ) -def source(): - return "stuff" - - -def identity(arg): - return arg - - -def sink(arg): # pylint: disable=unused-argument - pass - - -def fan_in(*args): - return args - - -def exception_fn(arg): - raise Exception("test exception") - - -def return_none(arg): - arg = None - return arg - - -def return_not_serializable(arg): # pylint: disable=unused-argument - return lambda x: x - - -@pytest.fixture -def catalog(): - return DataCatalog() - - -@pytest.fixture -def fan_out_fan_in(): - return Pipeline( - [ - node(identity, "A", "B"), - node(identity, "B", "C"), - node(identity, "B", "D"), - node(identity, "B", "E"), - node(fan_in, ["C", "D", "E"], "Z"), - ] - ) +def test_deprecation(): + class_name = "_SharedMemoryDataSet" + with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + getattr(importlib.import_module("kedro.runner.parallel_runner"), class_name) @pytest.mark.skipif( @@ -104,20 +47,30 @@ class TestValidParallelRunner: def test_create_default_data_set(self): # data_set is a proxy to a dataset in another process. 
data_set = ParallelRunner().create_default_data_set("") - assert isinstance(data_set, _SharedMemoryDataSet) + assert isinstance(data_set, _SharedMemoryDataset) @pytest.mark.parametrize("is_async", [False, True]) def test_parallel_run(self, is_async, fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) + catalog.add_feed_dict({"A": 42}) result = ParallelRunner(is_async=is_async).run(fan_out_fan_in, catalog) assert "Z" in result assert len(result["Z"]) == 3 assert result["Z"] == (42, 42, 42) @pytest.mark.parametrize("is_async", [False, True]) - def test_memory_data_set_input(self, is_async, fan_out_fan_in): - pipeline = Pipeline([fan_out_fan_in]) - catalog = DataCatalog({"A": MemoryDataSet("42")}) + def test_parallel_run_with_plugin_manager(self, is_async, fan_out_fan_in, catalog): + catalog.add_feed_dict({"A": 42}) + result = ParallelRunner(is_async=is_async).run( + fan_out_fan_in, catalog, hook_manager=_create_hook_manager() + ) + assert "Z" in result + assert len(result["Z"]) == 3 + assert result["Z"] == (42, 42, 42) + + @pytest.mark.parametrize("is_async", [False, True]) + def test_memory_dataset_input(self, is_async, fan_out_fan_in): + pipeline = modular_pipeline([fan_out_fan_in]) + catalog = DataCatalog({"A": MemoryDataset("42")}) result = ParallelRunner(is_async=is_async).run(pipeline, catalog) assert "Z" in result assert len(result["Z"]) == 3 @@ -150,7 +103,7 @@ def test_specified_max_workers_bellow_cpu_cores_count( cpu_cores, user_specified_number, expected_number, - ): # pylint: disable=too-many-arguments + ): # noqa: too-many-arguments """ The system has 2 cores, but we initialize the runner with max_workers=4. `fan_out_fan_in` pipeline needs 3 processes. @@ -163,7 +116,7 @@ def test_specified_max_workers_bellow_cpu_cores_count( wraps=ProcessPoolExecutor, ) - catalog.add_feed_dict(dict(A=42)) + catalog.add_feed_dict({"A": 42}) result = ParallelRunner( max_workers=user_specified_number, is_async=is_async ).run(fan_out_fan_in, catalog) @@ -189,36 +142,38 @@ def test_max_worker_windows(self, mocker): @pytest.mark.parametrize("is_async", [False, True]) class TestInvalidParallelRunner: def test_task_validation(self, is_async, fan_out_fan_in, catalog): - """ParallelRunner cannot serialize the lambda function.""" - catalog.add_feed_dict(dict(A=42)) - pipeline = Pipeline([fan_out_fan_in, node(lambda x: x, "Z", "X")]) + """ParallelRunner cannot serialise the lambda function.""" + catalog.add_feed_dict({"A": 42}) + pipeline = modular_pipeline([fan_out_fan_in, node(lambda x: x, "Z", "X")]) with pytest.raises(AttributeError): ParallelRunner(is_async=is_async).run(pipeline, catalog) def test_task_exception(self, is_async, fan_out_fan_in, catalog): - catalog.add_feed_dict(feed_dict=dict(A=42)) - pipeline = Pipeline([fan_out_fan_in, node(exception_fn, "Z", "X")]) + catalog.add_feed_dict(feed_dict={"A": 42}) + pipeline = modular_pipeline([fan_out_fan_in, node(exception_fn, "Z", "X")]) with pytest.raises(Exception, match="test exception"): ParallelRunner(is_async=is_async).run(pipeline, catalog) - def test_memory_data_set_output(self, is_async, fan_out_fan_in): + def test_memory_dataset_output(self, is_async, fan_out_fan_in): """ParallelRunner does not support output to externally - created MemoryDataSets. + created MemoryDatasets. 
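+        Data saved to such a dataset in a worker process would not be visible to the main process.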
""" - pipeline = Pipeline([fan_out_fan_in]) - catalog = DataCatalog({"C": MemoryDataSet()}, dict(A=42)) + pipeline = modular_pipeline([fan_out_fan_in]) + catalog = DataCatalog({"C": MemoryDataset()}, {"A": 42}) with pytest.raises(AttributeError, match="['C']"): ParallelRunner(is_async=is_async).run(pipeline, catalog) def test_node_returning_none(self, is_async): - pipeline = Pipeline([node(identity, "A", "B"), node(return_none, "B", "C")]) - catalog = DataCatalog({"A": MemoryDataSet("42")}) - pattern = "Saving `None` to a `DataSet` is not allowed" - with pytest.raises(DataSetError, match=pattern): + pipeline = modular_pipeline( + [node(identity, "A", "B"), node(return_none, "B", "C")] + ) + catalog = DataCatalog({"A": MemoryDataset("42")}) + pattern = "Saving 'None' to a 'Dataset' is not allowed" + with pytest.raises(DatasetError, match=pattern): ParallelRunner(is_async=is_async).run(pipeline, catalog) - def test_data_set_not_serializable(self, is_async, fan_out_fan_in): - """Data set A cannot be serializable because _load and _save are not + def test_data_set_not_serialisable(self, is_async, fan_out_fan_in): + """Data set A cannot be serialisable because _load and _save are not defined in global scope. """ @@ -228,24 +183,24 @@ def _load(): def _save(arg): assert arg == 0 # pragma: no cover - # Data set A cannot be serialized - catalog = DataCatalog({"A": LambdaDataSet(load=_load, save=_save)}) + # Data set A cannot be serialised + catalog = DataCatalog({"A": LambdaDataset(load=_load, save=_save)}) - pipeline = Pipeline([fan_out_fan_in]) + pipeline = modular_pipeline([fan_out_fan_in]) with pytest.raises(AttributeError, match="['A']"): ParallelRunner(is_async=is_async).run(pipeline, catalog) - def test_memory_dataset_not_serializable(self, is_async, catalog): - """Memory dataset cannot be serializable because of data it stores.""" - data = return_not_serializable(None) - pipeline = Pipeline([node(return_not_serializable, "A", "B")]) - catalog.add_feed_dict(feed_dict=dict(A=42)) + def test_memory_dataset_not_serialisable(self, is_async, catalog): + """Memory dataset cannot be serialisable because of data it stores.""" + data = return_not_serialisable(None) + pipeline = modular_pipeline([node(return_not_serialisable, "A", "B")]) + catalog.add_feed_dict(feed_dict={"A": 42}) pattern = ( - r"{0} cannot be serialized. ParallelRunner implicit memory datasets " - r"can only be used with serializable data".format(str(data.__class__)) + rf"{str(data.__class__)} cannot be serialised. ParallelRunner implicit " + rf"memory datasets can only be used with serialisable data" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DatasetError, match=pattern): ParallelRunner(is_async=is_async).run(pipeline, catalog) def test_unable_to_schedule_all_nodes( @@ -254,7 +209,7 @@ def test_unable_to_schedule_all_nodes( """Test the error raised when `futures` variable is empty, but `todo_nodes` is not (can barely happen in real life). 
""" - catalog.add_feed_dict(dict(A=42)) + catalog.add_feed_dict({"A": 42}) runner = ParallelRunner(is_async=is_async) real_node_deps = fan_out_fan_in.node_dependencies @@ -273,49 +228,7 @@ def test_unable_to_schedule_all_nodes( runner.run(fan_out_fan_in, catalog) -@log_time -def decorated_identity(*args, **kwargs): - return identity(*args, **kwargs) - - -@pytest.fixture -def decorated_fan_out_fan_in(): - return Pipeline( - [ - node(decorated_identity, "A", "B"), - node(decorated_identity, "B", "C"), - node(decorated_identity, "B", "D"), - node(decorated_identity, "B", "E"), - node(fan_in, ["C", "D", "E"], "Z"), - ] - ) - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="Due to bug in parallel runner" -) -@pytest.mark.parametrize("is_async", [False, True]) -class TestParallelRunnerDecorator: - def test_decorate_pipeline(self, is_async, fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) - result = ParallelRunner(is_async=is_async).run( - fan_out_fan_in.decorate(log_time), catalog - ) - assert "Z" in result - assert len(result["Z"]) == 3 - assert result["Z"] == (42, 42, 42) - - def test_decorated_nodes(self, is_async, decorated_fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) - result = ParallelRunner(is_async=is_async).run( - decorated_fan_out_fan_in, catalog - ) - assert "Z" in result - assert len(result["Z"]) == 3 - assert result["Z"] == (42, 42, 42) - - -class LoggingDataSet(AbstractDataSet): +class LoggingDataset(AbstractDataSet): def __init__(self, log, name, value=None): self.log = log self.name = name @@ -332,14 +245,12 @@ def _release(self) -> None: self.log.append(("release", self.name)) self.value = None - def _describe(self) -> Dict[str, Any]: + def _describe(self) -> dict[str, Any]: return {} if not sys.platform.startswith("win"): - ParallelRunnerManager.register( # pylint: disable=no-member - "LoggingDataSet", LoggingDataSet - ) + ParallelRunnerManager.register("LoggingDataset", LoggingDataset) # noqa: no-member @pytest.mark.skipif( @@ -351,15 +262,15 @@ def test_dont_release_inputs_and_outputs(self, is_async): runner = ParallelRunner(is_async=is_async) log = runner._manager.list() - pipeline = Pipeline( + pipeline = modular_pipeline( [node(identity, "in", "middle"), node(identity, "middle", "out")] ) - # pylint: disable=no-member + # noqa: no-member catalog = DataCatalog( { - "in": runner._manager.LoggingDataSet(log, "in", "stuff"), - "middle": runner._manager.LoggingDataSet(log, "middle"), - "out": runner._manager.LoggingDataSet(log, "out"), + "in": runner._manager.LoggingDataset(log, "in", "stuff"), + "middle": runner._manager.LoggingDataset(log, "middle"), + "out": runner._manager.LoggingDataset(log, "out"), } ) ParallelRunner().run(pipeline, catalog) @@ -371,18 +282,18 @@ def test_release_at_earliest_opportunity(self, is_async): runner = ParallelRunner(is_async=is_async) log = runner._manager.list() - pipeline = Pipeline( + pipeline = modular_pipeline( [ node(source, None, "first"), node(identity, "first", "second"), node(sink, "second", None), ] ) - # pylint: disable=no-member + # noqa: no-member catalog = DataCatalog( { - "first": runner._manager.LoggingDataSet(log, "first"), - "second": runner._manager.LoggingDataSet(log, "second"), + "first": runner._manager.LoggingDataset(log, "first"), + "second": runner._manager.LoggingDataset(log, "second"), } ) runner.run(pipeline, catalog) @@ -399,16 +310,16 @@ def test_count_multiple_loads(self, is_async): runner = ParallelRunner(is_async=is_async) log = runner._manager.list() - pipeline = 
Pipeline( + pipeline = modular_pipeline( [ node(source, None, "dataset"), node(sink, "dataset", None, name="bob"), node(sink, "dataset", None, name="fred"), ] ) - # pylint: disable=no-member + # noqa: no-member catalog = DataCatalog( - {"dataset": runner._manager.LoggingDataSet(log, "dataset")} + {"dataset": runner._manager.LoggingDataset(log, "dataset")} ) runner.run(pipeline, catalog) @@ -423,13 +334,13 @@ def test_release_transcoded(self, is_async): runner = ParallelRunner(is_async=is_async) log = runner._manager.list() - pipeline = Pipeline( + pipeline = modular_pipeline( [node(source, None, "ds@save"), node(sink, "ds@load", None)] ) catalog = DataCatalog( { - "ds@save": LoggingDataSet(log, "save"), - "ds@load": LoggingDataSet(log, "load"), + "ds@save": LoggingDataset(log, "save"), + "ds@load": LoggingDataset(log, "load"), } ) @@ -458,53 +369,30 @@ def mock_run_node(self, mocker): def mock_configure_project(self, mocker): return mocker.patch("kedro.framework.project.configure_project") - @pytest.mark.parametrize("conf_logging", [{"fake_logging_config": True}, dict()]) def test_package_name_and_logging_provided( self, mock_logging, mock_run_node, mock_configure_project, is_async, - conf_logging, mocker, ): mocker.patch("multiprocessing.get_start_method", return_value="spawn") node_ = mocker.sentinel.node catalog = mocker.sentinel.catalog - run_id = "fake_run_id" + session_id = "fake_session_id" package_name = mocker.sentinel.package_name _run_node_synchronization( node_, catalog, is_async, - run_id, + session_id, package_name=package_name, - conf_logging=conf_logging, - ) - mock_run_node.assert_called_once_with(node_, catalog, is_async, run_id) - mock_logging.assert_called_once_with(conf_logging) - mock_configure_project.assert_called_once_with(package_name) - - def test_package_name_provided( - self, - mock_logging, - mock_run_node, - mock_configure_project, - is_async, - mocker, - ): - mocker.patch("multiprocessing.get_start_method", return_value="spawn") - node_ = mocker.sentinel.node - catalog = mocker.sentinel.catalog - run_id = "fake_run_id" - package_name = mocker.sentinel.package_name - - _run_node_synchronization( - node_, catalog, is_async, run_id, package_name=package_name + logging_config={"fake_logging_config": True}, ) - mock_run_node.assert_called_once_with(node_, catalog, is_async, run_id) - mock_logging.assert_called_once_with({}) + mock_run_node.assert_called_once() + mock_logging.assert_called_once_with({"fake_logging_config": True}) mock_configure_project.assert_called_once_with(package_name) def test_package_name_not_provided( @@ -513,11 +401,11 @@ def test_package_name_not_provided( mocker.patch("multiprocessing.get_start_method", return_value="fork") node_ = mocker.sentinel.node catalog = mocker.sentinel.catalog - run_id = "fake_run_id" + session_id = "fake_session_id" package_name = mocker.sentinel.package_name _run_node_synchronization( - node_, catalog, is_async, run_id, package_name=package_name + node_, catalog, is_async, session_id, package_name=package_name ) - mock_run_node.assert_called_once_with(node_, catalog, is_async, run_id) + mock_run_node.assert_called_once() mock_logging.assert_not_called() diff --git a/tests/runner/test_run_node.py b/tests/runner/test_run_node.py new file mode 100644 index 0000000000..ad95b4838b --- /dev/null +++ b/tests/runner/test_run_node.py @@ -0,0 +1,89 @@ +import pytest + +from kedro.framework.hooks.manager import _NullPluginManager +from kedro.pipeline import node +from kedro.runner import run_node + + +def generate_one(): 
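+    # Yields ten values one at a time; the tests below assert that each yield triggers a separate dataset save.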
+ yield from range(10) + + +def generate_tuple(): + for i in range(10): + yield i, i * i + + +def generate_list(): + for i in range(10): + yield [i, i * i] + + +def generate_dict(): + for i in range(10): + yield {"idx": i, "square": i * i} + + +class TestRunGeneratorNode: + def test_generator_fail_async(self, mocker, catalog): + fake_dataset = mocker.Mock() + catalog.add("result", fake_dataset) + n = node(generate_one, inputs=None, outputs="result") + + with pytest.raises(Exception, match="nodes wrapping generator functions"): + run_node(n, catalog, _NullPluginManager(), is_async=True) + + def test_generator_node_one(self, mocker, catalog): + fake_dataset = mocker.Mock() + catalog.add("result", fake_dataset) + n = node(generate_one, inputs=None, outputs="result") + run_node(n, catalog, _NullPluginManager()) + + expected = [((i,),) for i in range(10)] + assert 10 == fake_dataset.save.call_count + assert fake_dataset.save.call_args_list == expected + + def test_generator_node_tuple(self, mocker, catalog): + left = mocker.Mock() + right = mocker.Mock() + catalog.add("left", left) + catalog.add("right", right) + n = node(generate_tuple, inputs=None, outputs=["left", "right"]) + run_node(n, catalog, _NullPluginManager()) + + expected_left = [((i,),) for i in range(10)] + expected_right = [((i * i,),) for i in range(10)] + assert 10 == left.save.call_count + assert left.save.call_args_list == expected_left + assert 10 == right.save.call_count + assert right.save.call_args_list == expected_right + + def test_generator_node_list(self, mocker, catalog): + left = mocker.Mock() + right = mocker.Mock() + catalog.add("left", left) + catalog.add("right", right) + n = node(generate_list, inputs=None, outputs=["left", "right"]) + run_node(n, catalog, _NullPluginManager()) + + expected_left = [((i,),) for i in range(10)] + expected_right = [((i * i,),) for i in range(10)] + assert 10 == left.save.call_count + assert left.save.call_args_list == expected_left + assert 10 == right.save.call_count + assert right.save.call_args_list == expected_right + + def test_generator_node_dict(self, mocker, catalog): + left = mocker.Mock() + right = mocker.Mock() + catalog.add("left", left) + catalog.add("right", right) + n = node(generate_dict, inputs=None, outputs={"idx": "left", "square": "right"}) + run_node(n, catalog, _NullPluginManager()) + + expected_left = [((i,),) for i in range(10)] + expected_right = [((i * i,),) for i in range(10)] + assert 10 == left.save.call_count + assert left.save.call_args_list == expected_left + assert 10 == right.save.call_count + assert right.save.call_args_list == expected_right diff --git a/tests/runner/test_sequential_runner.py b/tests/runner/test_sequential_runner.py index a0608f2247..cf91b76c49 100644 --- a/tests/runner/test_sequential_runner.py +++ b/tests/runner/test_sequential_runner.py @@ -1,130 +1,40 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint: disable=unused-argument -from random import random -from typing import Any, Dict +from __future__ import annotations + +import re +from typing import Any import pandas as pd import pytest -from kedro.io import ( - AbstractDataSet, - DataCatalog, - DataSetError, - LambdaDataSet, - MemoryDataSet, -) -from kedro.pipeline import Pipeline, node +from kedro.framework.hooks import _create_hook_manager +from kedro.io import AbstractDataSet, DataCatalog, DatasetError, LambdaDataset +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import SequentialRunner +from tests.runner.conftest import exception_fn, identity, sink, source -@pytest.fixture -def memory_catalog(): - ds1 = MemoryDataSet({"data": 42}) - ds2 = MemoryDataSet([1, 2, 3, 4, 5]) - return DataCatalog({"ds1": ds1, "ds2": ds2}) - - -@pytest.fixture -def pandas_df_feed_dict(): - pandas_df = pd.DataFrame({"Name": ["Alex", "Bob"], "Age": [15, 25]}) - return {"ds3": pandas_df} - - -@pytest.fixture -def conflicting_feed_dict(pandas_df_feed_dict): - ds1 = MemoryDataSet({"data": 0}) - ds3 = pandas_df_feed_dict["ds3"] - return {"ds1": ds1, "ds3": ds3} - - -def source(): - return "stuff" - - -def identity(arg): - return arg - - -def sink(arg): - pass - - -def return_none(arg): - return None - - -def multi_input_list_output(arg1, arg2): - return [arg1, arg2] - - -@pytest.fixture -def branchless_no_input_pipeline(): - """The pipeline runs in the order A->B->C->D->E.""" - return Pipeline( - [ - node(identity, "D", "E", name="node1"), - node(identity, "C", "D", name="node2"), - node(identity, "A", "B", name="node3"), - node(identity, "B", "C", name="node4"), - node(random, None, "A", name="node5"), - ] - ) - - -@pytest.fixture -def branchless_pipeline(): - return Pipeline( - [ - node(identity, "ds1", "ds2", name="node1"), - node(identity, "ds2", "ds3", name="node2"), - ] - ) - - -@pytest.fixture -def saving_result_pipeline(): - return Pipeline([node(identity, "ds", "dsX")]) - +class TestValidSequentialRunner: + def test_run_with_plugin_manager(self, fan_out_fan_in, catalog): + catalog.add_feed_dict({"A": 42}) + result = SequentialRunner().run( + fan_out_fan_in, catalog, hook_manager=_create_hook_manager() + ) + assert "Z" in result + assert result["Z"] == (42, 42, 42) -@pytest.fixture -def saving_none_pipeline(): - return Pipeline( - [node(random, None, "A"), node(return_none, "A", "B"), node(identity, "B", "C")] - ) + def test_run_without_plugin_manager(self, fan_out_fan_in, catalog): + catalog.add_feed_dict({"A": 42}) + result = 
SequentialRunner().run(fan_out_fan_in, catalog) + assert "Z" in result + assert result["Z"] == (42, 42, 42) @pytest.mark.parametrize("is_async", [False, True]) class TestSeqentialRunnerBranchlessPipeline: - def test_no_input_seq(self, is_async, branchless_no_input_pipeline): + def test_no_input_seq(self, is_async, branchless_no_input_pipeline, catalog): outputs = SequentialRunner(is_async=is_async).run( - branchless_no_input_pipeline, DataCatalog() + branchless_no_input_pipeline, catalog ) assert "E" in outputs assert len(outputs) == 1 @@ -142,10 +52,10 @@ def test_no_feed(self, is_async, memory_catalog, branchless_pipeline): assert "ds3" in outputs assert outputs["ds3"]["data"] == 42 - def test_node_returning_none(self, is_async, saving_none_pipeline): - pattern = "Saving `None` to a `DataSet` is not allowed" - with pytest.raises(DataSetError, match=pattern): - SequentialRunner(is_async=is_async).run(saving_none_pipeline, DataCatalog()) + def test_node_returning_none(self, is_async, saving_none_pipeline, catalog): + pattern = "Saving 'None' to a 'Dataset' is not allowed" + with pytest.raises(DatasetError, match=pattern): + SequentialRunner(is_async=is_async).run(saving_none_pipeline, catalog) def test_result_saved_not_returned(self, is_async, saving_result_pipeline): """The pipeline runs ds->dsX but save does not save the output.""" @@ -158,8 +68,8 @@ def _save(arg): catalog = DataCatalog( { - "ds": LambdaDataSet(load=_load, save=_save), - "dsX": LambdaDataSet(load=_load, save=_save), + "ds": LambdaDataset(load=_load, save=_save), + "dsX": LambdaDataset(load=_load, save=_save), } ) output = SequentialRunner(is_async=is_async).run( @@ -169,23 +79,14 @@ def _save(arg): assert output == {} -@pytest.fixture -def unfinished_outputs_pipeline(): - return Pipeline( - [ - node(identity, dict(arg="ds4"), "ds8", name="node1"), - node(sink, "ds7", None, name="node2"), - node(multi_input_list_output, ["ds3", "ds4"], ["ds6", "ds7"], name="node3"), - node(identity, "ds2", "ds5", name="node4"), - node(identity, "ds1", "ds4", name="node5"), - ] - ) # Outputs: ['ds8', 'ds5', 'ds6'] == ['ds1', 'ds2', 'ds3'] - - @pytest.mark.parametrize("is_async", [False, True]) -class TestSeqentialRunnerBranchedPipeline: +class TestSequentialRunnerBranchedPipeline: def test_input_seq( - self, is_async, memory_catalog, unfinished_outputs_pipeline, pandas_df_feed_dict + self, + is_async, + memory_catalog, + unfinished_outputs_pipeline, + pandas_df_feed_dict, ): memory_catalog.add_feed_dict(pandas_df_feed_dict, replace=True) outputs = SequentialRunner(is_async=is_async).run( @@ -216,15 +117,15 @@ def test_conflict_feed_catalog( assert outputs["ds8"]["data"] == 0 assert isinstance(outputs["ds6"], pd.DataFrame) - def test_unsatisfied_inputs(self, is_async, unfinished_outputs_pipeline): + def test_unsatisfied_inputs(self, is_async, unfinished_outputs_pipeline, catalog): """ds1, ds2 and ds3 were not specified.""" with pytest.raises(ValueError, match=r"not found in the DataCatalog"): SequentialRunner(is_async=is_async).run( - unfinished_outputs_pipeline, DataCatalog() + unfinished_outputs_pipeline, catalog ) -class LoggingDataSet(AbstractDataSet): +class LoggingDataset(AbstractDataSet): def __init__(self, log, name, value=None): self.log = log self.name = name @@ -241,7 +142,7 @@ def _release(self) -> None: self.log.append(("release", self.name)) self.value = None - def _describe(self) -> Dict[str, Any]: + def _describe(self) -> dict[str, Any]: return {} @@ -249,24 +150,24 @@ def _describe(self) -> Dict[str, Any]: class 
TestSequentialRunnerRelease: def test_dont_release_inputs_and_outputs(self, is_async): log = [] - pipeline = Pipeline( + test_pipeline = modular_pipeline( [node(identity, "in", "middle"), node(identity, "middle", "out")] ) catalog = DataCatalog( { - "in": LoggingDataSet(log, "in", "stuff"), - "middle": LoggingDataSet(log, "middle"), - "out": LoggingDataSet(log, "out"), + "in": LoggingDataset(log, "in", "stuff"), + "middle": LoggingDataset(log, "middle"), + "out": LoggingDataset(log, "out"), } ) - SequentialRunner(is_async=is_async).run(pipeline, catalog) + SequentialRunner(is_async=is_async).run(test_pipeline, catalog) # we don't want to see release in or out in here assert log == [("load", "in"), ("load", "middle"), ("release", "middle")] def test_release_at_earliest_opportunity(self, is_async): log = [] - pipeline = Pipeline( + test_pipeline = modular_pipeline( [ node(source, None, "first"), node(identity, "first", "second"), @@ -275,11 +176,11 @@ def test_release_at_earliest_opportunity(self, is_async): ) catalog = DataCatalog( { - "first": LoggingDataSet(log, "first"), - "second": LoggingDataSet(log, "second"), + "first": LoggingDataset(log, "first"), + "second": LoggingDataset(log, "second"), } ) - SequentialRunner(is_async=is_async).run(pipeline, catalog) + SequentialRunner(is_async=is_async).run(test_pipeline, catalog) # we want to see "release first" before "load second" assert log == [ @@ -291,41 +192,41 @@ def test_release_at_earliest_opportunity(self, is_async): def test_count_multiple_loads(self, is_async): log = [] - pipeline = Pipeline( + test_pipeline = modular_pipeline( [ node(source, None, "dataset"), node(sink, "dataset", None, name="bob"), node(sink, "dataset", None, name="fred"), ] ) - catalog = DataCatalog({"dataset": LoggingDataSet(log, "dataset")}) - SequentialRunner(is_async=is_async).run(pipeline, catalog) + catalog = DataCatalog({"dataset": LoggingDataset(log, "dataset")}) + SequentialRunner(is_async=is_async).run(test_pipeline, catalog) # we want to the release after both the loads assert log == [("load", "dataset"), ("load", "dataset"), ("release", "dataset")] def test_release_transcoded(self, is_async): log = [] - pipeline = Pipeline( + test_pipeline = modular_pipeline( [node(source, None, "ds@save"), node(sink, "ds@load", None)] ) catalog = DataCatalog( { - "ds@save": LoggingDataSet(log, "save"), - "ds@load": LoggingDataSet(log, "load"), + "ds@save": LoggingDataset(log, "save"), + "ds@load": LoggingDataset(log, "load"), } ) - SequentialRunner(is_async=is_async).run(pipeline, catalog) + SequentialRunner(is_async=is_async).run(test_pipeline, catalog) # we want to see both datasets being released assert log == [("release", "save"), ("load", "load"), ("release", "load")] @pytest.mark.parametrize( - "pipeline", + "test_pipeline", [ - Pipeline([node(identity, "ds1", "ds2", confirms="ds1")]), - Pipeline( + modular_pipeline([node(identity, "ds1", "ds2", confirms="ds1")]), + modular_pipeline( [ node(identity, "ds1", "ds2"), node(identity, "ds2", None, confirms="ds1"), @@ -333,8 +234,43 @@ def test_release_transcoded(self, is_async): ), ], ) - def test_confirms(self, mocker, pipeline, is_async): + def test_confirms(self, mocker, test_pipeline, is_async): fake_dataset_instance = mocker.Mock() catalog = DataCatalog(data_sets={"ds1": fake_dataset_instance}) - SequentialRunner(is_async=is_async).run(pipeline, catalog) + SequentialRunner(is_async=is_async).run(test_pipeline, catalog) fake_dataset_instance.confirm.assert_called_once_with() + + +@pytest.mark.parametrize( + 
"failing_node_names,expected_pattern", + [ + (["node1_A"], r"No nodes ran."), + (["node2"], r"(node1_A,node1_B|node1_B,node1_A)"), + (["node3_A"], r"(node3_A,node3_B|node3_B,node3_A)"), + (["node4_A"], r"(node3_A,node3_B|node3_B,node3_A)"), + (["node3_A", "node4_A"], r"(node3_A,node3_B|node3_B,node3_A)"), + (["node2", "node4_A"], r"(node1_A,node1_B|node1_B,node1_A)"), + ], +) +class TestSuggestResumeScenario: + def test_suggest_resume_scenario( + self, + caplog, + two_branches_crossed_pipeline, + persistent_dataset_catalog, + failing_node_names, + expected_pattern, + ): + nodes = {n.name: n for n in two_branches_crossed_pipeline.nodes} + for name in failing_node_names: + two_branches_crossed_pipeline -= modular_pipeline([nodes[name]]) + two_branches_crossed_pipeline += modular_pipeline( + [nodes[name]._copy(func=exception_fn)] + ) + with pytest.raises(Exception): + SequentialRunner().run( + two_branches_crossed_pipeline, + persistent_dataset_catalog, + hook_manager=_create_hook_manager(), + ) + assert re.search(expected_pattern, caplog.text) diff --git a/tests/runner/test_thread_runner.py b/tests/runner/test_thread_runner.py index 3afd517cc6..a9348548a7 100644 --- a/tests/runner/test_thread_runner.py +++ b/tests/runner/test_thread_runner.py @@ -1,98 +1,39 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
+from __future__ import annotations from concurrent.futures import ThreadPoolExecutor -from typing import Any, Dict +from typing import Any import pytest -from kedro.io import AbstractDataSet, DataCatalog, DataSetError, MemoryDataSet -from kedro.pipeline import Pipeline, node -from kedro.pipeline.decorators import log_time +from kedro.framework.hooks import _create_hook_manager +from kedro.io import AbstractDataSet, DataCatalog, DatasetError, MemoryDataset +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ThreadRunner - - -def source(): - return "stuff" - - -def identity(arg): - return arg - - -def sink(arg): # pylint: disable=unused-argument - pass - - -def fan_in(*args): - return args - - -def exception_fn(arg): - raise Exception("test exception") - - -def return_none(arg): - arg = None - return arg - - -@pytest.fixture -def catalog(): - return DataCatalog() - - -@pytest.fixture -def fan_out_fan_in(): - return Pipeline( - [ - node(identity, "A", "B"), - node(identity, "B", "C"), - node(identity, "B", "D"), - node(identity, "B", "E"), - node(fan_in, ["C", "D", "E"], "Z"), - ] - ) +from tests.runner.conftest import exception_fn, identity, return_none, sink, source class TestValidThreadRunner: def test_create_default_data_set(self): data_set = ThreadRunner().create_default_data_set("") - assert isinstance(data_set, MemoryDataSet) + assert isinstance(data_set, MemoryDataset) def test_thread_run(self, fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) + catalog.add_feed_dict({"A": 42}) result = ThreadRunner().run(fan_out_fan_in, catalog) assert "Z" in result assert result["Z"] == (42, 42, 42) - def test_memory_data_set_input(self, fan_out_fan_in): - catalog = DataCatalog({"A": MemoryDataSet("42")}) + def test_thread_run_with_plugin_manager(self, fan_out_fan_in, catalog): + catalog.add_feed_dict({"A": 42}) + result = ThreadRunner().run( + fan_out_fan_in, catalog, hook_manager=_create_hook_manager() + ) + assert "Z" in result + assert result["Z"] == (42, 42, 42) + + def test_memory_dataset_input(self, fan_out_fan_in): + catalog = DataCatalog({"A": MemoryDataset("42")}) result = ThreadRunner().run(fan_out_fan_in, catalog) assert "Z" in result assert result["Z"] == ("42", "42", "42") @@ -114,7 +55,7 @@ def test_specified_max_workers( catalog, user_specified_number, expected_number, - ): # pylint: disable=too-many-arguments + ): # noqa: too-many-arguments """ We initialize the runner with max_workers=4. `fan_out_fan_in` pipeline needs 3 threads. @@ -125,7 +66,7 @@ def test_specified_max_workers( wraps=ThreadPoolExecutor, ) - catalog.add_feed_dict(dict(A=42)) + catalog.add_feed_dict({"A": 42}) result = ThreadRunner(max_workers=user_specified_number).run( fan_out_fan_in, catalog ) @@ -141,11 +82,11 @@ def test_init_with_negative_process_count(self): class TestIsAsync: def test_thread_run(self, fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) + catalog.add_feed_dict({"A": 42}) pattern = ( - "`ThreadRunner` doesn't support loading and saving the " + "'ThreadRunner' doesn't support loading and saving the " "node inputs and outputs asynchronously with threads. " - "Setting `is_async` to False." + "Setting 'is_async' to False." 
) with pytest.warns(UserWarning, match=pattern): result = ThreadRunner(is_async=True).run(fan_out_fan_in, catalog) @@ -155,54 +96,22 @@ def test_thread_run(self, fan_out_fan_in, catalog): class TestInvalidThreadRunner: def test_task_exception(self, fan_out_fan_in, catalog): - catalog.add_feed_dict(feed_dict=dict(A=42)) - pipeline = Pipeline([fan_out_fan_in, node(exception_fn, "Z", "X")]) + catalog.add_feed_dict(feed_dict={"A": 42}) + pipeline = modular_pipeline([fan_out_fan_in, node(exception_fn, "Z", "X")]) with pytest.raises(Exception, match="test exception"): ThreadRunner().run(pipeline, catalog) def test_node_returning_none(self): - pipeline = Pipeline([node(identity, "A", "B"), node(return_none, "B", "C")]) - catalog = DataCatalog({"A": MemoryDataSet("42")}) - pattern = "Saving `None` to a `DataSet` is not allowed" - with pytest.raises(DataSetError, match=pattern): + pipeline = modular_pipeline( + [node(identity, "A", "B"), node(return_none, "B", "C")] + ) + catalog = DataCatalog({"A": MemoryDataset("42")}) + pattern = "Saving 'None' to a 'Dataset' is not allowed" + with pytest.raises(DatasetError, match=pattern): ThreadRunner().run(pipeline, catalog) -@log_time -def decorated_identity(*args, **kwargs): - return identity(*args, **kwargs) - - -@pytest.fixture -def decorated_fan_out_fan_in(): - return Pipeline( - [ - node(decorated_identity, "A", "B"), - node(decorated_identity, "B", "C"), - node(decorated_identity, "B", "D"), - node(decorated_identity, "B", "E"), - node(fan_in, ["C", "D", "E"], "Z"), - ] - ) - - -class TestThreadRunnerDecorator: - def test_decorate_pipeline(self, fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) - result = ThreadRunner().run(fan_out_fan_in.decorate(log_time), catalog) - assert "Z" in result - assert len(result["Z"]) == 3 - assert result["Z"] == (42, 42, 42) - - def test_decorated_nodes(self, decorated_fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) - result = ThreadRunner().run(decorated_fan_out_fan_in, catalog) - assert "Z" in result - assert len(result["Z"]) == 3 - assert result["Z"] == (42, 42, 42) - - -class LoggingDataSet(AbstractDataSet): +class LoggingDataset(AbstractDataSet): def __init__(self, log, name, value=None): self.log = log self.name = name @@ -219,7 +128,7 @@ def _release(self) -> None: self.log.append(("release", self.name)) self.value = None - def _describe(self) -> Dict[str, Any]: + def _describe(self) -> dict[str, Any]: return {} @@ -227,14 +136,14 @@ class TestThreadRunnerRelease: def test_dont_release_inputs_and_outputs(self): log = [] - pipeline = Pipeline( + pipeline = modular_pipeline( [node(identity, "in", "middle"), node(identity, "middle", "out")] ) catalog = DataCatalog( { - "in": LoggingDataSet(log, "in", "stuff"), - "middle": LoggingDataSet(log, "middle"), - "out": LoggingDataSet(log, "out"), + "in": LoggingDataset(log, "in", "stuff"), + "middle": LoggingDataset(log, "middle"), + "out": LoggingDataset(log, "out"), } ) ThreadRunner().run(pipeline, catalog) @@ -246,7 +155,7 @@ def test_release_at_earliest_opportunity(self): runner = ThreadRunner() log = [] - pipeline = Pipeline( + pipeline = modular_pipeline( [ node(source, None, "first"), node(identity, "first", "second"), @@ -255,8 +164,8 @@ def test_release_at_earliest_opportunity(self): ) catalog = DataCatalog( { - "first": LoggingDataSet(log, "first"), - "second": LoggingDataSet(log, "second"), + "first": LoggingDataset(log, "first"), + "second": LoggingDataset(log, "second"), } ) runner.run(pipeline, catalog) @@ -273,14 +182,14 @@ def 
test_count_multiple_loads(self): runner = ThreadRunner() log = [] - pipeline = Pipeline( + pipeline = modular_pipeline( [ node(source, None, "dataset"), node(sink, "dataset", None, name="bob"), node(sink, "dataset", None, name="fred"), ] ) - catalog = DataCatalog({"dataset": LoggingDataSet(log, "dataset")}) + catalog = DataCatalog({"dataset": LoggingDataset(log, "dataset")}) runner.run(pipeline, catalog) # we want to the release after both the loads @@ -293,13 +202,13 @@ def test_count_multiple_loads(self): def test_release_transcoded(self): log = [] - pipeline = Pipeline( + pipeline = modular_pipeline( [node(source, None, "ds@save"), node(sink, "ds@load", None)] ) catalog = DataCatalog( { - "ds@save": LoggingDataSet(log, "save"), - "ds@load": LoggingDataSet(log, "load"), + "ds@save": LoggingDataset(log, "save"), + "ds@load": LoggingDataset(log, "load"), } ) diff --git a/tests/test_cli_logging_setup.py b/tests/test_cli_logging_setup.py deleted file mode 100644 index b920e19013..0000000000 --- a/tests/test_cli_logging_setup.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""This module contains unit tests for methods in the Kedro __init__.py -""" - -import logging - -from kedro.config.default_logger import LOGGING_CONFIG - - -def test_cli_logging_setup(): - def to_names(handlers): - return [h.name for h in handlers] - - assert LOGGING_CONFIG is not None - - # Check root logger is set up correctly - root_handler_names = to_names(logging.getLogger().handlers) - all_handlers = ["console", "info_file_handler", "error_file_handler"] - intersection = set(root_handler_names).intersection(all_handlers) - assert len(intersection) == 3 - - # check cli logger is set up correctly - cli_handlers = to_names(logging.getLogger("kedro.framework.cli").handlers) - assert len(cli_handlers) == 1 - assert "console" in cli_handlers diff --git a/tests/test_import.py b/tests/test_import.py new file mode 100644 index 0000000000..81436ecfc6 --- /dev/null +++ b/tests/test_import.py @@ -0,0 +1,27 @@ +import pytest + +import kedro + + +def test_import_kedro_with_no_official_support_raise_error(mocker): + """Test importing kedro with python>=3.11 should fail""" + mocker.patch("kedro.sys.version_info", (3, 11)) + + # We use the parent class to avoid issues with `exec_module` + with pytest.raises(UserWarning) as excinfo: + kedro.__loader__.exec_module(kedro) + + assert "Kedro is not yet fully compatible" in str(excinfo.value) + + +def test_import_kedro_with_no_official_support_emits_warning(mocker): + """Test importing kedro python>=3.11 and controlled warnings should work""" + mocker.patch("kedro.sys.version_info", (3, 11)) + mocker.patch("kedro.sys.warnoptions", ["default:Kedro is not yet fully compatible"]) + + # We use the parent class to avoid issues with `exec_module` + with pytest.warns(UserWarning) as record: + kedro.__loader__.exec_module(kedro) + + assert len(record) == 1 + assert "Kedro is not yet fully compatible" in record[0].message.args[0] diff --git a/tests/test_utils.py b/tests/test_utils.py index f5145ccebf..4e99f3f726 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- """Test a set of helper functions being used across kedro components.""" import pytest @@ -49,7 +21,7 @@ def test_load_obj_default_path(self): def test_load_obj_invalid_attribute(self): with pytest.raises( - AttributeError, match=r"Object `InvalidClass` cannot be loaded" + AttributeError, match=r"Object 'InvalidClass' cannot be loaded" ): load_obj("InvalidClass", "tests.test_utils") diff --git a/tests/tools/ipython/__init__.py b/tests/tools/ipython/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/tools/ipython/test_ipython_loader.py b/tests/tools/ipython/test_ipython_loader.py deleted file mode 100644 index 4b8c7efc06..0000000000 --- a/tests/tools/ipython/test_ipython_loader.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Test Kedro extras.""" -from pathlib import Path - -import pytest - -from tools.ipython import ipython_loader -from tools.ipython.ipython_loader import locate_ipython_startup_dir - - -@pytest.fixture -def dummy_project_dir(tmp_path): - # need to resolve tmp_path for tests to pass on MacOS - root = Path(tmp_path / "dummy_project").resolve() - root.mkdir() - startup_path = root / ".ipython" / "profile_default" / "startup" - startup_path.mkdir(parents=True) - yield root - - -@pytest.fixture -def nested_project_dir(dummy_project_dir): - nested = dummy_project_dir / "some_dir" / "another_dummy_project" - startup_path = nested / ".ipython" / "profile_default" / "startup" - startup_path.mkdir(parents=True) - yield nested.resolve() - - -@pytest.fixture -def startup_script(dummy_project_dir): - script = "dummy_project_var1 = 111" - script_path = ( - dummy_project_dir - / ".ipython" - / "profile_default" - / "startup" - / "01-startup-script.py" - ) - script_path.write_text(script, encoding="utf-8") - return script_path - - -@pytest.fixture -def bad_startup_script(dummy_project_dir): - script = "raise ValueError('bad script!')" - script_path = ( - dummy_project_dir - / ".ipython" - / "profile_default" - / "startup" - / "00-bad-script.py" - ) - script_path.write_text(script, encoding="utf-8") - return script_path - - -class TestIpythonStartupDir: - """Test locating IPython startup directory.""" - - def test_locate(self, dummy_project_dir): - ipython_dir = dummy_project_dir / ".ipython" / "profile_default" / "startup" - assert locate_ipython_startup_dir(dummy_project_dir) == ipython_dir - - path = dummy_project_dir / "notebooks" / "foo" / "bar" - path.mkdir(parents=True) - assert locate_ipython_startup_dir(path) == ipython_dir - - def test_locate_nested(self, nested_project_dir, dummy_project_dir): - root_ipython_dir = ( - dummy_project_dir / ".ipython" / "profile_default" / "startup" - ) - nested_ipython_dir = ( - nested_project_dir / ".ipython" / "profile_default" / "startup" - ) - assert locate_ipython_startup_dir(nested_project_dir) == nested_ipython_dir - assert locate_ipython_startup_dir(nested_project_dir.parent) == root_ipython_dir - - path = nested_project_dir / "notebooks" / "foo" / "bar" - path.mkdir(parents=True) - assert locate_ipython_startup_dir(path) == nested_ipython_dir - - path = dummy_project_dir / "other" / "dir" - path.mkdir(parents=True) - assert locate_ipython_startup_dir(path) == root_ipython_dir - - @pytest.mark.usefixtures("dummy_project_dir") - def test_locate_no_project(self, tmp_path): - assert locate_ipython_startup_dir(str(tmp_path)) is None - assert locate_ipython_startup_dir(Path("/")) is None - - -class TestRunStartupScripts: - """Test running IPython startup scripts from the project.""" - - def test_run(self, dummy_project_dir, startup_script, caplog): - ipython_loader.run_startup_scripts(dummy_project_dir) - expected_message = f"Startup script `{startup_script}` successfully executed" - - assert getattr(ipython_loader, "dummy_project_var1") == 111 - assert len(caplog.records) == 1 - assert caplog.records[0].message == expected_message - - def test_run_bad_script(self, dummy_project_dir, bad_startup_script, caplog): - ipython_loader.run_startup_scripts(dummy_project_dir) - expected_error_message = ( - f"Startup script `{bad_startup_script}` failed:\nValueError: bad script!" 
- ) - assert len(caplog.records) == 1 - assert caplog.records[0].message == expected_error_message - - def test_run_both_scripts( - self, dummy_project_dir, startup_script, bad_startup_script, caplog - ): - ipython_loader.run_startup_scripts(dummy_project_dir) - expected_error_message = ( - f"Startup script `{bad_startup_script}` failed:\nValueError: bad script!" - ) - expected_success_message = "Startup script `{}` successfully executed".format( - startup_script - ) - - assert len(caplog.records) == 2 - assert caplog.records[0].message == expected_error_message - assert caplog.records[1].message == expected_success_message - - def test_modify_globals(self): - """Test modify_globals context manager.""" - with ipython_loader.modify_globals(__file__="new_file_value", new_key=999): - assert ipython_loader.__file__ == "new_file_value" - assert getattr(ipython_loader, "new_key") == 999 - assert ipython_loader.__file__ != "new_file_value" - assert not hasattr(ipython_loader, "some_new_key") - - def test_ipython_loader_main(self, mocker, dummy_project_dir, caplog): - mocker.patch("pathlib.Path.cwd", return_value=dummy_project_dir) - script_path = ( - dummy_project_dir - / ".ipython" - / "profile_default" - / "startup" - / "startup_script.py" - ) - script_path.write_text("dummy_project_var2 = 2222", encoding="utf-8") - ipython_loader.main() - - assert getattr(ipython_loader, "dummy_project_var2") == 2222 - assert len(caplog.records) == 1 - expected_message = f"Startup script `{script_path}` successfully executed" - assert caplog.records[0].message == expected_message diff --git a/tests/tools/test_cli.py b/tests/tools/test_cli.py new file mode 100644 index 0000000000..1b80ad8064 --- /dev/null +++ b/tests/tools/test_cli.py @@ -0,0 +1,155 @@ +"""Testing module for CLI tools""" +import shutil +from collections import namedtuple +from pathlib import Path + +import pytest + +from kedro import __version__ as kedro_version +from kedro.framework.cli.cli import KedroCLI, cli +from kedro.framework.startup import ProjectMetadata +from tools.cli import get_cli_structure + +REPO_NAME = "cli_tools_dummy_project" +PACKAGE_NAME = "cli_tools_dummy_package" +DEFAULT_KEDRO_COMMANDS = [ + "activate-nbstripout", + "build-docs", + "build-reqs", + "catalog", + "ipython", + "jupyter", + "lint", + "new", + "package", + "pipeline", + "micropkg", + "registry", + "run", + "starter", + "test", +] + + +@pytest.fixture +def fake_root_dir(tmp_path): + try: + yield Path(tmp_path).resolve() + finally: + shutil.rmtree(tmp_path, ignore_errors=True) + + +@pytest.fixture +def fake_metadata(fake_root_dir): + metadata = ProjectMetadata( + fake_root_dir / REPO_NAME / "pyproject.toml", + PACKAGE_NAME, + "CLI Tools Testing Project", + fake_root_dir / REPO_NAME, + kedro_version, + fake_root_dir / REPO_NAME / "src", + kedro_version, + ) + return metadata + + +class TestCLITools: + def test_get_cli_structure_raw(self, mocker, fake_metadata): + Module = namedtuple("Module", ["cli"]) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) + mocker.patch( + "kedro.framework.cli.cli._is_project", + return_value=True, + ) + mocker.patch( + "kedro.framework.cli.cli.bootstrap_project", + return_value=fake_metadata, + ) + kedro_cli = KedroCLI(fake_metadata.project_path) + raw_cli_structure = get_cli_structure(kedro_cli, get_help=False) + + # raw CLI structure tests + assert isinstance(raw_cli_structure, dict) + assert isinstance(raw_cli_structure["kedro"], dict) + + for k, v in 
raw_cli_structure["kedro"].items(): + assert isinstance(k, str) + assert isinstance(v, dict) + + assert sorted(list(raw_cli_structure["kedro"])) == sorted( + DEFAULT_KEDRO_COMMANDS + ) + + def test_get_cli_structure_depth(self, mocker, fake_metadata): + Module = namedtuple("Module", ["cli"]) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) + mocker.patch( + "kedro.framework.cli.cli._is_project", + return_value=True, + ) + mocker.patch( + "kedro.framework.cli.cli.bootstrap_project", + return_value=fake_metadata, + ) + kedro_cli = KedroCLI(fake_metadata.project_path) + raw_cli_structure = get_cli_structure(kedro_cli, get_help=False) + assert isinstance(raw_cli_structure["kedro"]["new"], dict) + assert sorted(list(raw_cli_structure["kedro"]["new"].keys())) == sorted( + [ + "--verbose", + "-v", + "--config", + "-c", + "--starter", + "-s", + "--checkout", + "--directory", + "--help", + ] + ) + # now check that once params and args are reached, the values are None + assert raw_cli_structure["kedro"]["new"]["--starter"] is None + assert raw_cli_structure["kedro"]["new"]["--checkout"] is None + assert raw_cli_structure["kedro"]["new"]["--help"] is None + assert raw_cli_structure["kedro"]["new"]["-c"] is None + + def test_get_cli_structure_help(self, mocker, fake_metadata): + Module = namedtuple("Module", ["cli"]) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) + mocker.patch( + "kedro.framework.cli.cli._is_project", + return_value=True, + ) + mocker.patch( + "kedro.framework.cli.cli.bootstrap_project", + return_value=fake_metadata, + ) + kedro_cli = KedroCLI(fake_metadata.project_path) + help_cli_structure = get_cli_structure(kedro_cli, get_help=True) + + assert isinstance(help_cli_structure, dict) + assert isinstance(help_cli_structure["kedro"], dict) + + for k, v in help_cli_structure["kedro"].items(): + assert isinstance(k, str) + if isinstance(v, dict): + for sub_key in v: + assert isinstance(help_cli_structure["kedro"][k][sub_key], str) + assert help_cli_structure["kedro"][k][sub_key].startswith( + "Usage: [OPTIONS]" + ) + elif isinstance(v, str): + assert v.startswith("Usage: [OPTIONS]") + + assert sorted(list(help_cli_structure["kedro"])) == sorted( + DEFAULT_KEDRO_COMMANDS + ) diff --git a/tests/versioning/test_journal.py b/tests/versioning/test_journal.py deleted file mode 100644 index 5af9c8113f..0000000000 --- a/tests/versioning/test_journal.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. 
The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=protected-access - -import json -import logging -import logging.config -from importlib import reload - -import pytest - -from kedro.versioning.journal import Journal, _git_sha - - -@pytest.fixture() -def setup_logging(tmp_path): - config = { - "version": 1, - "loggers": { - "kedro.journal": { - "level": "INFO", - "handlers": ["journal_file_handler"], - "propagate": False, - } - }, - "handlers": { - "journal_file_handler": { - "class": "kedro.versioning.journal.JournalFileHandler", - "level": "INFO", - "base_dir": str(tmp_path), - } - }, - } - reload(logging) - logging.config.dictConfig(config) - - -@pytest.fixture -def fake_git_sha(mocker): - return mocker.patch("kedro.versioning.journal._git_sha", return_value="git_sha") - - -@pytest.mark.usefixtures("fake_git_sha") -class TestJournal: - @pytest.mark.usefixtures("setup_logging") - def test_context_record(self, tmp_path): - """Test journal initialisation""" - record_data = {"run_id": "fake_id", "project_path": str(tmp_path)} - journal = Journal(record_data) - file_path = list(tmp_path.glob("journal_*")) - - assert len(file_path) == 1 - assert journal.run_id in str(file_path[0]) - log = json.loads(file_path[0].read_text()) - assert log["type"] == "ContextJournalRecord" - assert log["project_path"] == str(tmp_path) - assert log["git_sha"] == "git_sha" - assert "run_id" in log - - def test_invalid_context_record(self, tmp_path, caplog): - record_data = { - "run_id": "fake_id", - "project_path": str(tmp_path), - "blah": lambda x: x, - } - _ = Journal(record_data) - - assert "Unable to record" in caplog.record_tuples[0][2] - - @pytest.mark.usefixtures("setup_logging") - def test_log_catalog(self, tmp_path): - record_data = {"run_id": "fake_id", "project_path": str(tmp_path)} - journal = Journal(record_data) - journal.log_catalog("fake_data", "fake_operation", "fake_version") - file_path = list(tmp_path.glob("journal_*")) - - assert journal.run_id in str(file_path[0]) - assert len(file_path) == 1 - with file_path[0].open() as log_file: - context_log = json.loads(log_file.readline()) - catalog_log = json.loads(log_file.readline()) - assert catalog_log["type"] == "DatasetJournalRecord" - assert catalog_log["name"] == "fake_data" - assert catalog_log["operation"] == "fake_operation" - assert catalog_log["version"] == "fake_version" - assert catalog_log["run_id"] == context_log["run_id"] - - def test_deprecation_warning(self, tmp_path): - record_data = {"run_id": "fake_id", "project_path": str(tmp_path)} - with pytest.warns(DeprecationWarning): - Journal(record_data) - - -def test_git_sha(tmp_path, mocker): - mocker.patch("subprocess.check_output", return_value="mocked_return".encode()) - result = _git_sha(tmp_path) - assert result == "mocked_return" - - -def test_invalid_git_sha(tmp_path, caplog): - _git_sha(tmp_path) - assert "Unable to git describe" in caplog.record_tuples[0][2] diff --git a/tools/circleci/check-no-version-pypi.sh b/tools/circleci/check-no-version-pypi.sh new file mode 100755 index 0000000000..2579b33169 --- /dev/null +++ 
b/tools/circleci/check-no-version-pypi.sh @@ -0,0 +1,10 @@ +KEDRO_VERSION=$1 + +PYPI_ENDPOINT="https://pypi.org/pypi/kedro/${KEDRO_VERSION}/json/" + +STATUS_CODE=$(curl --location --silent \ +--output /dev/null \ +--write-out "%{http_code}\n" \ +"${PYPI_ENDPOINT}") + +[ "${STATUS_CODE}" == "404" ] diff --git a/tools/circleci/circle-release.sh b/tools/circleci/circle-release.sh new file mode 100755 index 0000000000..ae655ffffb --- /dev/null +++ b/tools/circleci/circle-release.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Exit script if you try to use an uninitialized variable. +set -o nounset + +# Exit script if a statement returns a non-true return value. +set -o errexit + +PROJECT_SLUG=$1 +CIRCLE_ENDPOINT="https://circleci.com/api/v2/project/${PROJECT_SLUG}/pipeline" + +PAYLOAD=$(cat <<-END +{ + "branch": "${CIRCLE_BRANCH}", + "parameters": {"release_kedro": true} +} +END +) + +curl -X POST \ + --silent --show-error --fail --retry 3 \ + --output /dev/null \ + --header "Content-Type: application/json" \ + --header "Circle-Token: ${CIRCLE_RELEASE_TOKEN}" \ + --data "${PAYLOAD}" \ + "${CIRCLE_ENDPOINT}" diff --git a/tools/circleci/docker_build_img/Dockerfile b/tools/circleci/docker_build_img/Dockerfile new file mode 100644 index 0000000000..8f0c7313a9 --- /dev/null +++ b/tools/circleci/docker_build_img/Dockerfile @@ -0,0 +1,28 @@ +FROM cimg/python:3.8 + +WORKDIR /home/circleci + +RUN sudo apt-get update && \ + sudo apt-get install curl pandoc openjdk-8-jdk-headless -y && \ + sudo apt-get clean && \ + sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java + +# Update cacerts: https://stackoverflow.com/a/50103533/1684058 +RUN sudo rm /etc/ssl/certs/java/cacerts && \ + sudo update-ca-certificates -f + +RUN curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + > miniconda.sh && bash miniconda.sh -b -p /home/circleci/miniconda + +RUN sudo rm -rf ~/.pyenv/ /opt/circleci/.pyenv/ + +ARG PIP_REQS +ARG PY_VERSION +ARG CONDA_ENV_NAME=kedro_builder + +# Install/Setup anaconda env +RUN bash -c "source /home/circleci/miniconda/etc/profile.d/conda.sh && \ + echo \"$PIP_REQS\" > /tmp/requirements.txt && \ + conda create --name=$CONDA_ENV_NAME python=$PY_VERSION && \ + conda activate $CONDA_ENV_NAME && \ + pip install --no-cache-dir --prefer-binary --upgrade -r /tmp/requirements.txt" diff --git a/tools/circleci/docker_build_img/build.sh b/tools/circleci/docker_build_img/build.sh new file mode 100755 index 0000000000..f45404c4c2 --- /dev/null +++ b/tools/circleci/docker_build_img/build.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +set -e +cd "$(dirname "$0")" + +KEDRO_REPO=$1 +ECR_IMAGE_URL=$2 +PY_VERSION=$3 + +get_pip_reqs() { + local project_path=$1 + cat $project_path/*requirements.txt | grep -v requirements +} + +docker_build() { + local pip_reqs="$1" + local image=$ECR_IMAGE_URL:$PY_VERSION + echo "Building docker image: $image" + docker build -t $image \ + --build-arg PIP_REQS="$pip_reqs" \ + --build-arg PY_VERSION=$PY_VERSION \ + . 
+} + +docker_push() { + local image=$ECR_IMAGE_URL:$PY_VERSION + echo "Pushing docker image: $image" + docker push $image +} + +main() { + local pip_reqs="$(get_pip_reqs $KEDRO_REPO)" + docker_build "$pip_reqs" + docker_push +} + +main diff --git a/tools/circleci/github_scripts/attempt_merge_pr.sh b/tools/circleci/github_scripts/attempt_merge_pr.sh new file mode 100755 index 0000000000..a9553e326b --- /dev/null +++ b/tools/circleci/github_scripts/attempt_merge_pr.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +# Script to attempt to merge an automatic PR from main to develop. + +# Exit script if you try to use an uninitialized variable. +set -o nounset + +# Exit script if a statement returns a non-true return value. +set -o errexit + +SOURCE_BRANCH=$1 +TARGET_BRANCH=$2 +GITHUB_TAGGING_TOKEN=$3 +GITHUB_USER="kedro-org" +GITHUB_REPO="kedro" + +# Array of GitHub mergeable states that are valid to proceed with automatic PR merging. +# Adding "unstable" can be convenient, as it allows to auto-merge pull requests that +# pass just the required CI checks (whereas "clean" requires all CI checks to pass, +# regardless of whether they are required or not). More info: +# https://docs.github.com/en/graphql/reference/enums#mergestatestatus +# https://github.com/octokit/octokit.net/issues/1763 +VALID_MERGEABLE_STATES=("clean" "unstable") + +find_github_pr() { + # Find a PR from source to target branch + # Returns PR number if GitHub returned exactly one such PR + endpoint="https://api.github.com/repos/${GITHUB_USER}/${GITHUB_REPO}/pulls?base=${TARGET_BRANCH}&head=${GITHUB_USER}:${SOURCE_BRANCH}&state=open" + response=$(curl --silent --header "Authorization: token ${GITHUB_TAGGING_TOKEN}" "${endpoint}") + num_open_prs=$(echo "$response" | tr '\r\n' ' ' | jq "length") + if [ "$num_open_prs" -eq 1 ]; then + echo "$response" | tr '\r\n' ' ' | jq ".[0].number" + fi +} + +check_pr_mergeable() { + # Check that the given PR is in a mergeable state + pr=$1 + endpoint="https://api.github.com/repos/${GITHUB_USER}/${GITHUB_REPO}/pulls/${pr}" + response=$(curl --silent --header "Authorization: token ${GITHUB_TAGGING_TOKEN}" "${endpoint}") + mergeable=$(echo "${response}" | tr '\r\n' ' ' | jq ".mergeable // false") # default to false + echo "PR ${pr} mergeable: ${mergeable}" + mergeable_state=$(echo "${response}" | tr '\r\n' ' ' | jq --raw-output ".mergeable_state // \"unknown\"") + echo "PR ${pr} mergeable_state: ${mergeable_state}" + [ "${mergeable}" == true ] && [[ " ${VALID_MERGEABLE_STATES[@]} " =~ " ${mergeable_state} " ]] +} + +toggle_merge_commits() { + # Turns merge commits on or off for the repository + allow_merge_commit=$1 + endpoint="https://api.github.com/repos/${GITHUB_USER}/${GITHUB_REPO}" + payload="{\"allow_merge_commit\": ${allow_merge_commit}}" + status_code=$(curl -X PATCH \ + --silent \ + --header "Authorization: token ${GITHUB_TAGGING_TOKEN}" \ + --header "Content-Type: application/json" \ + --data "${payload}" \ + --output /dev/null \ + --write-out "%{http_code}\n" \ + "${endpoint}") + [ "${status_code}" -eq 200 ] +} + +delete_git_ref() { + # Delete a reference + git_ref=$1 + endpoint="https://api.github.com/repos/${GITHUB_USER}/${GITHUB_REPO}/git/refs/${git_ref}" + status_code=$(curl -X DELETE \ + --silent \ + --header "Authorization: token ${GITHUB_TAGGING_TOKEN}" \ + --output /dev/null \ + --write-out "%{http_code}\n" \ + "${endpoint}") + [ "${status_code}" -eq 204 ] +} + +merge_pr() { + # Merge a given PR using merge commit + pr=$1 + toggle_merge_commits true + 
endpoint="https://api.github.com/repos/${GITHUB_USER}/${GITHUB_REPO}/pulls/${pr}/merge" + payload='{"merge_method": "merge"}' + response=$(curl -X PUT \ + --silent \ + --header "Authorization: token ${GITHUB_TAGGING_TOKEN}" \ + --header "Content-Type: application/json" \ + --data "${payload}" \ + "${endpoint}") + toggle_merge_commits false + merged=$(echo "${response}" | tr '\r\n' ' ' | jq ".merged // false") # default to false + if [ "${merged}" == true ]; then + echo "PR ${pr} successfully merged" + delete_git_ref "heads/${SOURCE_BRANCH}" + echo "Branch ${SOURCE_BRANCH} successfully deleted" + else + message=$(echo "${response}" | tr '\r\n' ' ' | jq --raw-output ".message") + echo "PR ${pr} NOT merged. Message: ${message}" + fi + [ "${merged}" == true ] +} + +pr_number=$(find_github_pr) + +if [ -z "${pr_number}" ]; then + echo "No PR found from ${SOURCE_BRANCH} to ${TARGET_BRANCH}" + exit 0 +fi + +if check_pr_mergeable "${pr_number}"; then + merge_pr "${pr_number}" +else + echo "PR ${pr_number} is not in a mergeable state" +fi diff --git a/tools/circleci/github_scripts/kedro_version.py b/tools/circleci/github_scripts/kedro_version.py new file mode 100755 index 0000000000..7fade68941 --- /dev/null +++ b/tools/circleci/github_scripts/kedro_version.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +""" +Get version of Kedro +""" + +import os.path +import re +import sys +from pathlib import Path + +VERSION_MATCHSTR = r'\s*__version__\s*=\s*"(\d+\.\d+\.\d+)"' + + +def get_kedro_version(init_file_path): + match_obj = re.search(VERSION_MATCHSTR, Path(init_file_path).read_text()) + return match_obj.group(1) + + +def main(argv): + kedro_path = argv[1] + init_file_path = os.path.join(kedro_path, "__init__.py") + print(get_kedro_version(init_file_path)) + + +if __name__ == "__main__": + main(sys.argv) diff --git a/tools/circleci/github_scripts/merge.sh b/tools/circleci/github_scripts/merge.sh new file mode 100755 index 0000000000..d863061983 --- /dev/null +++ b/tools/circleci/github_scripts/merge.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# Script to merge a source branch into a target branch and raise a PR on conflict. + +# Exit script if you try to use an uninitialized variable. +set -o nounset + +# Exit script if a statement returns a non-true return value. +set -o errexit + +# The git directory where this script will be invoked +GIT_DIRECTORY=$1 +cd $GIT_DIRECTORY + +# The source & target branches to perform the merge, i.e. +# git merge source target +SOURCE_BRANCH=$2 +TARGET_BRANCH=$3 +# A branch created to raise a PR in case SOURCE_BRANCH is push-protected. +PR_BRANCH="merge-${SOURCE_BRANCH}-to-${TARGET_BRANCH}" + +# The Github details to raise a PR +GITHUB_TAGGING_TOKEN=$4 +GITHUB_USER="kedro-org" +GITHUB_REPO="kedro" +GITHUB_ENDPOINT="https://api.github.com/repos/${GITHUB_USER}/${GITHUB_REPO}/pulls" +PAYLOAD=$(cat <<-END +{ + "title": "[AUTO-MERGE] Merge ${SOURCE_BRANCH} into ${TARGET_BRANCH} via ${PR_BRANCH}", + "head": "${PR_BRANCH}", + "base": "${TARGET_BRANCH}", + "body": "A new change in ${SOURCE_BRANCH} cannot be merged into ${TARGET_BRANCH} as part of the regular sync job, hence this PR. Please resolve the conflicts manually, and make sure to obtain 2 approvals once the builds pass.\\n\\n### IMPORTANT NOTICE\\n\\nPlease let CircleCI merge this PR automatically, with merge commit enabled." +} +END +) + +# Attempt to merge the source branch into the target branch after updating the target branch +# with latest changes from origin. 
+MERGE_STATUS=0 +# We need to reconfigure origin.fetch because we originally clone with --single-branch +git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" +git fetch origin $TARGET_BRANCH && git checkout -b $TARGET_BRANCH "origin/${TARGET_BRANCH}" +git merge --no-edit $SOURCE_BRANCH $TARGET_BRANCH || MERGE_STATUS=1 + +if [ $MERGE_STATUS -eq 0 ] +then + # If the merge was successful, attempt to push the target branch to origin. + # We don't do any error handling here because if this fails, something really wrong is going on, + # so let's just fail the job and debug it manually. + echo "Successfully merged ${SOURCE_BRANCH} into ${TARGET_BRANCH}. Now pushing ${TARGET_BRANCH} to origin..." + git push origin $TARGET_BRANCH +else + # If the merge was not successful, i.e. there was some conflict between source branch and target branch, + # abandon the merge and raise a PR instead. + git merge --abort + + # Check if the PR_BRANCH already exists + PR_BRANCH_EXIST=$(git ls-remote --heads origin $PR_BRANCH | wc -l) + + # If it doesn't exists, push and raise a PR + if [ $PR_BRANCH_EXIST -eq 0 ] + then + echo "Failed to merge ${SOURCE_BRANCH} into ${TARGET_BRANCH}. Raising a pull request instead..." + # Create a new branch from which to raise a PR, as ${SOURCE_BRANCH} might be push-protected + git checkout -b ${PR_BRANCH} ${SOURCE_BRANCH} + git push origin ${PR_BRANCH} + STATUS=$(curl -X POST \ + --output /dev/null --location --silent --write-out "%{http_code}\n" --retry 3 \ + --header "Authorization: token ${GITHUB_TAGGING_TOKEN}" \ + --header "Content-Type: application/json" \ + --data "${PAYLOAD}" \ + "${GITHUB_ENDPOINT}") + [ "${STATUS}" == "201" ] + else + echo "Failed to merge ${SOURCE_BRANCH} into ${TARGET_BRANCH} and it seems like another manual merge between ${SOURCE_BRANCH} and ${TARGET_BRANCH} is in progress. Doing nothing here." + fi +fi + +git checkout ${SOURCE_BRANCH} diff --git a/tools/circleci/github_scripts/release.sh b/tools/circleci/github_scripts/release.sh new file mode 100755 index 0000000000..2b49880e64 --- /dev/null +++ b/tools/circleci/github_scripts/release.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +set -eu + +GITHUB_USER=$1 +GITHUB_REPO=$2 +GITHUB_TAGGING_TOKEN=$3 +VERSION=$4 + +GITHUB_ENDPOINT="https://api.github.com/repos/${GITHUB_USER}/${GITHUB_REPO}/releases" + +PAYLOAD=$(cat <<-END +{ + "tag_name": "${VERSION}", + "target_commitish": "main", + "name": "${VERSION}", + "body": "Release ${VERSION}", + "draft": false, + "prerelease": false +} +END +) + +STATUS=$(curl -X POST \ + --output /dev/null --location --silent --write-out "%{http_code}\n" --retry 3 \ + --header "Authorization: token ${GITHUB_TAGGING_TOKEN}" \ + --header "Content-Type: application/json" \ + --data "${PAYLOAD}" \ + "${GITHUB_ENDPOINT}") + +[ "${STATUS}" == "201" ] || [ "${STATUS}" == "422" ] diff --git a/tools/circleci/requirements.txt b/tools/circleci/requirements.txt new file mode 100644 index 0000000000..224e670115 --- /dev/null +++ b/tools/circleci/requirements.txt @@ -0,0 +1,3 @@ +pip>=21.2 +setuptools>=65.5.1 +twine~=3.0 diff --git a/tools/circleci/rtd-build.sh b/tools/circleci/rtd-build.sh new file mode 100755 index 0000000000..aeecf2ece5 --- /dev/null +++ b/tools/circleci/rtd-build.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Script to trigger the documentation build for Kedro in ReadTheDocs. + +# Exit script if you try to use an uninitialized variable. +set -o nounset + +# Exit script if a statement returns a non-true return value. 
+set -o errexit + +RTD_TOKEN=$1 +BUILD_VERSION=$2 # version of the docs to be built +RTD_ENDPOINT="https://readthedocs.org/api/v3/projects/kedro/versions/${BUILD_VERSION}/builds/" + +curl -X POST \ + --silent --show-error --fail --retry 3 \ + --header "Authorization: token ${RTD_TOKEN}" \ + --header "Content-Length: 0" \ + "${RTD_ENDPOINT}" diff --git a/tools/cli.py b/tools/cli.py new file mode 100644 index 0000000000..a978fbe919 --- /dev/null +++ b/tools/cli.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +from typing import Any + +import click + + +def _recurse_cli( + cli_element: click.Command | click.Group | click.CommandCollection, + ctx: click.Context, + io_dict: dict[str, Any], + get_help: bool = False, +) -> None: + """ + Recursive function that checks the type of the command (key) and decides: + 1. In case of `click.Group` or `click.CommandCollection` (aggregate commands), + the function collects the name and recurses one layer deeper + for each sub-command. + 2. In case of `click.Command`, the terminus command has been reached. The function + collects the name, parameters and args, flattens them and saves them as + dictionary keys. + Args: + cli_element: CLI Collection as input for recursion, typically `KedroCLI`. + ctx: Click Context, created by the wrapper function. + io_dict: Input-output dictionary, mutated during the recursion. + get_help: Boolean fork - allows either: + raw structure - nested dictionary until final value of `None` + help structure - nested dictionary where leaves are `--help` cmd output + + Returns: + None (underlying `io_dict` is mutated by the recursion) + """ + if isinstance(cli_element, (click.Group, click.CommandCollection)): + element_name = cli_element.name or "kedro" + io_dict[element_name] = {} + for command_name in cli_element.list_commands(ctx): + _recurse_cli( + cli_element.get_command(ctx, command_name), # type: ignore + ctx, + io_dict[element_name], + get_help, + ) + + elif isinstance(cli_element, click.Command): + if get_help: # gets formatted CLI help incl params for printing + io_dict[cli_element.name] = cli_element.get_help(ctx) + else: # gets params for structure purposes + nested_parameter_list = [option.opts for option in cli_element.get_params(ctx)] + io_dict[cli_element.name] = dict.fromkeys( + [item for sublist in nested_parameter_list for item in sublist], None + ) + + +def get_cli_structure( + cli_obj: click.Command | click.Group | click.CommandCollection, + get_help: bool = False, +) -> dict[str, Any]: + """Convenience wrapper function for `_recurse_cli` to work within + `click.Context` and return a `dict`. 
+ """ + output: dict[str, Any] = dict() + with click.Context(cli_obj) as ctx: # type: ignore + _recurse_cli(cli_obj, ctx, output, get_help) + return output diff --git a/tools/databricks_build.py b/tools/databricks_build.py new file mode 100644 index 0000000000..fcd56b08d7 --- /dev/null +++ b/tools/databricks_build.py @@ -0,0 +1,96 @@ +import logging +import os +from pathlib import Path + +from databricks_cli.clusters.api import ClusterApi +from databricks_cli.configure.provider import get_config +from databricks_cli.dbfs.api import DbfsApi +from databricks_cli.dbfs.dbfs_path import DbfsPath +from databricks_cli.libraries.api import LibrariesApi +from databricks_cli.sdk.api_client import ApiClient + +CLUSTER_ID = os.environ["DATABRICKS_CLUSTER_ID"] +DBFS_UPLOAD_PATH = DbfsPath("dbfs:/tmp/kedro-builds") + + +def _uninstall_existing_build() -> None: + """Uninstall an existing build with the same name as the build to install.""" + api_client = _get_api_client() + library_api = LibrariesApi(api_client) + libraries = [ + {"whl": f"{DBFS_UPLOAD_PATH.absolute_path}/{_get_build_file_path().name}"} + ] + library_api.uninstall_libraries(CLUSTER_ID, libraries) + logging.info("Triggered uninstall of Kedro wheel file on %s", CLUSTER_ID) + + +def _restart_cluster_if_running() -> None: + """Restart a Databricks cluster if it is currently running, otherwise no-op.""" + api_client = _get_api_client() + cluster_api = ClusterApi(api_client) + if cluster_api.get_cluster(CLUSTER_ID)["state"] == "TERMINATED": + logging.info( + "Cluster %s is not currently running. Launch it manually to apply" + "changes", + CLUSTER_ID, + ) + return + logging.info("Cluster %s is being restarted to apply changes.", CLUSTER_ID) + cluster_api.restart_cluster(CLUSTER_ID) + + +def _upload_build_to_dbfs() -> None: + """Upload the wheel file at the given path to DBFS.""" + api_client = _get_api_client() + dbfs_api = DbfsApi(api_client) + src_path = str(_get_build_file_path()) + dbfs_api.put_file( + src_path, + DbfsPath(f"{DBFS_UPLOAD_PATH.absolute_path}/{_get_build_file_path().name}"), + overwrite=True, + ) + logging.info("Uploaded Kedro wheel file to %s") + + +def _install_build() -> None: + """Install Kedro on the target cluster using the uploaded wheel file""" + api_client = _get_api_client() + library_api = LibrariesApi(api_client) + libraries = [ + {"whl": f"{DBFS_UPLOAD_PATH.absolute_path}/{_get_build_file_path().name}"} + ] + library_api.install_libraries(CLUSTER_ID, libraries) + logging.info("Triggered install of Kedro wheel file on %s", CLUSTER_ID) + + +def _get_api_client() -> ApiClient: + """Create an ApiClient object using the config""" + config = get_config() + if config.is_valid_with_token: + return ApiClient(host=config.host, token=config.token) + return ApiClient(user=config.username, password=config.password, host=config.host) + + +def _get_build_file_path() -> Path: + """Get the path of the whl file to install. If multiple whl files are found, + return the file with the highest version number. 
+ """ + dist_path = Path(__file__).resolve().parent.parent / "dist" + whl_files = list(dist_path.glob("*.whl")) + whl_files.sort() + try: + return whl_files[-1] + except IndexError: + raise ValueError("No wheel files found in dist directory.") + + +def main() -> None: + """Main entry point for the script.""" + _uninstall_existing_build() + _restart_cluster_if_running() + _upload_build_to_dbfs() + _install_build() + + +if __name__ == "__main__": + main() diff --git a/tools/ipython/README.md b/tools/ipython/README.md deleted file mode 100644 index 87c8a9c9cf..0000000000 --- a/tools/ipython/README.md +++ /dev/null @@ -1,8 +0,0 @@ -> This directory contains extra scripts that can help improve your Kedro experience for certain use cases. Those are not essential for using Kedro CLI or library components. - - -## `ipython_loader.py` - -This script helps to locate `.ipython` directory and run IPython startup scripts in it when working with Jupyter Notebooks and IPython sessions. This script will automatically locate `.ipython/profile_default/startup` directory starting from the current working directory and going up the directory tree. If the directory was found, all Python scripts in it are be executed. - -The details can be found in [the user guide](https://kedro.readthedocs.io/en/stable/04_user_guide/11_ipython.html#ipython-loader). diff --git a/tools/ipython/__init__.py b/tools/ipython/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tools/ipython/ipython_loader.py b/tools/ipython/ipython_loader.py deleted file mode 100644 index 4e53b90aa3..0000000000 --- a/tools/ipython/ipython_loader.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script helps to locate IPython startup directory and run all Python scripts in -it when working with Jupyter Notebooks and IPython sessions. 
-""" - -import contextlib -import pathlib -import typing - - -def locate_ipython_startup_dir( - start_dir: typing.Union[pathlib.Path, str] = None -) -> typing.Union[pathlib.Path, None]: - """Locate `.ipython` directory recursively starting from `start_dir` directory - and going up the directory tree. - - Args: - start_dir: The directory where the search starts. Defaults to the current - working directory. - - Returns: - Path to `.ipython/profile_default/startup` directory or None if - that has not been found. - - """ - this_script_dir = pathlib.Path(__file__).parent.resolve() - current_dir = pathlib.Path(start_dir or pathlib.Path.cwd()).expanduser().resolve() - - while True: - startup_dir = current_dir / ".ipython" / "profile_default" / "startup" - if startup_dir.is_dir() and startup_dir != this_script_dir: - return startup_dir - if current_dir.parent == current_dir: - break # reached the root of the file system - current_dir = current_dir.parent - return None - - -@contextlib.contextmanager -def modify_globals(**kwargs: typing.Any): - """Temporarily modifies globals() before they are passed to exec(). - - Args: - kwargs: New keys to add/modify in the globals. - - Yields: - None: None. - """ - globals_ = globals() - overwritten = {k: globals_[k] for k in globals_.keys() & kwargs.keys()} - try: - globals_.update(kwargs) - yield - finally: - for var in kwargs: - globals_.pop(var, None) - globals_.update(overwritten) - - -def run_startup_scripts(startup_dir: pathlib.Path): - """Run all Python scripts from the startup directory. - - Args: - startup_dir: Path to IPython startup directory. - - """ - # pylint: disable=import-outside-toplevel - import logging - from sys import stdout - - fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - logging.basicConfig(format=fmt, stream=stdout) - - startup_dir = startup_dir.resolve() - startup_scripts = sorted(f_ for f_ in startup_dir.rglob("*.py") if f_.is_file()) - - for script in startup_scripts: - with modify_globals(__file__=str(script)): - try: - compiled = compile( - script.read_text(encoding="utf-8"), str(script), "exec" - ) - exec(compiled, globals()) # pylint: disable=exec-used # nosec - except Exception as err: # pylint: disable=broad-except - logging.error( - "Startup script `%s` failed:\n%s: %s", - str(script), - err.__class__.__name__, - str(err), - ) - else: - logging.info("Startup script `%s` successfully executed", str(script)) - - -def main(): - """Locate IPython startup directory and run all Python scripts in it.""" - startup_dir = locate_ipython_startup_dir() - if startup_dir: - run_startup_scripts(startup_dir) - - -if __name__ == "__main__": # pragma: no cover - main() - - # cleanup the global scope - del contextlib, pathlib, typing - del locate_ipython_startup_dir, modify_globals, run_startup_scripts, main diff --git a/tools/license_and_headers.py b/tools/license_and_headers.py deleted file mode 100644 index c16d319d2f..0000000000 --- a/tools/license_and_headers.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. 
diff --git a/tools/license_and_headers.py b/tools/license_and_headers.py
deleted file mode 100644
index c16d319d2f..0000000000
--- a/tools/license_and_headers.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright 2021 QuantumBlack Visual Analytics Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
-# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
-# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-#
-# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
-# (either separately or in combination, "QuantumBlack Trademarks") are
-# trademarks of QuantumBlack. The License does not grant you any right or
-# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
-# Trademarks or any confusingly similar mark as a trademark for your product,
-# or use the QuantumBlack Trademarks in any other manner that might cause
-# confusion in the marketplace, including but not limited to in advertising,
-# on websites, or on software.
-#
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import glob
-import sys
-from itertools import chain
-from textwrap import indent
-
-PATHS_REQUIRING_HEADER = ["kedro", "tests"]
-LICENSE_MD = "LICENSE.md"
-
-RED_COLOR = "\033[0;31m"
-NO_COLOR = "\033[0m"
-
-
-def files_at_path(path: str):
-    return glob.iglob(path + "/**/*.py", recursive=True)
-
-
-def files_missing_substring(file_names, substring):
-    for file_name in file_names:
-        with open(file_name, "r", encoding="utf-8") as current_file:
-            content = current_file.read()
-
-        if content.strip() and substring not in content:
-            yield file_name
-
-
-def main():
-    with open(LICENSE_MD) as header_f:
-        header = indent(header_f.read(), " ")
-        header = indent(header, "#", lambda line: True)
-
-    # find all .py files recursively
-    files = chain.from_iterable(files_at_path(path) for path in PATHS_REQUIRING_HEADER)
-
-    # find all files which do not contain the header and are non-empty
-    files_with_missing_header = list(files_missing_substring(files, header))
-
-    # exit with an error and print all files without header in read, if any
-    if files_with_missing_header:
-        sys.exit(
-            RED_COLOR
-            + "The legal header is missing from the following files:\n- "
-            + "\n- ".join(files_with_missing_header)
-            + NO_COLOR
-            + "\nPlease add it by copy-pasting the below:\n\n"
-            + header
-            + "\n"
-        )
-
-
-if __name__ == "__main__":
-    main()
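Note (not part of the diff): the deleted checker builds the expected header by commenting out `LICENSE.md` with `textwrap.indent`. The `lambda line: True` predicate matters because `indent()` skips whitespace-only lines by default, and the blank lines inside the header must also carry a leading `#`. A small illustration, with the license text shortened to a made-up stand-in:

```python
# Illustration only: how textwrap.indent turns license text into a "#"-prefixed block.
from textwrap import indent

license_text = "Apache License, Version 2.0\n\nLicensed under ...\n"  # shortened stand-in

header = indent(license_text, " ")               # prefix non-blank lines with a space
header = indent(header, "#", lambda line: True)  # prefix *every* line, blanks included
print(header)
# Expected output:
# # Apache License, Version 2.0
# #
# # Licensed under ...
```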
diff --git a/tools/print_env.sh b/tools/print_env.sh
index daa2b0ce15..0a559a6d25 100755
--- a/tools/print_env.sh
+++ b/tools/print_env.sh
@@ -1,33 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2021 QuantumBlack Visual Analytics Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
-# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
-# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-#
-# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
-# (either separately or in combination, "QuantumBlack Trademarks") are
-# trademarks of QuantumBlack. The License does not grant you any right or
-# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
-# Trademarks or any confusingly similar mark as a trademark for your product,
-# or use the QuantumBlack Trademarks in any other manner that might cause
-# confusion in the marketplace, including but not limited to in advertising,
-# on websites, or on software.
-#
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 set -e
 
 print_sep="=============================="
@@ -44,13 +16,9 @@ eval_command() {
 eval_command CONDA "conda info 2>/dev/null || echo \"Conda not found\""
 eval_command PYTHON "which python && python -V"
 eval_command PIP "python -m pip -V"
-eval_command PYLINT "python -m pylint --version"
 eval_command PYTEST "python -m pytest --version"
 eval_command BLACK "python -m black --version"
 eval_command BEHAVE "python -m behave --version"
-eval_command MYPY "python -m mypy --version"
-eval_command FLAKE8 "python -m flake8 --version"
-eval_command ISORT "python -m isort --version"
 eval_command PRE-COMMIT "python -m pre_commit --version"
 eval_command SPARK "python -c \\
   \"import pyspark; print(f'PySpark: {pyspark.__version__}')\" 2>/dev/null && \\
diff --git a/trufflehog-ignore.txt b/trufflehog-ignore.txt
index d59ab4b6b8..cb5551a327 100644
--- a/trufflehog-ignore.txt
+++ b/trufflehog-ignore.txt
@@ -1,6 +1,7 @@
 docs/package.json
 docs/package-lock.json
 docs/source/meta/images/KedroArchitecture.drawio
+docs/source/nodes_and_pipelines/nodes.md
 static/img/kedro_gitflow.svg
 .idea/
 .git/
@@ -8,3 +9,5 @@ static/img/kedro_gitflow.svg
 .coverage.*
 .*\.log
 .*\.iml
+tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py
+docs/source/meta/images/kedro_gitflow.svg
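Note (not part of the diff): judging by the existing entries with escaped dots (`.*\.log`, `.*\.iml`), the lines in `trufflehog-ignore.txt` appear to be treated as regular expressions matched against repository paths, so the newly added literal paths also behave as (mostly literal) patterns. A quick, purely illustrative way to sanity-check a new pattern against sample paths; the exact matching semantics used by the scanner are an assumption here:

```python
# Illustration only: check which sample paths a candidate ignore pattern would match.
import re

patterns = [
    r"docs/source/nodes_and_pipelines/nodes\.md",  # hypothetical fully escaped variant
    r".*\.log",                                    # taken from the existing ignore file
]
sample_paths = ["docs/source/nodes_and_pipelines/nodes.md", "logs/info.log", "src/app.py"]

for path in sample_paths:
    matched = any(re.match(pattern, path) for pattern in patterns)
    print(f"{path}: {'ignored' if matched else 'scanned'}")
```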