Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PySpark to the add-ons flow #3169

Merged
merged 31 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
832445a
add pyspark config to add-ons
SajidAlamQB Oct 12, 2023
312c6d2
add pyspark option in starters and prompts
SajidAlamQB Oct 12, 2023
0c81a33
add pyspark kedro hook and make it work with add-ons
SajidAlamQB Oct 12, 2023
985e867
add pyspark hook to settings.py
SajidAlamQB Oct 12, 2023
0d6ce55
add kedro-dataset[spark.SparkDataSet] to add-ons
SajidAlamQB Oct 12, 2023
302668a
dont add pyspark add ons for e2e test
SajidAlamQB Oct 12, 2023
8ccab73
fix test
SajidAlamQB Oct 12, 2023
251a092
changes based of review
SajidAlamQB Oct 13, 2023
1d2d233
Delete hooks.py
SajidAlamQB Oct 13, 2023
1d5b599
remove spark_config from utils.py
SajidAlamQB Oct 13, 2023
169e0be
remove parameter files from conf/base
SajidAlamQB Oct 13, 2023
f8f60aa
Merge branch 'develop' into pyspark-add-on-flow
SajidAlamQB Oct 19, 2023
5953484
fix pyspark add-ons
SajidAlamQB Oct 19, 2023
615b1a7
Add guard clause for parse_add_ons_input
SajidAlamQB Oct 20, 2023
d767a1d
Merge branch 'develop' into pyspark-add-on-flow
SajidAlamQB Oct 25, 2023
f19813a
changes for refactoring
SajidAlamQB Oct 25, 2023
f5967c1
Merge branch 'develop' into pyspark-add-on-flow
noklam Oct 25, 2023
9e7e71e
revert pyspark_path
SajidAlamQB Oct 25, 2023
07949c9
update add-ons example usage
SajidAlamQB Oct 25, 2023
7cbf619
ignore coverage on None add_ons_str
SajidAlamQB Oct 25, 2023
dd9658e
fix linting
noklam Oct 25, 2023
bf3fdf3
lint
SajidAlamQB Oct 25, 2023
f2bf5dd
Merge branch 'pyspark-add-on-flow' of https://github.com/kedro-org/ke…
SajidAlamQB Oct 25, 2023
343e805
refactor utils.py, strip pyproject.toml and update tests
SajidAlamQB Oct 26, 2023
9ef3129
Fix coverage
SajidAlamQB Oct 26, 2023
b3fa400
Revert "Fix coverage"
SajidAlamQB Oct 26, 2023
4111109
Revert "refactor utils.py, strip pyproject.toml and update tests"
SajidAlamQB Oct 26, 2023
6e49e69
changes based on review
SajidAlamQB Oct 26, 2023
00fdf1d
Update kedro/framework/cli/starters.py
SajidAlamQB Oct 26, 2023
fc9bca1
Update starters.py
SajidAlamQB Oct 26, 2023
6598768
lint
SajidAlamQB Oct 26, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion features/steps/cli_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def create_config_file(context):
context.root_project_dir = context.temp_dir / context.project_name
context.package_name = context.project_name.replace("-", "_")
config = {
"add_ons": "all",
"add_ons": "1-5",
SajidAlamQB marked this conversation as resolved.
Show resolved Hide resolved
"project_name": context.project_name,
"repo_name": context.project_name,
"output_dir": str(context.temp_dir),
Expand Down
41 changes: 33 additions & 8 deletions kedro/framework/cli/starters.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,10 @@ class KedroStarterSpec: # noqa: too-few-public-methods
3) Custom Logging: Provides more logging options\n
4) Documentation: Basic documentation setup with Sphinx\n
5) Data Structure: Provides a directory structure for storing data\n
6) Pyspark: Provides a basic PySpark set up\n

Example usage:\n
kedro new --addons=lint,test,log,docs,data (or any subset of these options)\n
kedro new --addons=lint,test,log,docs,data,pyspark (or any subset of these options)\n
kedro new --addons=all\n
kedro new --addons=none
"""
Expand All @@ -121,6 +122,7 @@ class KedroStarterSpec: # noqa: too-few-public-methods
"3": "Custom Logging",
"4": "Documentation",
"5": "Data Structure",
"6": "Pyspark",
}

# noqa: unused-argument
Expand Down Expand Up @@ -210,14 +212,17 @@ def _validate_range(start, end):
def _validate_selection(add_ons: list[str]):
for add_on in add_ons:
if int(add_on) < 1 or int(add_on) > len(ADD_ONS_DICT):
message = f"'{add_on}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5." # nosec
message = f"'{add_on}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5, 6." # nosec
click.secho(message, fg="red", err=True)
sys.exit(1)

if add_ons_str == "all":
return list(ADD_ONS_DICT)
if add_ons_str == "none":
return []
# Guard clause if add_ons_str is None, which can happen if prompts.yml is removed
if not add_ons_str:
return [] # pragma: no cover

# Split by comma
add_ons_choices = add_ons_str.split(",")
Expand Down Expand Up @@ -318,7 +323,10 @@ def new(config_path, starter_alias, selected_addons, checkout, directory, **kwar
config = _get_addons_from_cli_input(selected_addons, config)

cookiecutter_args = _make_cookiecutter_args(config, checkout, directory)
_create_project(template_path, cookiecutter_args)

project_template = fetch_template_based_on_add_ons(template_path, cookiecutter_args)

_create_project(project_template, cookiecutter_args)


@create_cli.group()
Expand Down Expand Up @@ -366,7 +374,14 @@ def _get_addons_from_cli_input(
Configuration for starting a new project, with the selected add-ons
from the `--addons` flag.
"""
string_to_number = {"lint": "1", "test": "2", "log": "3", "docs": "4", "data": "5"}
string_to_number = {
"lint": "1",
"test": "2",
"log": "3",
"docs": "4",
"data": "5",
"pyspark": "6",
}

if selected_addons is not None:
addons = selected_addons.split(",")
Expand All @@ -393,14 +408,14 @@ def _select_prompts_to_display(prompts_required: dict, selected_addons: str) ->
Returns:
the prompts_required dictionary, with all the redundant information removed.
"""
valid_addons = ["lint", "test", "log", "docs", "data", "all", "none"]
valid_addons = ["lint", "test", "log", "docs", "data", "pyspark", "all", "none"]

if selected_addons is not None:
addons = re.sub(r"\s", "", selected_addons).split(",")
for addon in addons:
if addon not in valid_addons:
click.secho(
"Please select from the available add-ons: lint, test, log, docs, data, all, none",
"Please select from the available add-ons: lint, test, log, docs, data, pyspark, all, none",
fg="red",
err=True,
)
Expand Down Expand Up @@ -493,6 +508,16 @@ def _make_cookiecutter_args(
return cookiecutter_args


def fetch_template_based_on_add_ons(template_path, cookiecutter_args: dict[str, Any]):
SajidAlamQB marked this conversation as resolved.
Show resolved Hide resolved
extra_context = cookiecutter_args["extra_context"]
add_ons = extra_context.get("add_ons")
if add_ons and "Pyspark" in add_ons:
cookiecutter_args["directory"] = "spaceflights-pyspark"
pyspark_path = "git+https://github.com/kedro-org/kedro-starters.git"
return pyspark_path
return template_path


def _create_project(template_path: str, cookiecutter_args: dict[str, Any]):
"""Creates a new kedro project using cookiecutter.

Expand Down Expand Up @@ -524,8 +549,8 @@ def _create_project(template_path: str, cookiecutter_args: dict[str, Any]):
)
add_ons = extra_context.get("add_ons")

# Only non-starter projects have configurable add-ons
if template_path == str(TEMPLATE_PATH):
# Only core template and spaceflights-pyspark have configurable add-ons
if template_path == str(TEMPLATE_PATH) or add_ons == "Pyspark":
SajidAlamQB marked this conversation as resolved.
Show resolved Hide resolved
if add_ons == "[]": # TODO: This should be a list
click.secho("\nYou have selected no add-ons")
else:
Expand Down
5 changes: 3 additions & 2 deletions kedro/templates/project/hooks/post_gen_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,19 @@
setup_template_add_ons,
sort_requirements,
)
from kedro.framework.cli.starters import _parse_add_ons_input


def main():
current_dir = Path.cwd()
requirements_file_path = current_dir / "requirements.txt"
pyproject_file_path = current_dir / "pyproject.toml"
python_package_name = '{{ cookiecutter.python_package }}'

# Get the selected add-ons from cookiecutter
selected_add_ons = "{{ cookiecutter.add_ons }}"

# Handle template directories and requirements according to selected add-ons
setup_template_add_ons(selected_add_ons, requirements_file_path, pyproject_file_path)
setup_template_add_ons(selected_add_ons, requirements_file_path, pyproject_file_path, python_package_name)

# Sort requirements.txt file in alphabetical order
sort_requirements(requirements_file_path)
Expand Down
35 changes: 34 additions & 1 deletion kedro/templates/project/hooks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
"""

docs_pyproject_requirements = """
[project.optional-dependencies]
docs = [
"docutils<0.18.0",
"sphinx~=3.4.3",
Expand All @@ -48,7 +49,7 @@
"""


def setup_template_add_ons(selected_add_ons_list, requirements_file_path, pyproject_file_path):
def setup_template_add_ons(selected_add_ons_list, requirements_file_path, pyproject_file_path, python_package_name):
"""Removes directories and files related to unwanted addons from
a Kedro project template. Adds the necessary requirements for
the addons that were selected.
Expand Down Expand Up @@ -96,6 +97,38 @@ def setup_template_add_ons(selected_add_ons_list, requirements_file_path, pyproj
if data_path.exists():
shutil.rmtree(str(data_path))

if "Pyspark" not in selected_add_ons_list: # If PySpark not selected
pass
else: # Use spaceflights-pyspark to create pyspark template
# Remove all .csv and .xlsx files from data/01_raw/
raw_data_path = current_dir / "data/01_raw/"
if raw_data_path.exists() and raw_data_path.is_dir():
for file_path in raw_data_path.glob("*.*"):
if file_path.suffix in [".csv", ".xlsx"]:
file_path.unlink()

# Remove parameter files from conf/base/
param_files = [
"parameters_data_processing.yml",
"parameters_data_science.yml",
]
conf_base_path = current_dir / "conf/base/"
if conf_base_path.exists() and conf_base_path.is_dir():
for param_file in param_files:
file_path = conf_base_path / param_file
if file_path.exists():
file_path.unlink()

# Remove specific pipeline subdirectories
pipelines_path = current_dir / f"src/{python_package_name}/pipelines/"
for pipeline_subdir in ["data_science", "data_processing"]:
shutil.rmtree(pipelines_path / pipeline_subdir, ignore_errors=True)

# Remove all test file from tests/pipelines/
test_pipeline_path = current_dir / "tests/pipelines/test_data_science.py"
if test_pipeline_path.exists():
test_pipeline_path.unlink()


def sort_requirements(requirements_file_path):
"""Sort the requirements.txt file in alphabetical order.
Expand Down
3 changes: 2 additions & 1 deletion kedro/templates/project/prompts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ add_ons:
3) Custom Logging : Provides more logging options
4) Documentation: Provides basic documentations setup with Sphinx
5) Data Structure: Provides a directory structure for storing data
6) PySpark : Provides a basic PySpark set up

Which add-ons would you like to include in your project? [1-5/1,3/all/none]:
Which add-ons would you like to include in your project? [1-6/1,3/all/none]:
regex_validator: "^(all|none|(\\d(,\\d)*|(\\d-\\d)))$"
error_message: |
Invalid input. Please select valid options for add-ons using comma-separated values, ranges, or 'all/none'.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,6 @@ dynamic = ["dependencies", "version"]

[project.entry-points."kedro.hooks"]

[project.optional-dependencies]
docs = [
"docutils<0.18.0",
"sphinx~=3.4.3",
"sphinx_rtd_theme==0.5.1",
"nbsphinx==0.8.1",
"sphinx-autodoc-typehints==1.11.1",
"sphinx_copybutton==0.3.1",
"ipykernel>=5.3, <7.0",
"Jinja2<3.1.0",
"myst-parser~=0.17.2",
]

[tool.setuptools.dynamic]
dependencies = {file = "requirements.txt"}
version = {attr = "{{ cookiecutter.python_package }}.__version__"}
Expand Down
23 changes: 16 additions & 7 deletions tests/framework/cli/test_starters.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,14 @@ def _make_cli_prompt_input_without_addons(


def _convert_addon_names_to_numbers(selected_addons: str):
string_to_number = {"lint": "1", "test": "2", "log": "3", "docs": "4", "data": "5"}
string_to_number = {
"lint": "1",
"test": "2",
"log": "3",
"docs": "4",
"data": "5",
"pyspark": "6",
}

addons = selected_addons.split(",")
for i in range(len(addons)):
Expand All @@ -81,6 +88,7 @@ def _get_expected_files(add_ons: str):
"3": 1,
"4": 2,
"5": 8,
"6": 2,
} # files added to template by each add-on
add_ons_list = _parse_add_ons_input(add_ons)

Expand Down Expand Up @@ -269,7 +277,7 @@ def test_starter_list_with_invalid_starter_plugin(
("1,2,3", ["1", "2", "3"]),
("2-4", ["2", "3", "4"]),
("3-3", ["3"]),
("all", ["1", "2", "3", "4", "5"]),
("all", ["1", "2", "3", "4", "5", "6"]),
("none", []),
],
)
Expand All @@ -291,12 +299,12 @@ def test_parse_add_ons_invalid_range(input, capsys):

@pytest.mark.parametrize(
"input,first_invalid",
[("0,3,5", "0"), ("1,3,6", "6"), ("0-4", "0"), ("3-6", "6")],
[("0,3,5", "0"), ("1,3,7", "7"), ("0-4", "0"), ("3-7", "7")],
merelcht marked this conversation as resolved.
Show resolved Hide resolved
)
def test_parse_add_ons_invalid_selection(input, first_invalid, capsys):
with pytest.raises(SystemExit):
_parse_add_ons_input(input)
message = f"'{first_invalid}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5."
message = f"'{first_invalid}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5, 6."
assert message in capsys.readouterr().err


Expand Down Expand Up @@ -881,7 +889,7 @@ def test_directory_flag_with_starter_alias(self, fake_kedro_cli):
class TestAddOnsFromUserPrompts:
@pytest.mark.parametrize(
"add_ons",
["1", "2", "3", "4", "5", "none", "2,3,4", "3-5", "all"],
["1", "2", "3", "4", "5", "6", "none", "2,3,4", "3-5", "all"],
)
def test_valid_add_ons(self, fake_kedro_cli, add_ons):
result = CliRunner().invoke(
Expand Down Expand Up @@ -913,7 +921,7 @@ def test_invalid_add_ons(self, fake_kedro_cli):
class TestAddOnsFromConfigFile:
@pytest.mark.parametrize(
"add_ons",
["1", "2", "3", "4", "5", "none", "2,3,4", "3-5", "all"],
["1", "2", "3", "4", "5", "6", "none", "2,3,4", "3-5", "all"],
)
def test_valid_add_ons(self, fake_kedro_cli, add_ons):
"""Test project created from config."""
Expand Down Expand Up @@ -963,6 +971,7 @@ class TestAddOnsFromCLI:
"log",
"docs",
"data",
"pyspark",
"none",
"test,log,docs",
"test,data,lint",
Expand Down Expand Up @@ -992,7 +1001,7 @@ def test_invalid_add_ons(self, fake_kedro_cli):

assert result.exit_code != 0
assert (
"Please select from the available add-ons: lint, test, log, docs, data, all, none"
"Please select from the available add-ons: lint, test, log, docs, data, pyspark, all, none"
in result.output
)

Expand Down