From 892cc8cc5d935f5f2d6a09a1a61d1c5e3fd1b660 Mon Sep 17 00:00:00 2001
From: Mike Alfare
Date: Fri, 21 Jun 2024 16:07:59 -0400
Subject: [PATCH 1/3] update the spark version to the current version

---
 tests/functional/adapter/test_python_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/functional/adapter/test_python_model.py b/tests/functional/adapter/test_python_model.py
index 1195cbd3e..a0f3b7c44 100644
--- a/tests/functional/adapter/test_python_model.py
+++ b/tests/functional/adapter/test_python_model.py
@@ -35,7 +35,7 @@ def model(dbt, spark):
         materialized='table',
         submission_method='job_cluster',
         job_cluster_config={
-            "spark_version": "7.3.x-scala2.12",
+            "spark_version": "12.2.x-scala2.12",
             "node_type_id": "i3.xlarge",
             "num_workers": 0,
             "spark_conf": {

From 021809a9257ef3fd52ef054085a9fd65b9af571f Mon Sep 17 00:00:00 2001
From: Mike Alfare
Date: Mon, 24 Jun 2024 18:37:50 -0400
Subject: [PATCH 2/3] update pin for pydantic to resolve
 https://github.com/explosion/spaCy/issues/12659

---
 tests/functional/adapter/test_python_model.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/functional/adapter/test_python_model.py b/tests/functional/adapter/test_python_model.py
index da535f512..2f0efa5c2 100644
--- a/tests/functional/adapter/test_python_model.py
+++ b/tests/functional/adapter/test_python_model.py
@@ -48,7 +48,7 @@ def model(dbt, spark):
                 "ResourceClass": "SingleNode"
             }
         },
-        packages=['spacy', 'torch', 'pydantic<1.10.3']
+        packages=['spacy', 'torch', 'pydantic>=1.10.8']
     )
     data = [[1,2]] * 10
     return spark.createDataFrame(data, schema=['test', 'test2'])
@@ -67,11 +67,23 @@ def model(dbt, spark):
 
 @pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
 class TestChangingSchemaSpark:
+    """
+    Confirm that we can set up a spot instance and pass the required packages into the Databricks job.
+
+    Notes:
+    - This test creates a spot instance on demand using the settings from `job_cluster_config`
+      in `models__simple_python_model` above. It takes several minutes to run because the cluster
+      must be created first. The job can be monitored via "Data Engineering > Job Runs" or
+      "Workflows > Job Runs" in the Databricks UI (instead of via the normal cluster).
+    - The `spark_version` argument will need to be updated periodically. It will eventually
+      become unsupported and start causing failures.
+    - See https://github.com/explosion/spaCy/issues/12659 for why we're pinning pydantic
+    """
+
     @pytest.fixture(scope="class")
     def models(self):
         return {"simple_python_model.py": models__simple_python_model}
 
-    @pytest.mark.skip("https://github.com/dbt-labs/dbt-spark/issues/1054")
     def test_changing_schema_with_log_validation(self, project, logs_dir):
         run_dbt(["run"])
         write_file(

From 9c033f0b397a3be85a4f9e3ddbf124d5dec48be5 Mon Sep 17 00:00:00 2001
From: Mike Alfare
Date: Mon, 24 Jun 2024 21:11:42 -0400
Subject: [PATCH 3/3] exclude koalas dataframes from test

---
 tests/functional/adapter/test_python_model.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tests/functional/adapter/test_python_model.py b/tests/functional/adapter/test_python_model.py
index 2f0efa5c2..cd798d1da 100644
--- a/tests/functional/adapter/test_python_model.py
+++ b/tests/functional/adapter/test_python_model.py
@@ -15,9 +15,22 @@ class TestPythonModelSpark(BasePythonModelTests):
 
 @pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
 class TestPySpark(BasePySparkTests):
-    @pytest.mark.skip("https://github.com/dbt-labs/dbt-spark/issues/1054")
     def test_different_dataframes(self, project):
-        return super().test_different_dataframes(project)
+        """
+        Test that python models are supported using dataframes from:
+        - pandas
+        - pyspark
+        - pyspark.pandas (formerly databricks.koalas)
+
+        Note:
+            The CI environment runs on Apache Spark >3.1, which ships koalas as pyspark.pandas.
+            The only Databricks runtime that still supports Apache Spark <=3.1 is 9.1 LTS,
+            which is EOL 2024-09-23. For more information, see:
+            - https://github.com/databricks/koalas
+            - https://docs.databricks.com/en/release-notes/runtime/index.html
+        """
+        results = run_dbt(["run", "--exclude", "koalas_df"])
+        assert len(results) == 3
 
 
 @pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
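
A note on the koalas exclusion in PATCH 3/3: on Apache Spark >=3.2 the koalas API ships in-box as pyspark.pandas, so the test keeps pandas-on-Spark coverage by excluding only the standalone `koalas_df` model rather than installing the archived databricks.koalas package; that is why the run asserts exactly 3 results. Below is a minimal sketch of what the surviving pandas-on-Spark case could look like as a dbt python model. The file name and column values are illustrative assumptions, not taken from these patches; the `def model(dbt, spark)` contract and `dbt.config()` call follow the diffs above, and a Spark >=3.2 runtime is assumed:

    # pandas_on_spark_model.py -- hypothetical model file, for illustration only
    import pyspark.pandas as ps  # formerly: import databricks.koalas as ks

    def model(dbt, spark):
        dbt.config(materialized='table')
        # Build a pandas-on-Spark dataframe (10 rows, matching the shape of
        # `[[1,2]] * 10` in the patches above), then hand dbt a plain Spark
        # DataFrame via to_spark().
        psdf = ps.DataFrame({'test': [1] * 10, 'test2': [2] * 10})
        return psdf.to_spark()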