From faf18e7bca99c2f111a010be5d3ae06f5e1350fc Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 2 Jan 2024 23:59:28 -0600 Subject: [PATCH] added sampling and string length tests --- README.md | 4 +- integration_tests/packages.yml | 4 +- .../string_length_colnames_with_spaces.sql | 32 +++++ .../string_length_users.sql | 39 ++++++ .../test_selection_users.sql | 18 ++- macros/helpers/sql_functions.sql | 8 ++ macros/schema.yml | 52 +++++++ .../test_aggregation/get_test_suggestions.sql | 16 ++- .../get_accepted_values_test_suggestions.sql | 16 ++- .../get_range_test_suggestions.sql | 16 ++- .../get_string_length_test_suggestions.sql | 129 ++++++++++++++++++ .../get_uniqueness_test_suggestions.sql | 22 ++- 12 files changed, 346 insertions(+), 10 deletions(-) create mode 100644 integration_tests/tests/generate_string_length_tests/string_length_colnames_with_spaces.sql create mode 100644 integration_tests/tests/generate_string_length_tests/string_length_users.sql create mode 100644 macros/helpers/sql_functions.sql create mode 100644 macros/schema.yml create mode 100644 macros/test_generation/get_string_length_test_suggestions.sql diff --git a/README.md b/README.md index 8643b41..39013aa 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,9 @@ ## About `dbt-testgen` is a [dbt](https://github.com/dbt-labs/dbt) package that autogenerates dbt test YAML based on real data. -Inspired by [dbt-codegen](https://github.com/dbt-labs/dbt-codegen) and [deequ Constraint Suggestion](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md) +Code documentation is available [here](https://kgmcquate.github.io/dbt-testgen/). + +Inspired by [dbt-codegen](https://github.com/dbt-labs/dbt-codegen) and [deequ Constraint Suggestion](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md). ## Install `dbt-testgen` currently supports `dbt 1.2.x` or higher. 
diff --git a/integration_tests/packages.yml b/integration_tests/packages.yml index 3d6b503..ff73650 100644 --- a/integration_tests/packages.yml +++ b/integration_tests/packages.yml @@ -1,4 +1,6 @@ packages: - local: ../ - package: dbt-labs/dbt_utils - version: 1.1.1 \ No newline at end of file + version: 1.1.1 + - package: calogica/dbt_expectations + version: 0.10.1 \ No newline at end of file diff --git a/integration_tests/tests/generate_string_length_tests/string_length_colnames_with_spaces.sql b/integration_tests/tests/generate_string_length_tests/string_length_colnames_with_spaces.sql new file mode 100644 index 0000000..0271eec --- /dev/null +++ b/integration_tests/tests/generate_string_length_tests/string_length_colnames_with_spaces.sql @@ -0,0 +1,32 @@ + + +{% set actual_yaml = testgen.to_yaml( + testgen.get_string_length_test_suggestions( + ref('colnames_with_spaces'), + sample=true, + limit=100 + ) + ) +%} + +{% set expected_yaml %} +models: +- name: colnames_with_spaces + columns: + - name: first name + description: String length test generated by dbt-testgen + tests: + - dbt_expectations.expect_column_value_lengths_to_be_between: + min_value: 3 + max_value: 5 + row_condition: '"first name" is not null' + - name: current city + description: String length test generated by dbt-testgen + tests: + - dbt_expectations.expect_column_value_lengths_to_be_between: + min_value: 7 + max_value: 13 + row_condition: '"current city" is not null' +{% endset %} + +{{ assert_equal (actual_yaml | trim, expected_yaml | trim) }} \ No newline at end of file diff --git a/integration_tests/tests/generate_string_length_tests/string_length_users.sql b/integration_tests/tests/generate_string_length_tests/string_length_users.sql new file mode 100644 index 0000000..8d6e2e4 --- /dev/null +++ b/integration_tests/tests/generate_string_length_tests/string_length_users.sql @@ -0,0 +1,39 @@ + + +{% set actual_yaml = testgen.to_yaml( + testgen.get_string_length_test_suggestions( + 
ref('users'), + sample=true, + limit=100 + ) + ) +%} + +{% set expected_yaml %} +models: +- name: users + columns: + - name: username + description: String length test generated by dbt-testgen + tests: + - dbt_expectations.expect_column_value_lengths_to_be_between: + min_value: 8 + max_value: 15 + row_condition: '"username" is not null' + - name: email + description: String length test generated by dbt-testgen + tests: + - dbt_expectations.expect_column_value_lengths_to_be_between: + min_value: 18 + max_value: 25 + row_condition: '"email" is not null' + - name: user_status + description: String length test generated by dbt-testgen + tests: + - dbt_expectations.expect_column_value_lengths_to_be_between: + min_value: 6 + max_value: 8 + row_condition: '"user_status" is not null' +{% endset %} + +{{ assert_equal (actual_yaml | trim, expected_yaml | trim) }} \ No newline at end of file diff --git a/integration_tests/tests/test_test_selection/test_selection_users.sql b/integration_tests/tests/test_test_selection/test_selection_users.sql index eda1fd7..33aff0d 100644 --- a/integration_tests/tests/test_test_selection/test_selection_users.sql +++ b/integration_tests/tests/test_test_selection/test_selection_users.sql @@ -21,22 +21,34 @@ models: min_value: 1 max_value: 30 - name: username - description: Uniqueness test generated by dbt-testgen + description: String length test generated by dbt-testgen tests: - unique - not_null + - dbt_expectations.expect_column_value_lengths_to_be_between: + min_value: 8 + max_value: 15 + row_condition: '"username" is not null' - name: email - description: Uniqueness test generated by dbt-testgen + description: String length test generated by dbt-testgen tests: - unique - not_null + - dbt_expectations.expect_column_value_lengths_to_be_between: + min_value: 18 + max_value: 25 + row_condition: '"email" is not null' - name: user_status - description: Accepted values test generated by dbt-testgen + description: String length test generated by 
dbt-testgen tests: - accepted_values: values: - active - inactive + - dbt_expectations.expect_column_value_lengths_to_be_between: + min_value: 6 + max_value: 8 + row_condition: '"user_status" is not null' - name: age description: Numeric range test generated by dbt-testgen tests: diff --git a/macros/helpers/sql_functions.sql b/macros/helpers/sql_functions.sql new file mode 100644 index 0000000..f72cee9 --- /dev/null +++ b/macros/helpers/sql_functions.sql @@ -0,0 +1,8 @@ +{# Returns the NAME of the SQL random-number function for the active adapter; callers append "()" themselves, e.g. "ORDER BY " ~ testgen.get_random_function() ~ "()". #} +{% macro get_random_function() %} + {{ return(adapter.dispatch('get_random_function', 'testgen')()) }} +{% endmacro %} +{# Default implementation, used when no adapter-specific <adapter>__get_random_function macro exists. Takes no arguments, matching the dispatcher's call. #} +{% macro default__get_random_function() %} + {{ return("RANDOM") }} +{% endmacro %} diff --git a/macros/schema.yml b/macros/schema.yml new file mode 100644 index 0000000..232b4a9 --- /dev/null +++ b/macros/schema.yml @@ -0,0 +1,45 @@ +version: 2 + +macros: + - name: get_test_suggestions + description: Generates YAML schema file that includes tests for your data + arguments: + - name: table_relation + type: Relation + description: | + The [dbt Relation](https://docs.getdbt.com/reference/dbt-classes#relation) + you wish to generate tests for. + Example: ref("mymodel") + - name: sample + type: bool + description: Take a random sample when using the `limit` argument + - name: limit + type: integer + description: Use only this number of records to generate tests. + - name: resource_type + type: string + description: The type of resource that `table_relation` is - 'models', 'seeds', or 'sources' + - name: column_config + type: dict + description: "Configurations to set on columns. Example - {'quote': true}" + - name: exclude_types + type: list + description: Column types to exclude from tests. + - name: exclude_cols + type: list + description: Columns to exclude from tests. 
+ - name: tags + type: list + description: Tags to put on the tests. + - name: tests + type: list + description: "Types of tests to generate. Example: ['uniqueness', 'accepted_values', 'range', 'string_length']" + - name: composite_key_length + type: integer + description: Max length of the composite key for uniqueness tests. + - name: dbt_config + type: dict + description: Existing parsed DBT Schema file to add tests onto. + - name: return_object + type: bool + description: Return the DBT Schema file as a dict object instead of printing YAML. \ No newline at end of file diff --git a/macros/test_aggregation/get_test_suggestions.sql b/macros/test_aggregation/get_test_suggestions.sql index c1770d1..87e4d2b 100644 --- a/macros/test_aggregation/get_test_suggestions.sql +++ b/macros/test_aggregation/get_test_suggestions.sql @@ -8,7 +8,7 @@ exclude_types = [], exclude_cols = [], tags = [], - tests = ["uniqueness", "accepted_values", "range"], + tests = ["uniqueness", "accepted_values", "range", "string_length"], composite_key_length = 1, dbt_config = None, return_object = false @@ -58,6 +58,20 @@ ) %} {% endif %} + {% if "string_length" in tests %} + {% set dbt_config = testgen.get_string_length_test_suggestions( + table_relation=table_relation, + sample=sample, + limit=limit, + resource_type=resource_type, + column_config=column_config, + exclude_types=exclude_types, + exclude_cols=exclude_cols, + tags=tags, + dbt_config=dbt_config + ) %} + {% endif %} + {% if return_object %} {{ return(dbt_config) }} {% else %} diff --git a/macros/test_generation/get_accepted_values_test_suggestions.sql b/macros/test_generation/get_accepted_values_test_suggestions.sql index 9874852..5854912 100644 --- a/macros/test_generation/get_accepted_values_test_suggestions.sql +++ b/macros/test_generation/get_accepted_values_test_suggestions.sql @@ -79,14 +79,28 @@ testgen.array_agg(column.column) ~ " AS UNIQUE_VALUES from ( select " ~ adapter.quote(column.column) ~ " - from " ~ table_relation ~ " + from base 
group by " ~ adapter.quote(column.column) ~ " ) t1 " ) %} {% endfor %} + {% if limit != None %} + {% if sample == true %} + {% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %} + {% else %} + {% set limit_stmt = "LIMIT " ~ limit %} + {% endif %} + {% else %} + {% set limit_stmt = "" %} + {% endif %} + {% set count_distinct_sql %} + WITH base AS ( + SELECT * FROM {{ table_relation }} + {{ limit_stmt }} + ) SELECT * FROM ( {{ count_distinct_exprs | join("\nUNION ALL\n") }} ) t2 diff --git a/macros/test_generation/get_range_test_suggestions.sql b/macros/test_generation/get_range_test_suggestions.sql index 2905c1f..97ed431 100644 --- a/macros/test_generation/get_range_test_suggestions.sql +++ b/macros/test_generation/get_range_test_suggestions.sql @@ -45,6 +45,16 @@ {% endif %} {% endfor %} + {% if limit != None %} + {% if sample == true %} + {% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %} + {% else %} + {% set limit_stmt = "LIMIT " ~ limit %} + {% endif %} + {% else %} + {% set limit_stmt = "" %} + {% endif %} + {% set min_max_exprs = [] %} {% for column in number_cols %} {% do min_max_exprs.append( @@ -52,12 +62,16 @@ "MIN(" ~ adapter.quote(column.column) ~ ") as COL_MIN, " ~ "MAX(" ~ adapter.quote(column.column) ~ ") as COL_MAX, " ~ loop.index ~ " AS ORDERING " ~ - "FROM " ~ table_relation + "FROM base" ) %} {% endfor %} {% set min_max_sql %} + WITH base AS ( + SELECT * FROM {{ table_relation }} + {{ limit_stmt }} + ) SELECT * FROM ( {{ min_max_exprs | join("\nUNION ALL\n") }} ) t1 diff --git a/macros/test_generation/get_string_length_test_suggestions.sql b/macros/test_generation/get_string_length_test_suggestions.sql new file mode 100644 index 0000000..90e30ee --- /dev/null +++ b/macros/test_generation/get_string_length_test_suggestions.sql @@ -0,0 +1,129 @@ + +{% macro get_string_length_test_suggestions( + table_relation, + sample = false, + limit = None, + resource_type = "models", + 
column_config = {}, + exclude_types = [], + exclude_cols = [], + tags = ["string_length"], + dbt_config = None + ) %} + {# Entry point: dispatches to the adapter-specific implementation. Only runs when `execute` is true; otherwise the macro returns nothing. #} + {% if execute %} + {{ return(adapter.dispatch('get_string_length_test_suggestions', 'testgen')(table_relation, sample, limit, resource_type, column_config, exclude_types, exclude_cols, tags, dbt_config, **kwargs)) }} + {% endif%} +{%- endmacro %} + + +{% macro default__get_string_length_test_suggestions( + table_relation, + sample = false, + limit = None, + resource_type = "models", + column_config = {}, + exclude_types = [], + exclude_cols = [], + tags = ["string_length"], + dbt_config = None + ) +%} + {# Suggests dbt_expectations string-length tests from the MIN/MAX LENGTH of every string column (optionally over a LIMITed or random sample) and merges them into dbt_config. NOTE: `tags` is accepted but not applied yet; see the disabled block below. #} + {# {% if tags != None %} + {% do test_config.update({"tags": tags}) %} + {% endif %} #} + + {% set columns = adapter.get_columns_in_relation(table_relation) %} + {% set columns = testgen.exclude_column_types(columns, exclude_types) %} + {% set columns = testgen.exclude_column_names(columns, exclude_cols) %} + + {% set string_cols = [] %} + {% for column in columns %} + {% if column.is_string() %} + {% do string_cols.append(column) %} + {% endif %} + {% endfor %} + + {% if limit != None %} + {% if sample == true %} + {% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %} + {% else %} + {% set limit_stmt = "LIMIT " ~ limit %} + {% endif %} + {% else %} + {% set limit_stmt = "" %} + {% endif %} + + {% set min_max_exprs = [] %} + {% for column in string_cols %} + {% do min_max_exprs.append( + "SELECT '" ~ column.column ~ "' AS COLNAME, " ~ + "MIN(LENGTH(" ~ adapter.quote(column.column) ~ ")) as COL_MIN, " ~ + "MAX(LENGTH(" ~ adapter.quote(column.column) ~ ")) as COL_MAX, " ~ + loop.index ~ " AS ORDERING " ~ + "FROM base + WHERE " ~ adapter.quote(column.column) ~ " IS NOT NULL" + ) %} + {% endfor %} + + + {% set min_max_sql %} + WITH base AS ( + SELECT * FROM {{ table_relation }} + {{ limit_stmt }} + ) + SELECT * FROM ( + {{ 
min_max_exprs | join("\nUNION ALL\n") }} + ) t1 + ORDER BY ORDERING ASC + {% endset %} + {# Skip the query when there are no string columns: the UNION ALL body would be empty and the SQL invalid. #} + {% set results = testgen.query_as_list(min_max_sql) if string_cols else [] %} + + {% set column_tests = [] %} + {% for result in results if result[1] is not none %} + {# Rows with a NULL minimum (no non-null values in the sampled data) are filtered out by the loop condition; otherwise NULL == NULL would emit a lengths_to_equal test with a null value. Assumes query_as_list maps SQL NULL to none - TODO confirm. #} + {% if result[1] == result[2] %} + {% set test = { + "dbt_expectations.expect_column_value_lengths_to_equal": { + "value": result[1], + "row_condition": adapter.quote(result[0]) ~ " is not null" + } + } + %} + {% else %} + {% set test = { + "dbt_expectations.expect_column_value_lengths_to_be_between": { + "min_value": result[1], + "max_value": result[2], + "row_condition": adapter.quote(result[0]) ~ " is not null" + } + } + %} + {% endif %} + + {% set col_config = { + "name": result[0], + "description": "String length test generated by dbt-testgen", + "tests": [test] + } + %} + + {% for k,v in column_config.items() %} + {% do col_config.update({k: v}) %} + {% endfor %} + + {% do column_tests.append(col_config) %} + {% endfor %} + + {% set model = {"name": table_relation.identifier, "columns": column_tests} %} + + {% set new_dbt_config = {resource_type: [model]} %} + + {% set merged_dbt_config = testgen.merge_dbt_configs(dbt_config, new_dbt_config) %} + + {% do return(merged_dbt_config) %} + +{% endmacro %} + diff --git a/macros/test_generation/get_uniqueness_test_suggestions.sql b/macros/test_generation/get_uniqueness_test_suggestions.sql index d68a8c4..6d2f633 100644 --- a/macros/test_generation/get_uniqueness_test_suggestions.sql +++ b/macros/test_generation/get_uniqueness_test_suggestions.sql @@ -76,6 +76,16 @@ {% set limit_expr = "" %} {% endif %} + {% if limit != None %} + {% if sample == true %} + {% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %} + {% else %} + {% set limit_stmt = "LIMIT " ~ limit %} + {% endif %} + {% else %} + {% set limit_stmt = "" %} + {% endif %} + {% set count_distinct_exprs = [] %} {% for column_combo in column_combinations %} {% set column_combo_quoted = [] %} @@ -85,19 +95,27 @@ {% do 
count_distinct_exprs.append( "SELECT " ~ loop.index ~ " AS ORDERING, count(1) AS CARDINALITY from ( - SELECT 1 FROM " ~ table_relation ~ " + SELECT 1 FROM base GROUP BY " ~ column_combo_quoted|join(", ") ~ " ) t" ) %} {% endfor %} {% set count_distinct_sql %} + WITH base AS ( + SELECT * FROM {{ table_relation }} + {{ limit_stmt }} + ) {{ count_distinct_exprs | join("\nUNION ALL\n") }} ORDER BY ordering ASC {% endset %} {% set count_sql %} - {{ "SELECT count(1) AS TABLE_COUNT FROM " ~ table_relation }} + WITH base AS ( + SELECT * FROM {{ table_relation }} + {{ limit_stmt }} + ) + SELECT count(1) AS TABLE_COUNT FROM base {% endset%} {% set table_count = testgen.query_as_list(count_sql)[0][0] %}