Skip to content

Commit

Permalink
added sampling and string length tests
Browse files Browse the repository at this point in the history
  • Loading branch information
kevin committed Jan 3, 2024
1 parent ddc92f6 commit faf18e7
Show file tree
Hide file tree
Showing 12 changed files with 346 additions and 10 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
## About
`dbt-testgen` is a [dbt](https://github.com/dbt-labs/dbt) package that autogenerates dbt test YAML based on real data.

Inspired by [dbt-codegen](https://github.com/dbt-labs/dbt-codegen) and [deequ Constraint Suggestion](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md)
Code documentation is available [here](https://kgmcquate.github.io/dbt-testgen/).

Inspired by [dbt-codegen](https://github.com/dbt-labs/dbt-codegen) and [deequ Constraint Suggestion](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md).

## Install
`dbt-testgen` currently supports `dbt 1.2.x` or higher.
Expand Down
4 changes: 3 additions & 1 deletion integration_tests/packages.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
packages:
- local: ../
- package: dbt-labs/dbt_utils
version: 1.1.1
version: 1.1.1
- package: calogica/dbt_expectations
version: 0.10.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@


{# Singular test: string-length suggestions for a relation whose column names contain spaces. #}
{# Step 1: build the suggestions for ref('colnames_with_spaces') with sample=true and limit=100. #}
{% set suggested_tests = testgen.get_string_length_test_suggestions(
    ref('colnames_with_spaces'),
    sample=true,
    limit=100
) %}

{# Step 2: serialize the suggestions with testgen.to_yaml. #}
{% set generated_yaml = testgen.to_yaml(suggested_tests) %}

{# Step 3: the YAML text the generated output is compared against (both sides are trimmed below). #}
{% set wanted_yaml %}
models:
- name: colnames_with_spaces
columns:
- name: first name
description: String length test generated by dbt-testgen
tests:
- dbt_expectations.expect_column_value_lengths_to_be_between:
min_value: 3
max_value: 5
row_condition: '"first name" is not null'
- name: current city
description: String length test generated by dbt-testgen
tests:
- dbt_expectations.expect_column_value_lengths_to_be_between:
min_value: 7
max_value: 13
row_condition: '"current city" is not null'
{% endset %}

{# Step 4: hand the trimmed generated and expected text to assert_equal (defined outside this file). #}
{{ assert_equal (generated_yaml | trim, wanted_yaml | trim) }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@


{# Singular test: string-length suggestions for the `users` relation. #}
{# Step 1: build the suggestions for ref('users') with sample=true and limit=100. #}
{% set suggested_tests = testgen.get_string_length_test_suggestions(
    ref('users'),
    sample=true,
    limit=100
) %}

{# Step 2: serialize the suggestions with testgen.to_yaml. #}
{% set generated_yaml = testgen.to_yaml(suggested_tests) %}

{# Step 3: the YAML text the generated output is compared against (both sides are trimmed below). #}
{% set wanted_yaml %}
models:
- name: users
columns:
- name: username
description: String length test generated by dbt-testgen
tests:
- dbt_expectations.expect_column_value_lengths_to_be_between:
min_value: 8
max_value: 15
row_condition: '"username" is not null'
- name: email
description: String length test generated by dbt-testgen
tests:
- dbt_expectations.expect_column_value_lengths_to_be_between:
min_value: 18
max_value: 25
row_condition: '"email" is not null'
- name: user_status
description: String length test generated by dbt-testgen
tests:
- dbt_expectations.expect_column_value_lengths_to_be_between:
min_value: 6
max_value: 8
row_condition: '"user_status" is not null'
{% endset %}

{# Step 4: hand the trimmed generated and expected text to assert_equal (defined outside this file). #}
{{ assert_equal (generated_yaml | trim, wanted_yaml | trim) }}
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,34 @@ models:
min_value: 1
max_value: 30
- name: username
description: Uniqueness test generated by dbt-testgen
description: String length test generated by dbt-testgen
tests:
- unique
- not_null
- dbt_expectations.expect_column_value_lengths_to_be_between:
min_value: 8
max_value: 15
row_condition: '"username" is not null'
- name: email
description: Uniqueness test generated by dbt-testgen
description: String length test generated by dbt-testgen
tests:
- unique
- not_null
- dbt_expectations.expect_column_value_lengths_to_be_between:
min_value: 18
max_value: 25
row_condition: '"email" is not null'
- name: user_status
description: Accepted values test generated by dbt-testgen
description: String length test generated by dbt-testgen
tests:
- accepted_values:
values:
- active
- inactive
- dbt_expectations.expect_column_value_lengths_to_be_between:
min_value: 6
max_value: 8
row_condition: '"user_status" is not null'
- name: age
description: Numeric range test generated by dbt-testgen
tests:
Expand Down
8 changes: 8 additions & 0 deletions macros/helpers/sql_functions.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

{% macro get_random_function() %}
    {# Look up the implementation of `get_random_function` registered under the
       `testgen` namespace via adapter.dispatch, call it with no arguments, and
       return whatever it returns. #}
    {% set random_function_impl = adapter.dispatch('get_random_function', 'testgen') %}
    {{ return(random_function_impl()) }}
{% endmacro %}

{% macro default__get_random_function(colname=none) %}
    {# Default implementation behind `get_random_function`: returns the string
       "RANDOM" (a function name without parentheses).

       `colname` is not used by this macro, and the `get_random_function`
       dispatcher invokes the implementation with no arguments. The parameter is
       kept, now explicitly optional, so any caller that still passes a value
       keeps working. #}
    {{ return("RANDOM") }}
{% endmacro %}
52 changes: 52 additions & 0 deletions macros/schema.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# dbt properties file documenting the macros shipped by this package.
version: 2

macros:
  - name: get_test_suggestions
    description: Generates YAML schema file that includes tests for your data
    arguments:
      - name: table_relation
        type: Relation
        description: |
          The [dbt Relation](https://docs.getdbt.com/reference/dbt-classes#relation)
          you wish to generate tests for.
          Example: ref("mymodel")
      - name: sample
        type: bool
        description: Take a random sample when using the `limit` argument
      - name: limit
        type: integer
        description: Use only this number of records to generate tests.
      - name: resource_type
        type: string
        description: The type of resource that `table_relation` is - 'models', 'seeds', or 'sources'
      - name: column_config
        type: dict
        description: "Configurations to set on columns. Example - {'quote': true}"
      - name: exclude_types
        type: list
        description: Column types to exclude from tests.
      - name: exclude_cols
        type: list
        description: Columns to exclude from tests.
      - name: tags
        type: list
        description: Tags to put on the tests.
      - name: tests
        type: list
        description: "Types of tests to generate. Example: ['uniqueness', 'accepted_values', 'range', 'string_length']"
      - name: composite_key_length
        type: integer
        description: Max length of the composite key for uniqueness tests.
      - name: dbt_config
        type: dict
        description: Existing parsed DBT Schema file to add tests onto.
      - name: return_object
        type: bool
        description: Return the DBT Schema file as a dict object instead of printing YAML.
16 changes: 15 additions & 1 deletion macros/test_aggregation/get_test_suggestions.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
exclude_types = [],
exclude_cols = [],
tags = [],
tests = ["uniqueness", "accepted_values", "range"],
tests = ["uniqueness", "accepted_values", "range", "string_length"],
composite_key_length = 1,
dbt_config = None,
return_object = false
Expand Down Expand Up @@ -58,6 +58,20 @@
) %}
{% endif %}

{% if "string_length" in tests %}
{% set dbt_config = testgen.get_string_length_test_suggestions(
table_relation=table_relation,
sample=sample,
limit=limit,
resource_type=resource_type,
column_config=column_config,
exclude_types=exclude_types,
exclude_cols=exclude_cols,
tags=tags,
dbt_config=dbt_config
) %}
{% endif %}

{% if return_object %}
{{ return(dbt_config) }}
{% else %}
Expand Down
16 changes: 15 additions & 1 deletion macros/test_generation/get_accepted_values_test_suggestions.sql
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,28 @@
testgen.array_agg(column.column) ~ " AS UNIQUE_VALUES
from (
select " ~ adapter.quote(column.column) ~ "
from " ~ table_relation ~ "
from base
group by " ~ adapter.quote(column.column) ~ "
) t1
"
) %}
{% endfor %}

{% if limit != None %}
{% if sample == true %}
{% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %}
{% else %}
{% set limit_stmt = "LIMIT " ~ limit %}
{% endif %}
{% else %}
{% set limit_stmt = "" %}
{% endif %}

{% set count_distinct_sql %}
WITH base AS (
SELECT * FROM {{ table_relation }}
{{ limit_stmt }}
)
SELECT * FROM (
{{ count_distinct_exprs | join("\nUNION ALL\n") }}
) t2
Expand Down
16 changes: 15 additions & 1 deletion macros/test_generation/get_range_test_suggestions.sql
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,33 @@
{% endif %}
{% endfor %}

{% if limit != None %}
{% if sample == true %}
{% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %}
{% else %}
{% set limit_stmt = "LIMIT " ~ limit %}
{% endif %}
{% else %}
{% set limit_stmt = "" %}
{% endif %}

{% set min_max_exprs = [] %}
{% for column in number_cols %}
{% do min_max_exprs.append(
"SELECT '" ~ column.column ~ "' AS COLNAME, " ~
"MIN(" ~ adapter.quote(column.column) ~ ") as COL_MIN, " ~
"MAX(" ~ adapter.quote(column.column) ~ ") as COL_MAX, " ~
loop.index ~ " AS ORDERING " ~
"FROM " ~ table_relation
"FROM base"
) %}
{% endfor %}


{% set min_max_sql %}
WITH base AS (
SELECT * FROM {{ table_relation }}
{{ limit_stmt }}
)
SELECT * FROM (
{{ min_max_exprs | join("\nUNION ALL\n") }}
) t1
Expand Down
Loading

0 comments on commit faf18e7

Please sign in to comment.