From b564211a9d56d5890e10aa99de25528a736c1187 Mon Sep 17 00:00:00 2001 From: jasnonaz Date: Wed, 15 Sep 2021 09:14:24 -0400 Subject: [PATCH] Cherry Pick: not null proportion schema test (#411) * Add not_null_proportion schema test and related integration tests * Update CHANGELOG * Fix csv formatting and numeric typecasting Co-authored-by: Simo Tumelius --- CHANGELOG.md | 1 + README.md | 17 ++++++++++++ .../schema_tests/data_not_null_proportion.csv | 11 ++++++++ .../models/schema_tests/schema.yml | 11 ++++++++ macros/schema_tests/not_null_proportion.sql | 26 +++++++++++++++++++ 5 files changed, 66 insertions(+) create mode 100644 integration_tests/data/schema_tests/data_not_null_proportion.csv create mode 100644 macros/schema_tests/not_null_proportion.sql diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a7ef4f0..4db1437b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,6 +56,7 @@ If you were relying on the position to match up your optional arguments, this ma ## Features * Add new argument, `order_by`, to `get_column_values` (code originally in [#289](https://github.com/fishtown-analytics/dbt-utils/pull/289/) from [@clausherther](https://github.com/clausherther), merged via [#349](https://github.com/fishtown-analytics/dbt-utils/pull/349/)) * Add `slugify` macro, and use it in the pivot macro. :rotating_light: This macro uses the `re` module, which is only available in dbt v0.19.0+. As a result, this feature introduces a breaking change. ([#314](https://github.com/fishtown-analytics/dbt-utils/pull/314)) +* Add `not_null_proportion` schema test that allows the user to specify the minimum (`at_least`) tolerated proportion (e.g., `0.95`) of non-null values ## Under the hood * Update the default implementation of concat macro to use `||` operator ([#373](https://github.com/fishtown-analytics/dbt-utils/pull/314) from [@ChristopheDuong](https://github.com/ChristopheDuong)). 
Note this may be a breaking change for adapters that support `concat()` but not `||`, such as Apache Spark. diff --git a/README.md b/README.md index bb6a1f30..d461391c 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Check [dbt Hub](https://hub.getdbt.com/fishtown-analytics/dbt_utils/latest/) for - [cardinality_equality](#cardinality_equality-source) - [unique_where](#unique_where-source) - [not_null_where](#not_null_where-source) + - [not_null_proportion](#not_null_proportion-source) - [relationships_where](#relationships_where-source) - [mutually_exclusive_ranges](#mutually_exclusive_ranges-source) - [unique_combination_of_columns](#unique_combination_of_columns-source) @@ -252,6 +253,22 @@ models: where: "_deleted = false" ``` +#### not_null_proportion ([source](macros/schema_tests/not_null_proportion.sql)) +This test validates that the proportion of non-null values present in a column is within a specified range [`at_least`, `at_most`], where `at_most` is an optional argument (default: `1.0`). + +**Usage:** +```yaml +version: 2 + +models: + - name: my_model + columns: + - name: id + tests: + - dbt_utils.not_null_proportion: + at_least: 0.95 +``` + #### not_accepted_values ([source](macros/schema_tests/not_accepted_values.sql)) This test validates that there are no rows that match the given values. 
diff --git a/integration_tests/data/schema_tests/data_not_null_proportion.csv b/integration_tests/data/schema_tests/data_not_null_proportion.csv
new file mode 100644
index 00000000..b28bb8a2
--- /dev/null
+++ b/integration_tests/data/schema_tests/data_not_null_proportion.csv
@@ -0,0 +1,11 @@
+point_5,point_9
+1,1
+,2
+,3
+4,4
+5,5
+6,6
+,7
+,8
+,
+10,10
\ No newline at end of file
diff --git a/integration_tests/models/schema_tests/schema.yml b/integration_tests/models/schema_tests/schema.yml
index 25ce3ac8..6d484099 100644
--- a/integration_tests/models/schema_tests/schema.yml
+++ b/integration_tests/models/schema_tests/schema.yml
@@ -157,3 +157,14 @@ models:
               inclusive: true
               where: "id <> -1"
 
+  - name: data_not_null_proportion
+    columns:
+      - name: point_5
+        tests:
+          - dbt_utils.not_null_proportion:
+              at_least: 0.5
+              at_most: 0.5
+      - name: point_9
+        tests:
+          - dbt_utils.not_null_proportion:
+              at_least: 0.9
diff --git a/macros/schema_tests/not_null_proportion.sql b/macros/schema_tests/not_null_proportion.sql
new file mode 100644
index 00000000..20cb0258
--- /dev/null
+++ b/macros/schema_tests/not_null_proportion.sql
@@ -0,0 +1,48 @@
+{#
+  Schema test: passes when the proportion of non-null values in a column lies
+  within the inclusive range [at_least, at_most].
+
+  Arguments (read from kwargs):
+    column_name: column whose non-null proportion is measured
+    at_least:    minimum tolerated proportion of non-null values (required)
+    at_most:     maximum tolerated proportion of non-null values (default: 1)
+
+  The rendered query returns a single count: 0 when the proportion is in
+  range, 1 when it is not.
+#}
+{% macro test_not_null_proportion(model) %}
+  {# Hand off to the implementation resolved by adapter.dispatch, forwarding every test argument. #}
+  {{ return(adapter.dispatch('test_not_null_proportion', packages = dbt_utils._get_utils_namespaces())(model, **kwargs)) }}
+{% endmacro %}
+
+{% macro default__test_not_null_proportion(model) %}
+
+{% set column_name = kwargs.get('column_name', kwargs.get('arg')) %}
+{% set at_least = kwargs.get('at_least', kwargs.get('arg')) %}
+{% set at_most = kwargs.get('at_most', kwargs.get('arg', 1)) %}
+
+{# Fail at compile time instead of rendering `None` into the comparison below. #}
+{% if at_least is none %}
+  {{ exceptions.raise_compiler_error("not_null_proportion: the `at_least` argument is required") }}
+{% endif %}
+
+{#
+  nullif() makes the empty-model case explicit: the proportion is NULL, which
+  satisfies neither comparison, so an empty model passes the test.
+#}
+with validation as (
+  select
+    sum(case when {{ column_name }} is null then 0 else 1 end) / cast(nullif(count(*), 0) as numeric) as not_null_proportion
+  from {{ model }}
+),
+validation_errors as (
+  select
+    not_null_proportion
+  from validation
+  where not_null_proportion < {{ at_least }} or not_null_proportion > {{ at_most }}
+)
+select
+  count(*)
+from validation_errors
+
+{% endmacro %}