From faf18e7bca99c2f111a010be5d3ae06f5e1350fc Mon Sep 17 00:00:00 2001
From: kevin <you@example.com>
Date: Tue, 2 Jan 2024 23:59:28 -0600
Subject: [PATCH] added sampling and string length tests

---
 README.md                                     |   4 +-
 integration_tests/packages.yml                |   4 +-
 .../string_length_colnames_with_spaces.sql    |  32 +++++
 .../string_length_users.sql                   |  39 ++++++
 .../test_selection_users.sql                  |  18 ++-
 macros/helpers/sql_functions.sql              |   8 ++
 macros/schema.yml                             |  52 +++++++
 .../test_aggregation/get_test_suggestions.sql |  16 ++-
 .../get_accepted_values_test_suggestions.sql  |  16 ++-
 .../get_range_test_suggestions.sql            |  16 ++-
 .../get_string_length_test_suggestions.sql    | 129 ++++++++++++++++++
 .../get_uniqueness_test_suggestions.sql       |  22 ++-
 12 files changed, 346 insertions(+), 10 deletions(-)
 create mode 100644 integration_tests/tests/generate_string_length_tests/string_length_colnames_with_spaces.sql
 create mode 100644 integration_tests/tests/generate_string_length_tests/string_length_users.sql
 create mode 100644 macros/helpers/sql_functions.sql
 create mode 100644 macros/schema.yml
 create mode 100644 macros/test_generation/get_string_length_test_suggestions.sql

diff --git a/README.md b/README.md
index 8643b41..39013aa 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,9 @@
 ## About
 `dbt-testgen` is a [dbt](https://github.com/dbt-labs/dbt) package that autogenerates dbt test YAML based on real data.
 
-Inspired by [dbt-codegen](https://github.com/dbt-labs/dbt-codegen) and [deequ Constraint Suggestion](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md)
+Code documentation is available [here](https://kgmcquate.github.io/dbt-testgen/).
+
+Inspired by [dbt-codegen](https://github.com/dbt-labs/dbt-codegen) and [deequ Constraint Suggestion](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md).
 
 ## Install
 `dbt-testgen` currently supports `dbt 1.2.x` or higher.
diff --git a/integration_tests/packages.yml b/integration_tests/packages.yml
index 3d6b503..ff73650 100644
--- a/integration_tests/packages.yml
+++ b/integration_tests/packages.yml
@@ -1,4 +1,6 @@
 packages:
     - local: ../
     - package: dbt-labs/dbt_utils
-      version: 1.1.1
\ No newline at end of file
+      version: 1.1.1
+    - package: calogica/dbt_expectations
+      version: 0.10.1
\ No newline at end of file
diff --git a/integration_tests/tests/generate_string_length_tests/string_length_colnames_with_spaces.sql b/integration_tests/tests/generate_string_length_tests/string_length_colnames_with_spaces.sql
new file mode 100644
index 0000000..0271eec
--- /dev/null
+++ b/integration_tests/tests/generate_string_length_tests/string_length_colnames_with_spaces.sql
@@ -0,0 +1,32 @@
+{# Singular test: string length suggestions for a relation whose column names contain spaces #}
+
+{% set suggested_tests = testgen.get_string_length_test_suggestions(
+        ref('colnames_with_spaces'),
+        sample=true,
+        limit=100
+    )
+%}
+{% set actual_yaml = testgen.to_yaml(suggested_tests) %}
+
+{# YAML the macro is expected to render; compared after trimming surrounding whitespace #}
+{% set expected_yaml %}
+models:
+- name: colnames_with_spaces
+  columns:
+  - name: first name
+    description: String length test generated by dbt-testgen
+    tests:
+    - dbt_expectations.expect_column_value_lengths_to_be_between:
+        min_value: 3
+        max_value: 5
+        row_condition: '"first name" is not null'
+  - name: current city
+    description: String length test generated by dbt-testgen
+    tests:
+    - dbt_expectations.expect_column_value_lengths_to_be_between:
+        min_value: 7
+        max_value: 13
+        row_condition: '"current city" is not null'
+{% endset %}
+
+{{ assert_equal(actual_yaml | trim, expected_yaml | trim) }}
\ No newline at end of file
diff --git a/integration_tests/tests/generate_string_length_tests/string_length_users.sql b/integration_tests/tests/generate_string_length_tests/string_length_users.sql
new file mode 100644
index 0000000..8d6e2e4
--- /dev/null
+++ b/integration_tests/tests/generate_string_length_tests/string_length_users.sql
@@ -0,0 +1,39 @@
+{# Singular test: string length suggestions generated for the users relation #}
+
+{% set suggested_tests = testgen.get_string_length_test_suggestions(
+        ref('users'),
+        sample=true,
+        limit=100
+    )
+%}
+{% set actual_yaml = testgen.to_yaml(suggested_tests) %}
+
+{# YAML the macro is expected to render; compared after trimming surrounding whitespace #}
+{% set expected_yaml %}
+models:
+- name: users
+  columns:
+  - name: username
+    description: String length test generated by dbt-testgen
+    tests:
+    - dbt_expectations.expect_column_value_lengths_to_be_between:
+        min_value: 8
+        max_value: 15
+        row_condition: '"username" is not null'
+  - name: email
+    description: String length test generated by dbt-testgen
+    tests:
+    - dbt_expectations.expect_column_value_lengths_to_be_between:
+        min_value: 18
+        max_value: 25
+        row_condition: '"email" is not null'
+  - name: user_status
+    description: String length test generated by dbt-testgen
+    tests:
+    - dbt_expectations.expect_column_value_lengths_to_be_between:
+        min_value: 6
+        max_value: 8
+        row_condition: '"user_status" is not null'
+{% endset %}
+
+{{ assert_equal(actual_yaml | trim, expected_yaml | trim) }}
\ No newline at end of file
diff --git a/integration_tests/tests/test_test_selection/test_selection_users.sql b/integration_tests/tests/test_test_selection/test_selection_users.sql
index eda1fd7..33aff0d 100644
--- a/integration_tests/tests/test_test_selection/test_selection_users.sql
+++ b/integration_tests/tests/test_test_selection/test_selection_users.sql
@@ -21,22 +21,34 @@ models:
         min_value: 1
         max_value: 30
   - name: username
-    description: Uniqueness test generated by dbt-testgen
+    description: String length test generated by dbt-testgen
     tests:
     - unique
     - not_null
+    - dbt_expectations.expect_column_value_lengths_to_be_between:
+        min_value: 8
+        max_value: 15
+        row_condition: '"username" is not null'
   - name: email
-    description: Uniqueness test generated by dbt-testgen
+    description: String length test generated by dbt-testgen
     tests:
     - unique
     - not_null
+    - dbt_expectations.expect_column_value_lengths_to_be_between:
+        min_value: 18
+        max_value: 25
+        row_condition: '"email" is not null'
   - name: user_status
-    description: Accepted values test generated by dbt-testgen
+    description: String length test generated by dbt-testgen
     tests:
     - accepted_values:
         values:
         - active
         - inactive
+    - dbt_expectations.expect_column_value_lengths_to_be_between:
+        min_value: 6
+        max_value: 8
+        row_condition: '"user_status" is not null'
   - name: age
     description: Numeric range test generated by dbt-testgen
     tests:
diff --git a/macros/helpers/sql_functions.sql b/macros/helpers/sql_functions.sql
new file mode 100644
index 0000000..f72cee9
--- /dev/null
+++ b/macros/helpers/sql_functions.sql
@@ -0,0 +1,8 @@
+{# Returns the name (without parentheses) of the SQL random function, resolved through adapter dispatch. #}
+{% macro get_random_function() %}
+    {{ return(adapter.dispatch('get_random_function', 'testgen')()) }}
+{% endmacro %}
+{# Default implementation: returns "RANDOM". Takes no arguments, matching the dispatcher's call. #}
+{% macro default__get_random_function() %}
+    {{ return("RANDOM") }}
+{% endmacro %}
diff --git a/macros/schema.yml b/macros/schema.yml
new file mode 100644
index 0000000..232b4a9
--- /dev/null
+++ b/macros/schema.yml
@@ -0,0 +1,45 @@
+version: 2
+
+macros:
+  - name: get_test_suggestions
+    description: Generates a YAML schema file that includes tests for your data
+    arguments:
+      - name: table_relation
+        type: Relation
+        description: |
+            The [dbt Relation](https://docs.getdbt.com/reference/dbt-classes#relation) 
+            you wish to generate tests for.
+            Example: ref("mymodel")
+      - name: sample
+        type: bool
+        description: Take a random sample when using the `limit` argument
+      - name: limit
+        type: integer
+        description: Use only this number of records to generate tests.
+      - name: resource_type
+        type: string
+        description: The type of resource that `table_relation` is - 'models', 'seeds', or 'sources'
+      - name: column_config
+        type: dict
+        description: "Configurations to set on columns. Example - {'quote': true}"
+      - name: exclude_types
+        type: list
+        description: Column types to exclude from tests.
+      - name: exclude_cols
+        type: list
+        description: Columns to exclude from tests.
+      - name: tags
+        type: list
+        description: Tags to put on the tests.
+      - name: tests
+        type: list
+        description: "Types of tests to generate. Example: ['uniqueness', 'accepted_values', 'range', 'string_length']"
+      - name: composite_key_length
+        type: integer
+        description: Max length of the composite key for uniqueness tests.
+      - name: dbt_config
+        type: dict
+        description: Existing parsed DBT Schema file to add tests onto.
+      - name: return_object
+        type: bool
+        description: Return the DBT Schema file as a dict object instead of printing YAML.
\ No newline at end of file
diff --git a/macros/test_aggregation/get_test_suggestions.sql b/macros/test_aggregation/get_test_suggestions.sql
index c1770d1..87e4d2b 100644
--- a/macros/test_aggregation/get_test_suggestions.sql
+++ b/macros/test_aggregation/get_test_suggestions.sql
@@ -8,7 +8,7 @@
         exclude_types = [],
         exclude_cols = [],
         tags = [],
-        tests = ["uniqueness", "accepted_values", "range"],
+        tests = ["uniqueness", "accepted_values", "range", "string_length"],
         composite_key_length = 1,
         dbt_config = None,
         return_object = false
@@ -58,6 +58,20 @@
             ) %}
         {% endif %}
 
+        {% if "string_length" in tests %}
+            {% set dbt_config = testgen.get_string_length_test_suggestions(
+                table_relation=table_relation,
+                sample=sample,
+                limit=limit,
+                resource_type=resource_type,
+                column_config=column_config,
+                exclude_types=exclude_types,
+                exclude_cols=exclude_cols,
+                tags=tags,
+                dbt_config=dbt_config
+            ) %}
+        {% endif %}
+
         {% if return_object %}
             {{ return(dbt_config) }}
         {% else %}
diff --git a/macros/test_generation/get_accepted_values_test_suggestions.sql b/macros/test_generation/get_accepted_values_test_suggestions.sql
index 9874852..5854912 100644
--- a/macros/test_generation/get_accepted_values_test_suggestions.sql
+++ b/macros/test_generation/get_accepted_values_test_suggestions.sql
@@ -79,14 +79,28 @@
                 testgen.array_agg(column.column) ~ " AS UNIQUE_VALUES
             from (
                 select " ~ adapter.quote(column.column) ~ "
-                from " ~ table_relation ~ "
+                from base
                 group by " ~ adapter.quote(column.column) ~ "
             ) t1
             "
         ) %}
     {% endfor %}
 
+    {% if limit != None %}
+        {% if sample == true %}
+            {% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %}
+        {% else %}
+            {% set limit_stmt = "LIMIT " ~ limit %}
+        {% endif %}
+    {% else %}
+        {% set limit_stmt = "" %}
+    {% endif %}
+
     {% set count_distinct_sql %}
+        WITH base AS (
+            SELECT * FROM {{ table_relation }}
+            {{ limit_stmt }}
+        )
         SELECT * FROM (
             {{ count_distinct_exprs | join("\nUNION ALL\n") }}
         ) t2
diff --git a/macros/test_generation/get_range_test_suggestions.sql b/macros/test_generation/get_range_test_suggestions.sql
index 2905c1f..97ed431 100644
--- a/macros/test_generation/get_range_test_suggestions.sql
+++ b/macros/test_generation/get_range_test_suggestions.sql
@@ -45,6 +45,16 @@
         {% endif %}
     {% endfor %}
 
+    {% if limit != None %}
+        {% if sample == true %}
+            {% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %}
+        {% else %}
+            {% set limit_stmt = "LIMIT " ~ limit %}
+        {% endif %}
+    {% else %}
+        {% set limit_stmt = "" %}
+    {% endif %}
+
     {% set min_max_exprs = [] %}
     {% for column in number_cols %}
         {% do min_max_exprs.append(
@@ -52,12 +62,16 @@
                 "MIN(" ~ adapter.quote(column.column) ~ ") as COL_MIN, " ~ 
                 "MAX(" ~ adapter.quote(column.column) ~ ") as COL_MAX, " ~ 
                 loop.index ~ " AS ORDERING " ~ 
-            "FROM " ~ table_relation
+            "FROM base"
         ) %}
     {% endfor %}
 
 
     {% set min_max_sql %}
+        WITH base AS (
+            SELECT * FROM {{ table_relation }}
+            {{ limit_stmt }}
+        )
         SELECT * FROM (
             {{ min_max_exprs | join("\nUNION ALL\n") }}
         ) t1
diff --git a/macros/test_generation/get_string_length_test_suggestions.sql b/macros/test_generation/get_string_length_test_suggestions.sql
new file mode 100644
index 0000000..90e30ee
--- /dev/null
+++ b/macros/test_generation/get_string_length_test_suggestions.sql
@@ -0,0 +1,129 @@
+{# Entry point: suggests string length tests for table_relation by dispatching to the adapter-specific implementation. #}
+{% macro get_string_length_test_suggestions(
+        table_relation,
+        sample = false,
+        limit = None,
+        resource_type = "models",
+        column_config = {},
+        exclude_types = [],
+        exclude_cols = [],
+        tags = ["string_length"],
+        dbt_config = None
+    ) %}
+    {# Dispatch to the target DB's implementation, forwarding every argument positionally plus any extra kwargs; no return() is reached when `execute` is false #}
+    {% if execute %}
+        {{ return(adapter.dispatch('get_string_length_test_suggestions', 'testgen')(table_relation, sample, limit, resource_type, column_config, exclude_types, exclude_cols, tags, dbt_config, **kwargs)) }}
+    {% endif%}
+{%- endmacro %}
+
+
+{% macro default__get_string_length_test_suggestions(
+        table_relation,
+        sample = false,
+        limit = None,
+        resource_type = "models",
+        column_config = {},
+        exclude_types = [],
+        exclude_cols = [],
+        tags = ["string_length"],
+        dbt_config = None
+    )
+%}
+    {#
+        Suggests a dbt_expectations length test for each string column of
+        table_relation, from the MIN/MAX LENGTH of its non-null values. With
+        `limit`, only that many rows are profiled (random order when `sample`).
+        `column_config` entries are copied onto every generated column; `tags`
+        is accepted but not applied. Returns the result merged into dbt_config.
+    #}
+    {% set columns = adapter.get_columns_in_relation(table_relation) %}
+    {% set columns = testgen.exclude_column_types(columns, exclude_types) %}
+    {% set columns = testgen.exclude_column_names(columns, exclude_cols) %}
+
+    {% set string_cols = [] %}
+    {% for column in columns %}
+        {% if column.is_string() %}
+            {% do string_cols.append(column) %}
+        {% endif %}
+    {% endfor %}
+
+    {% if limit != None %}
+        {% if sample == true %}
+            {% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %}
+        {% else %}
+            {% set limit_stmt = "LIMIT " ~ limit %}
+        {% endif %}
+    {% else %}
+        {% set limit_stmt = "" %}
+    {% endif %}
+
+    {% set min_max_exprs = [] %}
+    {% for column in string_cols %}
+        {% do min_max_exprs.append(
+            "SELECT '" ~ column.column ~ "' AS COLNAME, " ~
+                "MIN(LENGTH(" ~ adapter.quote(column.column) ~ ")) as COL_MIN, " ~
+                "MAX(LENGTH(" ~ adapter.quote(column.column) ~ ")) as COL_MAX, " ~
+                loop.index ~ " AS ORDERING " ~
+            "FROM base " ~
+            "WHERE " ~ adapter.quote(column.column) ~ " IS NOT NULL"
+        ) %}
+    {% endfor %}
+
+    {# No string columns would leave the UNION ALL list empty (invalid SQL), so skip the query #}
+    {% if string_cols | length > 0 %}
+        {% set min_max_sql %}
+            WITH base AS (
+                SELECT * FROM {{ table_relation }}
+                {{ limit_stmt }}
+            )
+            SELECT * FROM (
+                {{ min_max_exprs | join("\nUNION ALL\n") }}
+            ) t1
+            ORDER BY ORDERING ASC
+        {% endset %}
+        {% set results = testgen.query_as_list(min_max_sql) %}
+    {% else %}
+        {% set results = [] %}
+    {% endif %}
+
+    {% set column_tests = [] %}
+    {% for result in results %}
+        {# MIN/MAX are NULL when the profiled rows hold no non-null value: nothing to suggest #}
+        {% if result[1] is not none %}
+            {% if result[1] == result[2] %}
+                {% set test = {
+                        "dbt_expectations.expect_column_value_lengths_to_equal": {
+                            "value": result[1],
+                            "row_condition": adapter.quote(result[0]) ~ " is not null"
+                        }
+                    }
+                %}
+            {% else %}
+                {% set test = {
+                        "dbt_expectations.expect_column_value_lengths_to_be_between": {
+                            "min_value": result[1],
+                            "max_value": result[2],
+                            "row_condition": adapter.quote(result[0]) ~ " is not null"
+                        }
+                    }
+                %}
+            {% endif %}
+            {% set col_config = {
+                    "name": result[0],
+                    "description": "String length test generated by dbt-testgen",
+                    "tests": [test]
+                }
+            %}
+            {% for k,v in column_config.items() %}
+                {% do col_config.update({k: v}) %}
+            {% endfor %}
+            {% do column_tests.append(col_config) %}
+        {% endif %}
+    {% endfor %}
+
+    {% set model = {"name": table_relation.identifier,  "columns": column_tests} %}
+    {% set new_dbt_config = {resource_type: [model]} %}
+    {% set merged_dbt_config = testgen.merge_dbt_configs(dbt_config, new_dbt_config) %}
+    {% do return(merged_dbt_config) %}
+{% endmacro %}
+
diff --git a/macros/test_generation/get_uniqueness_test_suggestions.sql b/macros/test_generation/get_uniqueness_test_suggestions.sql
index d68a8c4..6d2f633 100644
--- a/macros/test_generation/get_uniqueness_test_suggestions.sql
+++ b/macros/test_generation/get_uniqueness_test_suggestions.sql
@@ -76,6 +76,16 @@
         {% set limit_expr = "" %}
     {% endif %}
 
+    {% if limit != None %}
+        {% if sample == true %}
+            {% set limit_stmt = "ORDER BY " ~ testgen.get_random_function() ~ "() LIMIT " ~ limit %}
+        {% else %}
+            {% set limit_stmt = "LIMIT " ~ limit %}
+        {% endif %}
+    {% else %}
+        {% set limit_stmt = "" %}
+    {% endif %}
+
     {% set count_distinct_exprs = [] %}
     {% for column_combo in column_combinations %}
         {% set column_combo_quoted = [] %}
@@ -85,19 +95,27 @@
         {% do count_distinct_exprs.append(
             "SELECT " ~ loop.index ~ " AS ORDERING, count(1) AS CARDINALITY
             from (
-                SELECT 1 FROM " ~ table_relation ~ " 
+                SELECT 1 FROM base
                 GROUP BY " ~ column_combo_quoted|join(", ") ~ "
             ) t"
         ) %}
     {% endfor %}
 
     {% set count_distinct_sql %}
+    WITH base AS (
+            SELECT * FROM {{ table_relation }}
+            {{ limit_stmt }}
+        )
     {{ count_distinct_exprs | join("\nUNION ALL\n") }}
     ORDER BY ordering ASC
     {% endset %}
 
     {% set count_sql %}
-        {{ "SELECT count(1) AS TABLE_COUNT FROM " ~ table_relation }} 
+        WITH base AS (
+            SELECT * FROM {{ table_relation }}
+            {{ limit_stmt }}
+        )
+        SELECT count(1) AS TABLE_COUNT FROM base
     {% endset%}
 
     {% set table_count = testgen.query_as_list(count_sql)[0][0] %}