From f4a5f5733083954f0180cd120361fa1cfd0cf82d Mon Sep 17 00:00:00 2001 From: Callum McCann Date: Thu, 17 Nov 2022 16:21:38 -0600 Subject: [PATCH] making rolling unbound --- README.md | 14 +- ...e_count_metric__secondary_calculations.sql | 6 +- .../generate_secondary_calculation_alias.sql | 12 +- .../secondary_calculation_rolling.sql | 4 + .../rolling.sql | 3 - .../rolling/test_unbound_rolling.py | 158 ++++++++++++++++++ 6 files changed, 182 insertions(+), 15 deletions(-) create mode 100644 tests/functional/metric_options/secondary_calculations/rolling/test_unbound_rolling.py diff --git a/README.md b/README.md index 385cef37..c53c967c 100644 --- a/README.md +++ b/README.md @@ -265,14 +265,14 @@ Constructor: `metrics.period_to_date(aggregate, period [, alias, metric_list])` ## Rolling ([source](/macros/secondary_calculations/secondary_calculation_rolling.sql)) -The rolling secondary calculation performs an aggregation on a defined number of rows in metric dataset. For example, if the user selects the `week` grain and sets a rolling secondary calculation to `4` then the value returned will be a rolling 4 week calculation of whatever aggregation type was selected. +The rolling secondary calculation performs an aggregation on a number of rows in the metric dataset. For example, if the user selects the `week` grain and sets a rolling secondary calculation to `4` then the value returned will be a rolling 4 week calculation of whatever aggregation type was selected. If the `interval` input is not provided then the rolling calculation will be unbounded on all preceding rows. -Constructor: `metrics.rolling(aggregate, interval [, alias, metric_list])` +Constructor: `metrics.rolling(aggregate [, interval, alias, metric_list])` | Input | Example | Description | Required | | -------------------------- | ----------- | ----------- | -----------| | `aggregate` | `max`, `average` | The aggregation to use in the window function. 
Options vary based on the primary aggregation and are enforced in [validate_aggregate_coherence()](/macros/secondary_calculations/validate_aggregate_coherence.sql). | Yes | -| `interval` | 1 | Integer - the number of time grains to look back | Yes | +| `interval` | 1 | Integer - the number of time grains to look back | No | | `alias` | `month_to_date` | The column alias for the resulting calculation | No | | `metric_list` | `base_sum_metric` | List of metrics that the secondary calculation should be applied to. Default is all metrics selected | No | @@ -328,10 +328,10 @@ The metrics package contains validation on the configurations you're able to pro Below is the list of metric configs currently accepted by this package. -| Config | Type | Accepted Values | Default Value | Description | -|-----------------------------|---------|-----------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `enabled` | boolean | True/False | True | Enables or disables a metric node. When disabled, dbt will not consider it as part of your project. | -| `treat_null_values_as_zero` | boolean | True/False | True | Controls the `coalesce` behavior for metrics. By default, when there are no observations for a metric, the output of the metric as well as period Over period secondary calculations will include a `coalesce({{ field }}, 0)` to return 0's rather than nulls. Setting this config to False instead returns `NULL` values. | +| Config | Type | Accepted Values | Default Value | Description | +|--------|------|-----------------|---------------|-------------| +| `enabled` | boolean | True/False | True | Enables or disables a metric node. 
When disabled, dbt will not consider it as part of your project. | +| `treat_null_values_as_zero` | boolean | True/False | True | Controls the `coalesce` behavior for metrics. By default, when there are no observations for a metric, the output of the metric as well as period Over period secondary calculations will include a `coalesce({{ field }}, 0)` to return 0's rather than nulls. Setting this config to False instead returns `NULL` values. | ## All_Time Grain diff --git a/integration_tests/models/metric_testing_models/base_count_metric__secondary_calculations.sql b/integration_tests/models/metric_testing_models/base_count_metric__secondary_calculations.sql index 905e6529..c003cf8e 100644 --- a/integration_tests/models/metric_testing_models/base_count_metric__secondary_calculations.sql +++ b/integration_tests/models/metric_testing_models/base_count_metric__secondary_calculations.sql @@ -3,15 +3,15 @@ from {{ metrics.calculate( metric('base_count_metric'), grain='month', - start_date = '2021-01-01', - end_date = '2021-04-01', + start_date = '2022-01-01', + end_date = '2022-04-01', secondary_calculations=[ {"calculation": "period_over_period", "interval": 1, "comparison_strategy": "difference", "alias": "pop_1mth"}, {"calculation": "period_over_period", "interval": 1, "comparison_strategy": "ratio"}, {"calculation": "period_to_date", "aggregate": "sum", "period": "year", "alias": "ytd_sum"}, {"calculation": "period_to_date", "aggregate": "max", "period": "month"}, {"calculation": "rolling", "interval": 3, "aggregate": "average", "alias": "avg_3mth"}, - {"calculation": "rolling", "interval": 3, "aggregate": "sum"}, + {"calculation": "rolling", "aggregate": "sum"}, ] ) }} \ No newline at end of file diff --git a/macros/secondary_calculations/generate_secondary_calculation_alias.sql b/macros/secondary_calculations/generate_secondary_calculation_alias.sql index 36ea4f6f..3df019e4 100644 --- a/macros/secondary_calculations/generate_secondary_calculation_alias.sql +++ 
b/macros/secondary_calculations/generate_secondary_calculation_alias.sql @@ -23,9 +23,17 @@ {%- elif calc_type == 'rolling' %} {%- if is_multiple_metrics -%} - {%- do return(metric_name ~ "_" ~ "rolling_" ~ calc_config.aggregate ~ "_" ~ calc_config.interval ~ "_" ~ grain) %} + {%- if calc_config.interval -%} + {%- do return(metric_name ~ "_" ~ "rolling_" ~ calc_config.aggregate ~ "_" ~ calc_config.interval ~ "_" ~ grain) %} + {%- else -%} + {%- do return(metric_name ~ "_" ~ "rolling_" ~ calc_config.aggregate) %} + {%- endif -%} {%- else -%} - {%- do return("rolling_" ~ calc_config.aggregate ~ "_" ~ calc_config.interval ~ "_" ~ grain) %} + {%- if calc_config.interval -%} + {%- do return("rolling_" ~ calc_config.aggregate ~ "_" ~ calc_config.interval ~ "_" ~ grain) %} + {%- else -%} + {%- do return("rolling_" ~ calc_config.aggregate) %} + {%- endif -%} {%- endif -%} {%- elif calc_type == 'period_to_date' %} diff --git a/macros/secondary_calculations/secondary_calculation_rolling.sql b/macros/secondary_calculations/secondary_calculation_rolling.sql index 9088edec..420f7b76 100644 --- a/macros/secondary_calculations/secondary_calculation_rolling.sql +++ b/macros/secondary_calculations/secondary_calculation_rolling.sql @@ -6,7 +6,11 @@ partition by {{ dimensions | join(", ") }} {% endif -%} order by date_{{grain}} + {% if calc_config.interval %} rows between {{ calc_config.interval - 1 }} preceding and current row + {% else %} + rows between unbounded preceding and current row + {% endif %} ) {% endset %} diff --git a/macros/secondary_calculations_configuration/rolling.sql b/macros/secondary_calculations_configuration/rolling.sql index e0a4b55c..66f7787b 100644 --- a/macros/secondary_calculations_configuration/rolling.sql +++ b/macros/secondary_calculations_configuration/rolling.sql @@ -4,9 +4,6 @@ {% if not aggregate %} {% set _ = missing_args.append("aggregate") %} {% endif %} - {% if not interval %} - {% set _ = missing_args.append("interval") %} - {% endif %} {% if 
missing_args | length > 0 %} {% do exceptions.raise_compiler_error( missing_args | join(", ") ~ ' not provided to rolling') %} {% endif %} diff --git a/tests/functional/metric_options/secondary_calculations/rolling/test_unbound_rolling.py b/tests/functional/metric_options/secondary_calculations/rolling/test_unbound_rolling.py new file mode 100644 index 00000000..a56ae807 --- /dev/null +++ b/tests/functional/metric_options/secondary_calculations/rolling/test_unbound_rolling.py @@ -0,0 +1,158 @@ +from struct import pack +import os +import pytest +from dbt.tests.util import run_dbt + +# our file contents +from tests.functional.fixtures import ( + fact_orders_source_csv, + fact_orders_sql, + fact_orders_yml, +) + +# models/rolling_count.sql +rolling_count_sql = """ +select * +from +{{ metrics.calculate(metric('rolling_count'), + grain='week', + secondary_calculations=[ + metrics.rolling(aggregate="min"), + metrics.rolling(aggregate="max"), + metrics.rolling(aggregate="sum"), + metrics.rolling(aggregate="average") + ] + ) +}} +""" + +# models/rolling_count.yml +rolling_count_yml = """ +version: 2 +models: + - name: rolling_count + tests: + - metrics.metric_equality: + compare_model: ref('rolling_count__expected') +metrics: + - name: rolling_count + model: ref('fact_orders') + label: Count Distinct + timestamp: order_date + time_grains: [day, week, month] + calculation_method: count + expression: customer_id + dimensions: + - had_discount + - order_country +""" + +# seeds/rolling_count__expected.csv +if os.getenv('dbt_target') == 'snowflake': + rolling_count__expected_csv = """ +date_week,rolling_count,rolling_count_rolling_min,rolling_count_rolling_max,rolling_count_rolling_sum,rolling_count_rolling_average +2022-01-03,2,2,2,2,2.000 +2022-01-10,1,1,2,3,1.500 +2022-01-17,3,1,3,6,2.000 +2022-01-24,1,1,3,7,1.750 +2022-01-31,1,1,3,8,1.600 +2022-02-07,1,1,3,9,1.500 +2022-02-14,1,1,3,10,1.428 +""".lstrip() +else: + rolling_count__expected_csv = """ 
+date_week,rolling_count,rolling_count_rolling_min,rolling_count_rolling_max,rolling_count_rolling_sum,rolling_count_rolling_average +2022-01-03,2,2,2,2,2.0000000000000000 +2022-01-10,1,1,2,3,1.5000000000000000 +2022-01-17,3,1,3,6,2.0000000000000000 +2022-01-24,1,1,3,7,1.7500000000000000 +2022-01-31,1,1,3,8,1.6000000000000000 +2022-02-07,1,1,3,9,1.5000000000000000 +2022-02-14,1,1,3,10,1.4285714285714286 +""".lstrip() + +# seeds/rolling_count__expected.yml +if os.getenv('dbt_target') == 'bigquery': + rolling_count__expected_yml = """ +version: 2 +seeds: + - name: rolling_count__expected + config: + column_types: + date_week: date + rolling_count: INT64 + rolling_count_rolling_min: INT64 + rolling_count_rolling_max: INT64 + rolling_count_rolling_sum: INT64 + rolling_count_rolling_average: FLOAT64 +""".lstrip() +else: + rolling_count__expected_yml = """""" + +class TestRollingCount: + + # configuration in dbt_project.yml + # setting bigquery as table to get around query complexity + # resource constraints with compounding views + if os.getenv('dbt_target') == 'bigquery': + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "name": "example", + "models": {"+materialized": "table"} + } + else: + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "name": "example", + "models": {"+materialized": "view"} + } + + # install current repo as package + @pytest.fixture(scope="class") + def packages(self): + return { + "packages": [ + {"local": os.getcwd()} + ] + } + + # everything that goes in the "seeds" directory + @pytest.fixture(scope="class") + def seeds(self): + return { + "fact_orders_source.csv": fact_orders_source_csv, + "rolling_count__expected.csv": rolling_count__expected_csv, + "rolling_count__expected.yml": rolling_count__expected_yml + } + + # everything that goes in the "models" directory + @pytest.fixture(scope="class") + def models(self): + return { + "fact_orders.sql": fact_orders_sql, + "fact_orders.yml": 
fact_orders_yml, + "rolling_count.sql": rolling_count_sql, + "rolling_count.yml": rolling_count_yml + } + + def test_build_completion(self,project,): + # running deps to install package + results = run_dbt(["deps"]) + + # seed seeds + results = run_dbt(["seed"]) + assert len(results) == 2 + + # initial run + results = run_dbt(["run"]) + assert len(results) == 3 + + # test tests + results = run_dbt(["test"]) # expect passing test + assert len(results) == 1 + + # # # validate that the results include pass + result_statuses = sorted(r.status for r in results) + assert result_statuses == ["pass"] \ No newline at end of file