Skip to content

Commit

Permalink
[FEATURE]: DataProfilerStructuredDataAssistant Float Rule (#7842)
Browse files Browse the repository at this point in the history
Co-authored-by: iniyam <iniyam@gmail.com>
Co-authored-by: iniyam <62399852+iniyam@users.noreply.github.com>
Co-authored-by: Alex Sherstinsky <alexsherstinsky@users.noreply.github.com>
  • Loading branch information
4 people authored May 18, 2023
1 parent cab08d2 commit c3629d7
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,11 @@ def get_rules(self) -> Optional[List[Rule]]:
Optional custom list of "Rule" objects implementing particular "DataAssistant" functionality.
"""
numeric_rule: Rule = self._build_numeric_rule()
float_rule: Rule = self._build_float_rule()

return [
numeric_rule,
float_rule,
]

def _build_data_assistant_result(
Expand Down Expand Up @@ -196,3 +198,114 @@ def _build_numeric_rule() -> Rule:
)

return rule

@staticmethod
def _build_float_rule() -> Rule:
    """
    Assemble the "float_rule" "Rule" object, which emits "ExpectationConfiguration" objects for column "Domain" type.

    The rule reads the "data_profiler.column_profile_report" metric (single batch) for each column and uses values
    from that report to configure four expectations: "expect_column_min_to_be_between",
    "expect_column_max_to_be_between", "expect_column_mean_to_be_between", and "expect_column_stdev_to_be_between".

    Domains are produced by "DataProfilerColumnDomainBuilder", so at present this rule does not discriminate by
    column data type when it is applied.  Further rules (e.g., timestamp_rule, text_rule, categorical_rule) are
    planned.

    Returns:
        "Rule" object named "float_rule", carrying its default variables ("strict_min", "strict_max", "profile_path").
    """
    column_domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder()

    # One parameter builder serves both purposes: it is registered on the rule (metrics) and is serialized into
    # the validation parameter builder configs handed to every expectation configuration builder below.
    profile_report_parameter_builder: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.build_metric_single_batch_parameter_builder(
        metric_name="data_profiler.column_profile_report",
        suffix=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs={
            "profile_path": f"{VARIABLES_KEY}profile_path",
        },
    )

    validation_parameter_builder_configs: Optional[List[ParameterBuilderConfig]] = [
        ParameterBuilderConfig(
            **profile_report_parameter_builder.to_json_dict(),
        ),
    ]

    # Fully-qualified parameter references shared by all four expectation configuration builders.
    serialized_parameter_name = (
        profile_report_parameter_builder.json_serialized_fully_qualified_parameter_name
    )
    column_reference = f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column"
    report_value_reference = f"{serialized_parameter_name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}"
    report_details_reference = f"{serialized_parameter_name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}"

    strict_min_reference = f"{VARIABLES_KEY}strict_min"
    strict_max_reference = f"{VARIABLES_KEY}strict_max"

    # NOTE(review): every bound below is read from the "statistics.precision" section of the profile report --
    # confirm that these are the intended statistics for column min/max/mean/stdev bounds.

    # Lower bound only.
    min_expectation_builder: ExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_min_to_be_between",
        validation_parameter_builder_configs=validation_parameter_builder_configs,
        column=column_reference,
        min_value=f"{report_value_reference}.statistics.precision.min",
        max_value=None,
        strict_min=strict_min_reference,
        strict_max=None,
        meta={
            "profiler_details": report_details_reference,
        },
    )

    # Upper bound only.
    max_expectation_builder: ExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_max_to_be_between",
        validation_parameter_builder_configs=validation_parameter_builder_configs,
        column=column_reference,
        min_value=None,
        max_value=f"{report_value_reference}.statistics.precision.max",
        strict_min=None,
        strict_max=strict_max_reference,
        meta={
            "profiler_details": report_details_reference,
        },
    )

    # Both bounds reference the same report value.
    mean_expectation_builder: ExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_mean_to_be_between",
        validation_parameter_builder_configs=validation_parameter_builder_configs,
        column=column_reference,
        min_value=f"{report_value_reference}.statistics.precision.mean",
        max_value=f"{report_value_reference}.statistics.precision.mean",
        strict_min=strict_min_reference,
        strict_max=strict_max_reference,
        meta={
            "profiler_details": report_details_reference,
        },
    )

    # Both bounds reference the same report value.
    stdev_expectation_builder: ExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_stdev_to_be_between",
        validation_parameter_builder_configs=validation_parameter_builder_configs,
        column=column_reference,
        min_value=f"{report_value_reference}.statistics.precision.std",
        max_value=f"{report_value_reference}.statistics.precision.std",
        strict_min=strict_min_reference,
        strict_max=strict_max_reference,
        meta={
            "profiler_details": report_details_reference,
        },
    )

    # Defaults for the "$variables" references used above.
    rule_variables: dict = {
        "strict_min": False,
        "strict_max": False,
        "profile_path": "default_profiler_path",
    }

    return Rule(
        name="float_rule",
        variables=rule_variables,
        domain_builder=column_domain_builder,
        parameter_builders=[
            profile_report_parameter_builder,
        ],
        expectation_configuration_builders=[
            min_expectation_builder,
            max_expectation_builder,
            mean_expectation_builder,
            stdev_expectation_builder,
        ],
    )
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import unittest
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, cast
from unittest import mock

Expand Down Expand Up @@ -50,9 +51,23 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena
"data_connector_query": {"index": -1},
}
exclude_column_names = [
"vendor_id",
"pickup_datetime",
"dropoff_datetime",
"passenger_count",
# "trip_distance",
"rate_code_id",
"store_and_fwd_flag",
"pickup_location_id",
"dropoff_location_id",
"payment_type",
# "fare_amount",
# "extra",
# "mta_tax",
# "tip_amount",
# "tolls_amount",
# "improvement_surcharge",
# "total_amount",
"congestion_surcharge",
]

Expand All @@ -65,6 +80,13 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena
"profile.pkl",
),
},
float_rule={
"profile_path": Path(
test_root_path,
"data_profiler_files",
"profile.pkl",
),
},
exclude_column_names=exclude_column_names,
estimation="flag_outliers",
)
Expand All @@ -88,9 +110,23 @@ def bobby_profile_data_profiler_structured_data_assistant_result(
}

exclude_column_names = [
"vendor_id",
"pickup_datetime",
"dropoff_datetime",
"passenger_count",
# "trip_distance",
"rate_code_id",
"store_and_fwd_flag",
"pickup_location_id",
"dropoff_location_id",
"payment_type",
# "fare_amount",
# "extra",
# "mta_tax",
# "tip_amount",
# "tolls_amount",
# "improvement_surcharge",
# "total_amount",
"congestion_surcharge",
]

Expand All @@ -104,6 +140,13 @@ def bobby_profile_data_profiler_structured_data_assistant_result(
"profile.pkl",
),
},
float_rule={
"profile_path": os.path.join( # noqa: PTH118
test_root_path,
"data_profiler_files",
"profile.pkl",
),
},
estimation="flag_outliers",
)

Expand All @@ -130,7 +173,7 @@ def test_profile_data_profiler_structured_data_assistant_result_serialization(
len(
bobby_profile_data_profiler_structured_data_assistant_result.profiler_config.rules
)
== 1
== 2
)


Expand Down Expand Up @@ -193,8 +236,7 @@ def test_profile_data_profiler_structured_data_assistant_metrics_count(
bobby_profile_data_profiler_structured_data_assistant_result.metrics_by_domain.items()
):
num_metrics += len(parameter_values_for_fully_qualified_parameter_names)

assert num_metrics == 28
assert num_metrics == 32


@pytest.mark.integration
Expand Down

0 comments on commit c3629d7

Please sign in to comment.