Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Contracts: Handle struct column specified both at root and nested levels + arrays of structs #806

Merged
merged 11 commits into from
Jul 11, 2023
7 changes: 7 additions & 0 deletions .changes/unreleased/Fixes-20230630-213112.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
kind: Fixes
body: 'Contracts: Handle struct column specified both at root and nested levels +
arrays of structs'
time: 2023-06-30T21:31:12.63257-04:00
custom:
Author: michelleark
Issue: 781 782
45 changes: 40 additions & 5 deletions dbt/adapters/bigquery/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from google.cloud.bigquery import SchemaField

_PARENT_DATA_TYPE_KEY = "__parent_data_type"

Self = TypeVar("Self", bound="BigQueryColumn")


Expand Down Expand Up @@ -215,15 +217,34 @@ def _update_nested_column_data_types(

if len(column_name_parts) == 1:
# Base case: column is not nested - store its data_type concatenated with constraint if provided.
nested_column_data_types[root_column_name] = (
column_data_type_and_constraints = (
column_data_type
if column_rendered_constraint is None
else f"{column_data_type} {column_rendered_constraint}"
)

if existing_nested_column_data_type := nested_column_data_types.get(root_column_name):
assert isinstance(existing_nested_column_data_type, dict) # keeping mypy happy
# entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY"
existing_nested_column_data_type.update(
{_PARENT_DATA_TYPE_KEY: column_data_type_and_constraints}
)
else:
nested_column_data_types.update({root_column_name: column_data_type_and_constraints})
else:
# Initialize nested dictionary
if root_column_name not in nested_column_data_types:
nested_column_data_types[root_column_name] = {}
parent_data_type = nested_column_data_types.get(root_column_name)
if isinstance(parent_data_type, dict):
# nested dictionary already initialized
pass
elif parent_data_type is None:
# initialize nested dictionary
nested_column_data_types.update({root_column_name: {}})
else:
# a parent specified its base type -- preserve its data_type and potential rendered constraints
# this is used to specify a top-level 'struct' or 'array' field with its own description, constraints, etc
nested_column_data_types.update(
{root_column_name: {_PARENT_DATA_TYPE_KEY: parent_data_type}}
)

# Recursively process rest of remaining column name
remaining_column_name = ".".join(column_name_parts[1:])
Expand Down Expand Up @@ -252,8 +273,22 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str,
if isinstance(unformatted_nested_data_type, str):
return unformatted_nested_data_type
else:
parent_data_type, *parent_constraints = unformatted_nested_data_type.pop(
_PARENT_DATA_TYPE_KEY, ""
).split() or [None]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need the `or [None]` here? I thought I tried this in a toy example where `.pop()` returned `""` and it still worked as anticipated, but I might have missed some edge case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately it is needed :( Without the `or [None]`, a couple of unit tests broke with the following error:

>>> foo, *bar = "".split()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ValueError: not enough values to unpack (expected at least 1, got 0)


formatted_nested_types = [
f"{column_name} {_format_nested_data_type(column_type)}"
for column_name, column_type in unformatted_nested_data_type.items()
]
return f"""struct<{", ".join(formatted_nested_types)}>"""

formatted_nested_type = f"""struct<{", ".join(formatted_nested_types)}>"""

if parent_data_type and parent_data_type.lower() == "array":
formatted_nested_type = f"""array<{formatted_nested_type}>"""

if parent_constraints:
parent_constraints = " ".join(parent_constraints)
formatted_nested_type = f"""{formatted_nested_type} {parent_constraints}"""

return formatted_nested_type
71 changes: 71 additions & 0 deletions tests/unit/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,51 @@
{"b.nested": "not null"},
{"b": {"name": "b", "data_type": "struct<nested string not null>"}},
),
# Single nested column, 1 level - with corresponding parent column
(
{
"b": {"name": "b", "data_type": "struct"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
None,
{"b": {"name": "b", "data_type": "struct<nested string>"}},
),
# Single nested column, 1 level - with corresponding parent column specified last
(
{
"b.nested": {"name": "b.nested", "data_type": "string"},
"b": {"name": "b", "data_type": "struct"},
},
None,
{"b": {"name": "b", "data_type": "struct<nested string>"}},
),
# Single nested column, 1 level - with corresponding parent column + parent constraint
(
{
"b": {"name": "b", "data_type": "struct"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
{"b": "not null"},
{"b": {"name": "b", "data_type": "struct<nested string> not null"}},
),
# Single nested column, 1 level - with corresponding parent column as array
(
{
"b": {"name": "b", "data_type": "array"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
None,
{"b": {"name": "b", "data_type": "array<struct<nested string>>"}},
),
# Single nested column, 1 level - with corresponding parent column as array + constraint
(
{
"b": {"name": "b", "data_type": "array"},
"b.nested": {"name": "b.nested", "data_type": "string"},
},
{"b": "not null"},
{"b": {"name": "b", "data_type": "array<struct<nested string>> not null"}},
),
# Multiple nested columns, 1 level
(
{
Expand Down Expand Up @@ -128,6 +173,32 @@
},
},
),
# Nested columns, multiple levels - with parent arrays and constraints!
(
{
"b.user.names": {
"name": "b.user.names",
"data_type": "array",
},
"b.user.names.first": {
"name": "b.user.names.first",
"data_type": "string",
},
"b.user.names.last": {
"name": "b.user.names.last",
"data_type": "string",
},
"b.user.id": {"name": "b.user.id", "data_type": "int64"},
"b.user.country": {"name": "b.user.country", "data_type": "string"},
},
{"b.user.names.first": "not null", "b.user.id": "unique"},
{
"b": {
"name": "b",
"data_type": "struct<user struct<names array<struct<first string not null, last string>>, id int64 unique, country string>>",
},
},
),
],
)
def test_get_nested_column_data_types(columns, constraints, expected_nested_columns):
Expand Down