Skip to content

Commit

Permalink
Contracts: Handle struct column specified both at root and nested levels + arrays of structs
Browse files Browse the repository at this point in the history
  • Loading branch information
MichelleArk committed Jul 1, 2023
1 parent e3d34f8 commit bac87f5
Showing 1 changed file with 33 additions and 2 deletions.
35 changes: 33 additions & 2 deletions dbt/adapters/bigquery/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from google.cloud.bigquery import SchemaField

# Reserved dictionary key under which a parent column's own data type (plus any
# rendered constraints) is stored inside its nested-column mapping, alongside
# the entries for its child fields.
_PARENT_DATA_TYPE_KEY = "__parent_data_type"

# Type variable bound to BigQueryColumn.
Self = TypeVar("Self", bound="BigQueryColumn")


Expand Down Expand Up @@ -215,15 +217,29 @@ def _update_nested_column_data_types(

if len(column_name_parts) == 1:
# Base case: column is not nested - store its data_type concatenated with constraint if provided.
nested_column_data_types[root_column_name] = (
column_data_type_and_constraints = (
column_data_type
if column_rendered_constraint is None
else f"{column_data_type} {column_rendered_constraint}"
)
if root_column_name not in nested_column_data_types:
nested_column_data_types[root_column_name] = column_data_type_and_constraints
else:
# entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY"
existing_nested_column_data_type = nested_column_data_types[root_column_name]
assert isinstance(existing_nested_column_data_type, dict) # keeping mypy happy
existing_nested_column_data_type[
_PARENT_DATA_TYPE_KEY
] = column_data_type_and_constraints
else:
# Initialize nested dictionary
if root_column_name not in nested_column_data_types:
nested_column_data_types[root_column_name] = {}
elif not isinstance(nested_column_data_types[root_column_name], dict):
# a parent specified its base type -- preserve its data_type and potential rendered constraints
# this is used to specify a top-level 'struct' or 'array' field with its own description, constraints, etc
parent_data_type = nested_column_data_types[root_column_name]
nested_column_data_types[root_column_name] = {_PARENT_DATA_TYPE_KEY: parent_data_type}

# Recursively process rest of remaining column name
remaining_column_name = ".".join(column_name_parts[1:])
Expand Down Expand Up @@ -252,8 +268,23 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str,
if isinstance(unformatted_nested_data_type, str):
return unformatted_nested_data_type
else:
parent_data_type = unformatted_nested_data_type.pop(_PARENT_DATA_TYPE_KEY, None)
parent_constraints = None
if parent_data_type:
parent_data_type_flat = parent_data_type.split()
if len(parent_data_type_flat) > 1:
parent_data_type = parent_data_type_flat[0]
parent_constraints = " ".join(parent_data_type_flat[1:])

formatted_nested_types = [
f"{column_name} {_format_nested_data_type(column_type)}"
for column_name, column_type in unformatted_nested_data_type.items()
]
return f"""struct<{", ".join(formatted_nested_types)}>"""

formatted_nested_type = f"""struct<{", ".join(formatted_nested_types)}>"""
if parent_data_type and parent_data_type.lower() == "array":
formatted_nested_type = f"""array<{formatted_nested_type}>"""
if parent_constraints:
formatted_nested_type = f"""{formatted_nested_type} {parent_constraints}"""

return formatted_nested_type

0 comments on commit bac87f5

Please sign in to comment.