Skip to content

Commit

Permalink
add __q quality columns to Zensus tables when quality='on'
Browse files Browse the repository at this point in the history
  • Loading branch information
pmayd committed Jun 29, 2024
1 parent 97aa825 commit 9caf283
Show file tree
Hide file tree
Showing 35 changed files with 477 additions and 432,558 deletions.
2 changes: 2 additions & 0 deletions src/pystatis/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
"value_variable_label": "value_variable_label",
"value": "value",
"value_unit": "value_unit",
"value_q": "value_q",
"ars": "Amtlicher Regionalschlüssel (ARS)",
},
"en": {
Expand All @@ -73,6 +74,7 @@
"value_variable_label": "value_variable_label",
"value": "value",
"value_unit": "value_unit",
"value_q": "value_q",
"ars": "Official regional key (ARS)",
},
},
Expand Down
38 changes: 28 additions & 10 deletions src/pystatis/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,38 +219,56 @@ def parse_genesis_and_regio_table(data: pd.DataFrame, language: str) -> pd.DataF
def parse_zensus_table(data: pd.DataFrame, language: str) -> pd.DataFrame:
"""Parse Zensus table ffcsv format into a more readable format"""
column_name_dict = LANG_TO_COL_MAPPING["zensus"][language]
value_variable_label_col = column_name_dict["value_variable_label"]
value_unit_col = column_name_dict["value_unit"]
value_col = column_name_dict["value"]
time_label_col = column_name_dict["time_label"]
ars_label_code = column_name_dict["ars"]
time_col = column_name_dict["time"]
time_label_col = column_name_dict["time_label"]
value_col = column_name_dict["value"]
value_q_col = column_name_dict["value_q"]
value_unit_col = column_name_dict["value_unit"]
value_variable_label_col = column_name_dict["value_variable_label"]
variable_attribute_label_col = column_name_dict["variable_attribute_label"]
variable_label_col = column_name_dict["variable_label"]
ars_label_code = column_name_dict["ars"]

# quality columns are not yet supported for Zensus tables
if "value_q" in data.columns:
data = data.drop(columns=["value_q"])
warnings.warn("Quality columns are not supported for Zensus tables.", UserWarning)
quality = False
if value_q_col in data.columns:
quality = True

# add the unit to the column names for the value columns
data[value_variable_label_col] = data[value_variable_label_col].str.cat(
data[value_unit_col].fillna("Unknown_Unit"), sep="__"
)

if quality:
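# With quality='on' the data contains an additional quality column (value_q).
# To keep pivoting on a single value column, pack each value together with
# its quality flag as a [value, quality] pair; the pairs are split back into
# separate value and "__q" columns after the pivot.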
data[value_col] = [[v, q] for v, q in zip(data[value_col], data[value_q_col])]
data = data.drop(columns=[value_q_col])

pivot_table = data.pivot(
index=data.columns[:-4].to_list(),
index=[col for col in data.columns if col not in data.filter(regex=r"^value").columns],
columns=value_variable_label_col,
values=value_col,
)

if quality:
for col in pivot_table.columns:
pivot_table.insert(
pivot_table.columns.to_list().index(col) + 1,
col + "__q",
pivot_table[col].apply(lambda x: x[1]),
)
pivot_table[col] = pivot_table[col].apply(lambda x: x[0])

value_columns = pivot_table.columns.to_list()
pivot_table.reset_index(inplace=True)
pivot_table.columns.name = None

time_label = data[time_label_col].iloc[0]
time = pd.DataFrame({time_label: pivot_table[time_col]})

# Some tables of Zensus can have a regional code (AGS) as first attribute
# If AGS column is present, add it to the final output
ags_code = None
pos_of_ags_col = np.where(data.iloc[0].isin(config.ZENSUS_AGS_CODES))[0]
if pos_of_ags_col.size > 0:
Expand Down
12,900 changes: 24 additions & 12,876 deletions tests/cassettes/1000A-0001.yaml

Large diffs are not rendered by default.

1,826 changes: 8 additions & 1,818 deletions tests/cassettes/11111-02-01-4.yaml

Large diffs are not rendered by default.

146 changes: 0 additions & 146 deletions tests/cassettes/12111-01-01-4.yaml

This file was deleted.

Loading

0 comments on commit 9caf283

Please sign in to comment.