Skip to content

Commit

Permalink
Made prettify table work for de and en
Browse files Browse the repository at this point in the history
  • Loading branch information
PiaSchroeder committed Feb 25, 2024
1 parent 79e9239 commit ab75e8b
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 26 deletions.
28 changes: 28 additions & 0 deletions src/pystatis/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,34 @@
"zensus": re.compile(r"^\d{4}[A-Z]-\d{4}$"),
"regio": re.compile(r"^((\d{5}-.{1,2}($|-.*$))|(A.*$)|([0-9A-Z]{10}$))"),
}
# Column names of the raw ffcsv tables, keyed first by database name and then
# by response language. The table parsers look columns up through the logical
# keys ("time", "variable_label", ...) instead of hard-coding the header names.
COLUMN_NAME_DICT = {
    "genesis": {
        "de": {
            "time_label": "Zeit_Label",
            "time": "Zeit",
            "variable_code": "Auspraegung_Code",
            "variable_label": "Merkmal_Label",
            "variable_level": "Auspraegung_Label",
        },
        "en": {
            "time_label": "time_label",
            "time": "time",
            # NOTE(review): the ".1"/".2" suffixes look like pandas' renaming
            # of duplicated CSV headers -- confirm against an English response.
            "variable_code": "variable_code.1",
            "variable_label": "variable_label",
            "variable_level": "variable_code.2",
        },
    },
    "zensus": {
        # Only an English mapping is registered for this database.
        "en": {
            "time_label": "time_label",
            "time": "time",
            "variable_label": "variable_label",
            "variable_level": "variable_attribute_label",
            "value_label": "value_variable_label",
            "value": "value",
        },
    },
}

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# interpolation=None switches off ConfigParser's value interpolation, so
# characters such as "%" in config values are kept literally.
config = ConfigParser(interpolation=None)
Expand Down
63 changes: 37 additions & 26 deletions src/pystatis/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

import json
from io import StringIO
import re

import pandas as pd

from pystatis import db
from pystatis.config import COLUMN_NAME_DICT
from pystatis.http_helper import load_data


Expand Down Expand Up @@ -74,7 +76,7 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs):

if prettify:
self.data = self.prettify_table(
self.data, db.identify_db(self.name)[0]
data=self.data, db_name=db.identify_db(self.name)[0], language=params["language"]
)

metadata = load_data(endpoint="metadata", method="table", params=params)
Expand All @@ -84,7 +86,7 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs):
self.metadata = metadata

@staticmethod
def prettify_table(data: pd.DataFrame, db_name: str, language: str) -> pd.DataFrame:
    """Reformat the data into a more readable table.

    Dispatches to the parser that belongs to the given database.

    Args:
        data: Raw table to reformat.
        db_name: Name of the database the table comes from
            ("genesis", "zensus" or "regio").
        language: Language code, forwarded unchanged to the parser.

    Returns:
        The table produced by the matching parser, or ``data`` itself when
        ``db_name`` is none of the known database names.
    """
    if db_name == "genesis":
        return Table.parse_genesis_table(data, language)
    if db_name == "zensus":
        return Table.parse_zensus_table(data, language)
    if db_name == "regio":
        return Table.parse_regio_table(data, language)

    # Unknown database: hand the input back untouched.
    return data

@staticmethod
def parse_genesis_table(data: pd.DataFrame, language: str) -> pd.DataFrame:
    """Parse GENESIS table ffcsv format into a more readable format.

    Args:
        data: Raw table in ffcsv layout.
        language: Key into ``COLUMN_NAME_DICT["genesis"]`` selecting the
            column names to look for.

    Returns:
        A table made of the time column, the variable-level columns and the
        value columns, with rows dropped in which every cell is missing.
    """
    names = COLUMN_NAME_DICT["genesis"][language]

    # The time column is named after the LAST entry of the time-label column.
    time_header = data[names["time_label"]].iloc[-1]
    time_frame = pd.DataFrame({time_header: data[names["time"]]})

    # The variable-level columns are renamed to the entries found in the
    # last row of the variable-label columns.
    level_frame = data.filter(like=names["variable_level"])
    level_headers = data.filter(like=names["variable_label"]).iloc[-1].tolist()
    level_frame.columns = level_headers

    # Value columns are those whose name contains a double underscore.
    value_frame = data.filter(like="__")

    # A name like BEV036__Bevoelkerung_in_Hauptwohnsitzhaushalten__1000 is
    # reduced to its readable middle part; code and unit are dropped. Runs of
    # two or more underscores count as one separator.
    value_frame.columns = [
        re.split(r"_{2,}", column)[1] for column in value_frame.columns
    ]

    combined = pd.concat([time_frame, level_frame, value_frame], axis=1)
    return combined.dropna(axis=0, how="all")

@staticmethod
def parse_zensus_table(data: pd.DataFrame, language: str) -> pd.DataFrame:
    """Parse Zensus table ffcsv format into a more readable format.

    Args:
        data: Raw table in ffcsv layout.
        language: Language code used to pick the column-name mapping from
            ``COLUMN_NAME_DICT["zensus"]``. When no mapping is registered
            for the language, the English mapping is used.

    Returns:
        A table made of the time column, the variable-level columns and a
        single value column, with rows dropped in which every cell is
        missing.
    """
    zensus_columns = COLUMN_NAME_DICT["zensus"]
    # Honour the requested language instead of silently ignoring it; fall
    # back to English so languages without a mapping behave as before.
    column_name_dict = zensus_columns.get(language, zensus_columns["en"])

    # The time column is named after the LAST entry of the time-label column.
    time_header = data[column_name_dict["time_label"]].iloc[-1]
    time = pd.DataFrame({time_header: data[column_name_dict["time"]]})

    # The variable-level columns are renamed to the entries found in the
    # last row of the numbered variable-label columns. The "<digits>_" prefix
    # keeps other columns whose name merely contains the label name out of
    # the match; re.escape makes sure the name is matched literally.
    attributes = data.filter(like=column_name_dict["variable_level"])
    label_pattern = r"\d+_" + re.escape(column_name_dict["variable_label"])
    attributes.columns = data.filter(regex=label_pattern).iloc[-1].tolist()

    # The single value column is named after the LAST entry of the
    # value-label column.
    value_header = data[column_name_dict["value_label"]].iloc[-1]
    values = pd.DataFrame({value_header: data[column_name_dict["value"]]})

    pretty_data = pd.concat([time, attributes, values], axis=1)
    return pretty_data.dropna(axis=0, how="all")

@staticmethod
def parse_regio_table(data: pd.DataFrame, language: str) -> pd.DataFrame:
    """Parse Regionalstatistik table ffcsv format into a more readable format.

    Args:
        data: Raw table in ffcsv layout.
        language: Key into ``COLUMN_NAME_DICT["genesis"]``; this parser reads
            the GENESIS column-name mapping.

    Returns:
        A table made of the time column, the variable-level columns, the
        matching code columns and the value columns, with rows dropped in
        which every cell is missing.
    """
    names = COLUMN_NAME_DICT["genesis"][language]

    # The time column is named after the LAST entry of the time-label column.
    time_header = data[names["time_label"]].iloc[-1]
    time_frame = pd.DataFrame({time_header: data[names["time"]]})

    # Entries of the last row of the variable-label columns; they name both
    # the variable-level columns and the code columns.
    level_frame = data.filter(like=names["variable_level"])
    variable_labels = data.filter(like=names["variable_label"]).iloc[-1].tolist()
    level_frame.columns = variable_labels

    # Code columns get the same names with a " (Code)" suffix.
    code_frame = data.filter(like=names["variable_code"])
    code_frame.columns = [label + " (Code)" for label in variable_labels]

    # Value columns are those whose name contains a double underscore.
    value_frame = data.filter(like="__")

    # A name like BEV036__Bevoelkerung_in_Hauptwohnsitzhaushalten__1000 is
    # reduced to its readable middle part; code and unit are dropped. Runs of
    # two or more underscores count as one separator.
    value_frame.columns = [
        re.split(r"_{2,}", column)[1] for column in value_frame.columns
    ]

    combined = pd.concat(
        [time_frame, level_frame, code_frame, value_frame], axis=1
    )
    return combined.dropna(axis=0, how="all")

0 comments on commit ab75e8b

Please sign in to comment.