Skip to content

Commit

Permalink
Feat/19 improve readability of the table format (#42)
Browse files Browse the repository at this point in the history
* Reformatting the raw data tables for readability

* Adding comments

* Applied suggested changes and ran code formatting

* add tests for Table

---------

Co-authored-by: Michael Aydinbas <michael.aydinbas@new-work.se>
  • Loading branch information
zosiaboro and pmayd committed Jan 30, 2024
1 parent 75fff7b commit 0e2f0a2
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 4 deletions.
37 changes: 36 additions & 1 deletion src/pystatis/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@ def __init__(self, name: str):
self.data = pd.DataFrame()
self.metadata: dict = {}

def get_data(self, area: str = "all", **kwargs):
def get_data(self, area: str = "all", prettify: bool = True, **kwargs):
"""Downloads raw data and metadata from GENESIS-Online.
Additional keyword arguments are passed on to the GENESIS-Online GET request for tablefile.
Args:
area (str, optional): Area to search for the object in GENESIS-Online. Defaults to "all".
prettify (bool, optional): Reformats the table into a readable format. Defaults to True.
"""
params = {"name": self.name, "area": area, "format": "ffcsv"}

Expand All @@ -38,12 +39,46 @@ def get_data(self, area: str = "all", **kwargs):
endpoint="data", method="tablefile", params=params, as_json=False
)
assert isinstance(raw_data, str) # nosec assert_used

self.raw_data = raw_data
data_str = StringIO(raw_data)
self.data = pd.read_csv(data_str, sep=";")

if prettify:
self.data = self.prettify_table(self.data)

metadata = load_data(
endpoint="metadata", method="table", params=params, as_json=True
)
assert isinstance(metadata, dict) # nosec assert_used

self.metadata = metadata

@staticmethod
def prettify_table(data: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw flat-file dataframe into a compact, readable table.

    Args:
        data (pd.DataFrame): Dataframe parsed from raw_data. It must contain
            the columns "Zeit_Label" and "Zeit" and at least one row, because
            the new column names are read from the first row.

    Returns:
        pd.DataFrame: The time column, the attribute columns and the value
            columns placed side by side, without the Code columns and with
            informative column names.
    """
    # The time column is named after the first entry of Zeit_Label.
    time_label = data["Zeit_Label"].iloc[0]
    time_column = data["Zeit"].to_frame(name=time_label)

    # The first row of the Merkmal_Label columns supplies the new names for
    # the Auspraegung_Label columns; both are matched up by position.
    attribute_names = data.filter(like="Merkmal_Label").iloc[0].tolist()
    attribute_columns = data.filter(like="Auspraegung_Label")
    attribute_columns.columns = attribute_names

    # Value columns are the ones whose name contains a double underscore,
    # e.g. BEV036__Bevoelkerung_in_Hauptwohnsitzhaushalten__1000. Only the
    # middle part (the readable label) is kept; code and unit are dropped.
    value_columns = data.filter(like="__")
    value_columns.columns = [
        raw_name.split("__")[1] for raw_name in value_columns.columns
    ]

    return pd.concat([time_column, attribute_columns, value_columns], axis=1)
1 change: 0 additions & 1 deletion tests/test_config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import copy
import os
from configparser import ConfigParser
from pathlib import Path
Expand Down
2 changes: 0 additions & 2 deletions tests/test_db.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import logging
from configparser import ConfigParser

import pytest

from pystatis import config, db
from pystatis.exception import PystatisConfigError


@pytest.fixture()
Expand Down
57 changes: 57 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pandas as pd
import pytest

import pystatis

# Semicolon-separated sample table used as the canned response that
# mocked_load_data returns for endpoint="data" / method="tablefile".
# It consists of one header line followed by 17 data rows with 10 fields each.
EASY_TABLE = """Statistik_Code;Statistik_Label;Zeit_Code;Zeit_Label;Zeit;1_Merkmal_Code;1_Merkmal_Label;1_Auspraegung_Code;1_Auspraegung_Label; FLC006__Gebietsflaeche__qkm
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;08;Baden-Württemberg;35747,85
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;09;Bayern;70541,58
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;11;Berlin;891,12
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;12;Brandenburg;29654,38
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;04;Bremen;419,61
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;02;Hamburg;755,09
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;06;Hessen;21115,62
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;13;Mecklenburg-Vorpommern;23294,90
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;03;Niedersachsen;47709,90
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;05;Nordrhein-Westfalen;34112,72
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;07;Rheinland-Pfalz;19857,97
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;10;Saarland;2571,52
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;14;Sachsen;18449,86
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;15;Sachsen-Anhalt;20467,20
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;01;Schleswig-Holstein;15804,30
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;16;Thüringen;16202,37
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;;Insgesamt;357595,99"""


def mocked_load_data(endpoint, method, params, as_json):
    """Stand-in for pystatis.table.load_data that returns canned responses.

    Args:
        endpoint: Requested endpoint; "data" and "metadata" are supported.
        method: Requested method; "tablefile" (for "data") and "table"
            (for "metadata") are supported.
        params: Accepted to mirror the call signature; not used.
        as_json: Accepted to mirror the call signature; not used.

    Returns:
        EASY_TABLE for the data/tablefile request, or a small metadata dict
        for the metadata/table request.

    Raises:
        NotImplementedError: For any other endpoint/method combination.
    """
    request = (endpoint, method)

    if request == ("data", "tablefile"):
        return EASY_TABLE

    if request == ("metadata", "table"):
        return {"metadata": "table"}

    raise NotImplementedError


def test_get_data(monkeypatch):
    """get_data with prettify=False stores the parsed table and the raw text."""
    # Swap pystatis.table.load_data for the mock, which answers the
    # endpoint="data" / method="tablefile" request with EASY_TABLE.
    monkeypatch.setattr(pystatis.table, "load_data", mocked_load_data)

    tbl = pystatis.Table(name="11111-0001")
    tbl.get_data(prettify=False)

    # Parsed dataframe: all 17 rows and all 10 columns of the fixture.
    frame = tbl.data
    assert frame.shape == (17, 10)
    assert isinstance(frame, pd.DataFrame)
    assert not frame.empty

    # The unparsed response text is kept alongside the dataframe.
    raw_text = tbl.raw_data
    assert isinstance(raw_text, str)
    assert raw_text != ""


def test_prettify(monkeypatch):
    """get_data with prettify=True reduces the fixture to three named columns."""
    monkeypatch.setattr(pystatis.table, "load_data", mocked_load_data)

    tbl = pystatis.Table(name="11111-0001")
    tbl.get_data(prettify=True)

    # Time label, attribute label and value label, in that order.
    expected_columns = ["Stichtag", "Bundesländer", "Gebietsflaeche"]

    assert tbl.data.shape == (17, 3)
    assert tbl.data.columns.to_list() == expected_columns

0 comments on commit 0e2f0a2

Please sign in to comment.