Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/34 visualization examples #48

Merged
merged 6 commits into from
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,809 changes: 3,809 additions & 0 deletions nb/02_Geo_visualization_example.ipynb

Large diffs are not rendered by default.

168 changes: 167 additions & 1 deletion nb/presentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@
# %% [markdown]
# If you are interested in a specific object, you can run `results.tables`, `results.statistics`, or `results.variables` directly.

# %% [markdown]
#

# %%
# Inspect only the `tables` attribute of `results`.
results.tables

Expand Down Expand Up @@ -162,7 +165,170 @@
# ## Jonas

# %% [markdown]
#
# Case study: international students in Germany
# - time evolution
# - regional differences (at the level of federal states)

# %%
# TODO: decide whether geopandas and matplotlib should become poetry dependencies; until then install them manually:
# conda install geopandas
# conda install matplotlib

import geopandas

# %%
import pandas as pd
from matplotlib import pyplot as plt

# %% [markdown]
# ### load data from Regionalstatistik

# %%
# Table 21311-01-01-4 is the data source for this case study.
students = pystatis.Table(name="21311-01-01-4")

# %%
# Fetch the table contents, passing 2015 as the start year.
students.get_data(startyear=2015)

# %% [markdown]
# ### set proper column types

# %%
# The region code is handled as text from here on (the padding below relies on it).
code_column = "Kreise und kreisfreie Städte_Code"
students.data[code_column] = students.data[code_column].astype(str)
students.data[code_column]

# %%
# Prefix codes of at most one character with "0" (e.g. "1" -> "01");
# longer codes are left untouched.
students.data[code_column] = students.data[code_column].apply(
    lambda code: code if len(code) > 1 else "0" + code
)
students.data[code_column]

# %%
students.data

# %% [markdown]
# ### determine ratio of international students per year and region

# %%
# Keep only the rows where gender and subject group are both "Insgesamt".
is_total = (students.data.Geschlecht == "Insgesamt") & (
    students.data["Fächergruppe (mit Insgesamt)"] == "Insgesamt"
)


def _second_over_first(counts):
    """Return the second value divided by the first one of a group.

    Groups that do not hold exactly three non-null values yield ``None``.

    NOTE(review): this presumably relies on the row order inside each group
    (total first, international students second) -- confirm against the table.
    """
    if counts.count() != 3:
        return None
    return counts.iloc[1] / counts.iloc[0]


# One ratio per (region name, region code, semester) combination.
ratio_international = (
    students.data[is_total]
    .groupby(
        by=[
            "Kreise und kreisfreie Städte",
            "Kreise und kreisfreie Städte_Code",
            "Semester",
        ]
    )["Studierende_(im_Kreisgebiet)"]
    .apply(_second_over_first)
    .rename("ratio_international")
    .to_frame()
)

# The year is read from characters 3 to 6 of the semester label.
ratio_international["year"] = [
    int(label[3:7])
    for label in ratio_international.index.get_level_values("Semester")
]

ratio_international

# %%
ratio_international.loc[
    ratio_international.index.get_level_values(0) == " Bayern"
]

# %% [markdown]
# ### plot time evolution

# %%
# Regions whose ratio is drawn over time, one line per region.
regions_to_plot = [
    "Deutschland",
    " Baden-Württemberg",
    " Bayern",
    " Nordrhein-Westfalen",
    " Thüringen",
    " Sachsen",
    " Niedersachsen",
    " Schleswig-Holstein",
    " Berlin",
]

# First index level holds the region name.
region_names = ratio_international.index.get_level_values(0)

for region in regions_to_plot:
    region_rows = ratio_international[region_names == region]
    plt.plot(
        region_rows.year,
        region_rows.ratio_international,
        label=region,
    )
plt.legend()

# %% [markdown]
# ### load shape file

# %%

# Read the shapefile into a GeoDataFrame.
path_to_data = "vg2500_12-31.utm32s.shape/vg2500/VG2500_LAN.shp"
gdf = geopandas.read_file(path_to_data)


# %%
# Store the geometry area of every row as a regular column.
gdf["area"] = gdf.area

# %%
# Quick visual check: colour each shape by its area.
gdf.plot(column="area", legend=True)

# %%
gdf["GEN"]

# %%
# The AGS key is compared against string codes later, so cast it to str.
gdf["AGS"] = gdf["AGS"].astype(str)

# %% [markdown]
# ### merge with geodataframe and plot

# %%
year = 2015

# Join the geometries with the ratios of the selected year via the region code.
ratios_of_year = ratio_international.loc[ratio_international.year == year]
gdf_merged = pd.merge(
    left=gdf,
    right=ratios_of_year,
    left_on="AGS",
    right_on="Kreise und kreisfreie Städte_Code",
)
gdf_merged.ratio_international

# %%
# Shapes without a ratio value are drawn in light grey.
gdf_merged.plot(
    column="ratio_international",
    legend=True,
    missing_kwds={"color": "lightgrey"},
)

# %%
# Map of the ratio of international students for 2018.
# `year` has to be reassigned here because the filter below reads it.
year = 2018
gdf_merged = pd.merge(
    left=gdf,
    right=ratio_international[ratio_international.year == year],
    left_on="AGS",
    right_on="Kreise und kreisfreie Städte_Code",
)
gdf_merged.ratio_international
# Shapes without a ratio value are drawn in light grey.
gdf_merged.plot(
    "ratio_international", legend=True, missing_kwds={"color": "lightgrey"}
)

# %%
# Map of the ratio of international students for 2021.
# `year` has to be reassigned here because the filter below reads it.
year = 2021
gdf_merged = pd.merge(
    left=gdf,
    right=ratio_international[ratio_international.year == year],
    left_on="AGS",
    right_on="Kreise und kreisfreie Städte_Code",
)
gdf_merged.ratio_international
# Shapes without a ratio value are drawn in light grey.
gdf_merged.plot(
    "ratio_international", legend=True, missing_kwds={"color": "lightgrey"}
)

# %% [markdown]
# ## Outlook
Expand Down
12 changes: 10 additions & 2 deletions src/pystatis/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs):

self.raw_data = raw_data
data_str = StringIO(raw_data)
self.data = pd.read_csv(data_str, sep=";")
self.data = pd.read_csv(
data_str, sep=";", na_values=["...", ".", "-", "/", "x"]
)

if prettify:
self.data = self.prettify_table(
Expand Down Expand Up @@ -133,12 +135,18 @@ def parse_regio_table(data: pd.DataFrame) -> pd.DataFrame:
attributes = data.filter(like="Auspraegung_Label")
attributes.columns = data.filter(like="Merkmal_Label").iloc[0].tolist()

# Extracts new column names from first values of the Merkmal_Label columns
# and assigns these to the relevant code columns (Auspraegung_Code)
codes = data.filter(like="Auspraegung_Code")
codes.columns = data.filter(like="Merkmal_Label").iloc[0].tolist()
codes.columns = [code + "_Code" for code in codes.columns]

# Selects all columns containing the values
values = data.filter(like="__")

# Given a name like BEV036__Bevoelkerung_in_Hauptwohnsitzhaushalten__1000,
# extracts the readable label and omits both the code and the unit
values.columns = [name.split("__")[1] for name in values.columns]

pretty_data = pd.concat([time, attributes, values], axis=1)
pretty_data = pd.concat([time, attributes, codes, values], axis=1)
return pretty_data
Loading