Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Scrap] 57-112 중 에러: [76, 78, 97, 101, 106, 111], 총 6회 #47

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions analysis/age/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
공공데이터포털 API로 수집한 데이터를 분석하기 위한 패키지입니다.
"""
111 changes: 111 additions & 0 deletions analysis/age/draw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt


def make_scatterplot(package):
    """Draw and save a scatter plot of cluster mean ages per area.

    Args:
        package: 10-tuple ``(outdir, total_population_count, year, df_age,
            n_clst, method, cluster_by, folder_name, colors, font_name)``.
            ``df_age`` needs ``area`` and ``age`` columns.  ``n_clst``,
            ``cluster_by`` and ``colors`` are accepted so the tuple has the
            same layout as the one used by ``make_hist`` but are not used.

    Side effects:
        Prints ``df_age`` and writes
        ``<outdir>/<method>/clustering_result (<year>).png``.
    """
    (
        outdir,
        total_population_count,
        year,
        df_age,
        _n_clst,
        method,
        _cluster_by,
        folder_name,
        _colors,
        font_name,
    ) = package
    # Visualize the clustering result as a scatter plot.
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    print(df_age)
    # No ``palette`` here: without a ``hue`` variable seaborn ignores it and
    # only emits a warning.
    sns.scatterplot(data=df_age, x="age", y="area")
    # Annotate every point with its (mean) age.
    for _, row in df_age.iterrows():
        age = float(row["age"])
        if np.isnan(age):
            # Empty clusters have a NaN mean; matplotlib cannot place text
            # at a non-finite position.
            continue
        plt.text(
            age,
            row["area"],
            f"{age:.2f}",
            fontsize=12,
            ha="right",
            fontname=font_name,
        )
    plt.xlabel("나이", fontname=font_name)
    plt.ylabel("지역", fontname=font_name)
    plt.yticks(fontname=font_name)
    plt.title(
        f"{folder_name}자 나이 분포 ({year}) <총 {total_population_count}명>",
        fontname=font_name,
    )
    # Save the figure; make sure the target directory exists so the function
    # also works when called on its own.
    target_dir = os.path.join(outdir, method)
    os.makedirs(target_dir, exist_ok=True)
    plt.savefig(os.path.join(target_dir, f"clustering_result ({year}).png"), dpi=300)
    plt.close()


def plot_eachgroup(df, n_clst, colors):
    """Draw one step histogram of ages per cluster on the current figure.

    Args:
        df: DataFrame with ``age`` and ``cluster_label`` columns.
        n_clst: Number of cluster labels to plot (labels ``0 .. n_clst - 1``).
        colors: Sequence indexable by cluster label giving each cluster's color.

    Side effects:
        Prints the size and the ages of every cluster.
    """
    # The plotted range always covers at least ages 20..80.
    minage = int(min(df["age"].min(), 20))
    maxage = int(max(df["age"].max(), 80))
    # One-year-wide bins.  The edges must run up to ``maxage + 1`` so that the
    # last bin is [maxage, maxage + 1]; stopping earlier drops the oldest ages
    # from the histogram.
    bin_edges = range(minage, maxage + 2)
    for i in range(n_clst):
        clst_data = df[df["cluster_label"] == i]
        sns.histplot(
            data=clst_data,
            x="age",
            kde=False,
            label=f"Cluster {i}",
            color=colors[i],
            element="step",
            bins=bin_edges,
        )
        # Report how many people are in the cluster.
        print(f"Cluster {i}: {clst_data.shape[0]} people")
        # Report the ages in the cluster.
        print(f"Cluster {i}: {clst_data['age']}")


def make_hist(package):
    """Draw and save the per-cluster age histogram of one area.

    Args:
        package: 10-tuple ``(outdir, df, year, area, n_clst, method,
            cluster_by, folder_name, colors, font_name)``.  ``df`` holds the
            clustered rows (``age``, ``cluster_label`` and the ``cluster_by``
            column); ``cluster_by`` must be ``"sdName"`` or ``"wiwName"``.

    Returns:
        None.  When ``cluster_by`` is not supported a message is printed and
        nothing is drawn.

    Side effects:
        Writes ``<outdir>/<method>/<year>-<area>.png`` and prints its path.
    """
    (
        outdir,
        df,
        year,
        area,
        n_clst,
        method,
        cluster_by,
        folder_name,
        colors,
        font_name,
    ) = package
    # Validate before creating a figure so the early return cannot leave an
    # open figure behind (or fail with a KeyError on an unknown column).
    if cluster_by not in ("sdName", "wiwName"):
        print("cluster_by를 sdName 또는 wiwName으로 설정해주세요.")
        return

    total_population_count = df[df[cluster_by] == area].shape[0]
    if cluster_by == "sdName":
        title = f"{area} {folder_name}자 나이 분포 ({year}) <총 {total_population_count}명>"
    else:
        # For wiwName areas, prefix the title with the sdName of the first row.
        sdName = df[df["wiwName"] == area]["sdName"].iloc[0]
        title = f"{sdName} {area} {folder_name}자 나이 분포 ({year}) <총 {total_population_count}명>"
    out_path = os.path.join(outdir, method, f"{year}-{area}.png")

    plt.figure(figsize=(10, 6))
    plot_eachgroup(df, n_clst, colors)
    plt.title(title, fontname=font_name)
    plt.xlabel("나이", fontname=font_name)
    plt.ylabel("인원 수", fontname=font_name)
    # Y ticks every 5 people, up to the most common age's head count (min 10).
    max_ppl_in_age = df["age"].value_counts().max()
    plt.yticks(np.arange(0, max(10, max_ppl_in_age), step=5), fontsize=12)
    plt.savefig(out_path)
    plt.close()
    print("Saved ", out_path)
158 changes: 158 additions & 0 deletions analysis/age/hist_groups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# coding=utf-8
import os
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from matplotlib import cm
from analysis.age.draw import make_scatterplot, make_hist


def _plot_age_hist(cluster_df, label, color):
    """Best-effort histogram (with KDE) of one cluster's ages."""
    try:
        ages = cluster_df["age"]
        sns.histplot(
            data=cluster_df,
            x="age",
            kde=True,
            label=label,
            color=color,
            element="step",
            # One-year bins; edges run to max + 1 so a cluster with a single
            # distinct age still yields a valid bin.
            bins=range(int(ages.min()), int(ages.max()) + 2),
        )
    except Exception as exc:
        # Plotting stays best-effort (e.g. an empty cluster has no min/max),
        # but the failure is reported and KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by a bare ``except``.
        print(f"Skipped histogram for {label}: {exc!r}")


def plot_young_and_old(youngest_cluster, oldest_cluster):
    """Overlay age histograms of the youngest and the oldest cluster.

    Args:
        youngest_cluster: DataFrame with an ``age`` column (drawn in blue).
        oldest_cluster: DataFrame with an ``age`` column (drawn in red).

    A cluster that cannot be plotted is skipped with a printed message; the
    other cluster is still attempted.
    """
    _plot_age_hist(youngest_cluster, "Youngest Cluster", "blue")
    _plot_age_hist(oldest_cluster, "Oldest Cluster", "red")


def cluster_data(method, n_clst, df):
    """Assign a ``cluster_label`` to every row based on its ``age``.

    Args:
        method: ``"kmeans"`` (K-means on the age column) or ``"equal"``
            (split the age-ordered rows into ``n_clst`` near-equal groups).
        n_clst: Requested number of clusters.
        df: DataFrame with an ``age`` column.

    Returns:
        A copy of ``df`` with an added ``cluster_label`` column.  The input
        frame itself is left untouched.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Work on a copy: the caller passes groupby slices, and writing a new
    # column into them mutates caller data / triggers SettingWithCopyWarning.
    df = df.copy()
    if method == "kmeans":
        ages_data = df[["age"]]
        # Never ask for more clusters than there are rows.
        kmeans = KMeans(n_clusters=min(n_clst, len(ages_data)), random_state=0)
        kmeans.fit(ages_data)
        clst_labels = kmeans.labels_
    elif method == "equal":
        # len(df) // n_clst rows per label, with the remainder handed out to
        # the lowest labels, in ascending label order.
        sorted_labels = np.sort(
            np.concatenate(
                [
                    np.repeat(np.arange(n_clst), len(df) // n_clst),
                    np.arange(len(df) % n_clst),
                ]
            )
        )
        # Hand the labels out by age rank so the result does not depend on the
        # caller having sorted the rows (a stable argsort is the identity for
        # input that is already sorted by age).
        age_order = np.argsort(df["age"].to_numpy(), kind="stable")
        clst_labels = np.empty(len(df), dtype=sorted_labels.dtype)
        clst_labels[age_order] = sorted_labels
    else:
        raise ValueError(
            f"Unknown clustering method {method!r}; expected 'kmeans' or 'equal'"
        )
    df["cluster_label"] = clst_labels

    # Keep people of the same age in the same cluster.
    # Cluster 0 absorbs everyone sharing its maximum age (this also keeps
    # cluster 0 populated) ...
    max_age = df.loc[df["cluster_label"] == 0, "age"].max()
    df.loc[df["age"] == max_age, "cluster_label"] = 0
    # ... and every cluster from 2 upwards absorbs everyone sharing its
    # minimum age, which settles the remaining boundaries.
    for i in range(2, n_clst):
        min_age = df.loc[df["cluster_label"] == i, "age"].min()
        df.loc[df["age"] == min_age, "cluster_label"] = i
    return df


def cluster(df, year, n_clst, method, cluster_by, outdir, font_name, folder_name):
    """Cluster people by age within each area, print statistics and plot.

    Args:
        df: DataFrame with ``age``, ``gender`` and the ``cluster_by`` column.
        year: Election year, used in titles and file names.
        n_clst: Number of clusters requested per area.
        method: Clustering method forwarded to ``cluster_data``.
        cluster_by: Column that defines the areas (groupby key).
        outdir: Base output directory; plots go to ``<outdir>/<method>``.
        font_name: Font family used for plot text.
        folder_name: Data-set name shown in plot titles.

    Side effects:
        Creates the output directory, prints per-area statistics, and saves
        one histogram per area plus one overall scatter plot.
    """
    os.makedirs(os.path.join(outdir, method), exist_ok=True)
    youngest_age = ("", 100)
    oldest_age = ("", 0)
    print(f"({year}), {n_clst} clusters")
    print(f"{'-' * 20}")
    # One distinct color per cluster label.
    colors = cm.rainbow(np.linspace(0, 1, n_clst))

    # Per-area frames of sorted cluster mean ages, concatenated once at the end.
    area_frames = []
    for area, df_clst in df.groupby(cluster_by):
        df_clst = cluster_data(method, n_clst, df_clst)
        # Mean age per cluster label, rounded to 2 decimals.  Labels without
        # any member (areas with fewer people than n_clst) yield NaN.
        clst_age_mean = [
            round(df_clst.loc[df_clst["cluster_label"] == i, "age"].mean(), 2)
            for i in range(n_clst)
        ]
        # Only populated clusters take part in min/max/sort; NaN would make
        # those comparisons order-dependent and block the oldest/youngest
        # bookkeeping below.
        populated = [
            (label, mean_age)
            for label, mean_age in enumerate(clst_age_mean)
            if not pd.isna(mean_age)
        ]
        if not populated:
            print(f"No usable ages in {area}; skipped")
            continue
        clst_of_young, youngest_mean = min(populated, key=lambda item: item[1])
        clst_of_old, oldest_mean = max(populated, key=lambda item: item[1])
        # Sort a separate list: ``clst_age_mean`` must stay indexed by label
        # for the per-cluster report at the end of the loop body.
        sorted_means = sorted(mean_age for _, mean_age in populated)
        area_frames.append(pd.DataFrame({"area": area, "age": sorted_means}))
        print(sorted_means)

        yb_clst = df_clst[df_clst["cluster_label"] == clst_of_young]
        ob_clst = df_clst[df_clst["cluster_label"] == clst_of_old]
        print(f"Youngest in {area}: {yb_clst['age'].min()} - {yb_clst['age'].max()}")
        print(f"Oldest in {area}: {ob_clst['age'].min()} - {ob_clst['age'].max()}")
        if youngest_mean < youngest_age[1]:
            youngest_age = (area, youngest_mean)
        if oldest_mean > oldest_age[1]:
            oldest_age = (area, oldest_mean)

        # Share of women in the youngest and the oldest cluster.
        young_group_sexratio = (
            yb_clst[yb_clst["gender"] == "여"].shape[0] / yb_clst.shape[0]
        )
        old_group_sexratio = (
            ob_clst[ob_clst["gender"] == "여"].shape[0] / ob_clst.shape[0]
        )
        print(
            f"젊은 층의 성비는 여자가 {young_group_sexratio}, 노인층의 성비는 여자가 {old_group_sexratio}"
        )

        # Draw the per-area histogram.
        package = (
            outdir,
            df_clst,
            year,
            area,
            n_clst,
            method,
            cluster_by,
            folder_name,
            colors,
            font_name,
        )
        make_hist(package)

        print(f"Number of data points per cluster for {area}")
        for cluster_label in range(n_clst):
            closest_data_count = sum(df_clst["cluster_label"] == cluster_label)
            print(
                f"Cluster {cluster_label}: Age {clst_age_mean[cluster_label]}, {closest_data_count} closest data points"
            )
    print(f"Youngest in {youngest_age[0]}: {youngest_age[1]}")
    print(f"Oldest in {oldest_age[0]}: {oldest_age[1]}")

    if area_frames:
        df_age = pd.concat(area_frames, ignore_index=True)
    else:
        df_age = pd.DataFrame(columns=["area", "age"])

    # Draw the overall scatter plot of cluster mean ages per area.
    package = (
        outdir,
        df.shape[0],
        year,
        df_age,
        n_clst,
        method,
        cluster_by,
        folder_name,
        colors,
        font_name,
    )
    make_scatterplot(package)
48 changes: 48 additions & 0 deletions analysis/age/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# coding=utf-8
import pandas as pd
import os
import warnings
from matplotlib import font_manager
from analysis.age.most_common_age_group import most_common_age_group
from analysis.age.hist_groups import cluster

# Silence FutureWarning noise from the plotting/data libraries.
warnings.filterwarnings("ignore", category=FutureWarning)

BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
# Korean font for matplotlib.  The bundled TTF has to be registered with the
# font manager; reading its family name alone does not make the font
# resolvable through ``fontname=...`` unless it is also installed system-wide.
_FONT_PATH = os.path.join(BASE_DIR, "_data", "NanumSquareL.ttf")
font_manager.fontManager.addfont(_FONT_PATH)
font_name = font_manager.FontProperties(fname=_FONT_PATH).get_name()


def main():
    """Run both clustering methods on every Excel file of every data set.

    For each data-set folder under ``<BASE_DIR>/_data`` and each grouping
    column, every ``.xlsx`` file is loaded, reduced to the needed columns,
    sorted by age and clustered with the ``"kmeans"`` and ``"equal"``
    methods.  Plots are written below
    ``<BASE_DIR>/output/age_all_<cluster_by>/<folder_name>``.
    """
    n_clst = 7
    for folder_name in ["지선-당선", "지선-후보"]:
        # sdName: province/metropolitan level, wiwName: basic local-government level.
        for cluster_by in ["sdName", "wiwName"]:
            datadir = os.path.join(BASE_DIR, "_data", folder_name)
            outdir = os.path.join(
                BASE_DIR, "output", f"age_all_{cluster_by}", folder_name
            )

            # Sorted so that runs process the files in a reproducible order.
            for d in sorted(os.listdir(datadir)):
                # Only Excel files are processed.
                if not d.endswith(".xlsx"):
                    continue
                df = pd.read_excel(os.path.join(datadir, d))

                # Keep only the columns that are needed.
                df = df[["sdName", "wiwName", "name", "age", "gender"]]
                df = df.sort_values(by="age")
                # NOTE(review): assumes characters 7..10 of the file name are
                # the election year - confirm against the data files.
                year = d[7:11]
                for method in ("kmeans", "equal"):
                    cluster(
                        df,
                        year,
                        n_clst,
                        method,
                        cluster_by,
                        outdir,
                        font_name,
                        folder_name,
                    )


# Guarded so that importing this module does not start the whole analysis.
if __name__ == "__main__":
    main()
24 changes: 24 additions & 0 deletions analysis/age/most_common_age_group.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# coding=utf-8
import pandas as pd


def most_common_age_group(df, d):
    """Print and return the most common age bracket of every region.

    Ages are binned into ``0-30`` and then ten-year brackets up to ``91-100``
    (right-inclusive bins); ages outside (0, 100] belong to no bracket.

    Args:
        df: DataFrame with ``age`` and ``sdName`` columns.  It is not modified.
        d: Label printed in front of the result (e.g. a file name).

    Returns:
        Series indexed by ``sdName`` holding the most frequent bracket label,
        or a missing value for a region without any age inside the bins.
    """
    # Keep the brackets in a separate Series instead of adding a column to the
    # caller's DataFrame.
    age_groups = pd.cut(
        df["age"],
        [0, 30, 40, 50, 60, 70, 80, 90, 100],
        labels=["0-30", "31-40", "41-50", "51-60", "61-70", "71-80", "81-90", "91-100"],
    ).rename("age_group")

    def _first_mode(groups):
        # ``mode()`` is empty when every value in the region is missing.
        modes = groups.mode()
        return modes.iloc[0] if not modes.empty else None

    # Most frequent bracket per region.
    most_common_age_group_by_region = age_groups.groupby(df["sdName"]).agg(_first_mode)

    # Report the result.
    print(d, most_common_age_group_by_region)
    return most_common_age_group_by_region
Loading