diff --git a/analysis/age/hist_groups.py b/analysis/age/hist_groups.py
index 94a5cda..b1beee9 100644
--- a/analysis/age/hist_groups.py
+++ b/analysis/age/hist_groups.py
@@ -181,7 +181,7 @@ def insert_data_to_mongo(
     )
 
 
-def cluster(df_original, n_clst, basedic):
+def cluster(df_original, n_clst, basedic, clean_flag=True):
     """구역별 그룹을 만듭니다.
     df_original: 데이터프레임
     n_clst: 그룹 수
@@ -194,11 +194,11 @@ def cluster(df_original, n_clst, basedic):
     histcoll = statdb["age_hist"]
     statcoll = statdb["age_stat"]  # method = "equal"에서 써 줄 통계.
     # 기존 histogram 정보는 삭제 (나이별로 넣는 것이기 때문에 찌꺼기값 존재가능)
-    histcoll.delete_many(basedic.__dict__)
-    if basedic.method == "equal":
-        statcoll.delete_many(basedic.__dict__)
+    if clean_flag:
+        histcoll.delete_many(basedic.__dict__)
+        if basedic.method == "equal":
+            statcoll.delete_many(basedic.__dict__)
     # 연도별로 데이터 찾아서 넣기!
-    df_original["year"] = df_original["sgId"] // 10000
     df_original = df_original[df_original["year"].isin([2010, 2014, 2018, 2022])]
     years = df_original["year"].unique()
     for year in years:
diff --git a/analysis/age/main.py b/analysis/age/main.py
index bc562ba..a766556 100644
--- a/analysis/age/main.py
+++ b/analysis/age/main.py
@@ -6,6 +6,7 @@
 from analysis.age.most_common_age_group import most_common_age_group
 from analysis.age.hist_groups import cluster
 from analysis.age import BasicArgument
+from db.client import client
 
 # 경고 무시
 warnings.filterwarnings("ignore", category=FutureWarning)
@@ -23,9 +24,10 @@
     "기초의원비례대표": "local_councilor",
 }
 
+personDB = client["council"]
 
-def run(cluster_by, filenames, N=5, folder_name="To_be_filled"):
-    ## TO-DO: excel말고 mongodb에서 받아오도록 합니다.
+
+def run_by_excel(cluster_by, filenames, N=5, folder_name="To_be_filled"):
     assert cluster_by in ["sdName", "wiwName"]
     level = 1 if cluster_by == "sdName" else 2
     datadir = os.path.join(BASE_DIR, "_data", folder_name)
@@ -38,6 +40,7 @@ def run(cluster_by, filenames, N=5, folder_name="To_be_filled"):
         else:
             df = df[["sgId", "sdName", "wiwName", "name", "age", "gender"]]
         df = df.sort_values(by="age")
+        df["year"] = df["sgId"] // 10000
         is_elected = (
             True
             if "당선" in d
@@ -56,13 +59,63 @@ def run(cluster_by, filenames, N=5, folder_name="To_be_filled"):
             cluster(df, N, basedic)
 
 
+# Legacy Excel-based entry point, kept for reference.
+# def main(N=5):
+#     run_by_excel("sdName", ["[당선][시도의원].xlsx", "[당선][광역의원비례대표].xlsx"])
+#     run_by_excel("sdName", ["[후보][시도의원].xlsx", "[후보][광역의원비례대표].xlsx"])
+#     run_by_excel("sdName", ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"])
+#     run_by_excel("sdName", ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"])
+#     run_by_excel("wiwName", ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"])
+#     run_by_excel("wiwName", ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"])
+
+
+def run_by_mongo(cluster_by, is_elected, councilorType, N=5):
+    """Cluster councilor ages from person records stored in MongoDB.
+
+    cluster_by: "sdName" (level 1) or "wiwName" (level 2)
+    is_elected: True reads the councilorType collection, False reads the
+        councilorType + "_candidate" collection
+    councilorType: base collection name, e.g. "metro_councilor"
+    N: number of groups handed to cluster()
+    """
+    assert cluster_by in ["sdName", "wiwName"]
+    level = 1 if cluster_by == "sdName" else 2
+    # Only the collection name gets the "_candidate" suffix. The stats key keeps
+    # the base councilorType, because is_elected already separates elected rows
+    # from candidate rows.
+    collection_name = councilorType if is_elected else councilorType + "_candidate"
+    fields = ["year", "sdName", "name", "age", "gender"]
+    if level == 2:
+        fields.insert(2, "wiwName")
+    cursor = personDB[collection_name].find({}, {field: 1 for field in fields})
+    data = [{field: person.get(field) for field in fields} for person in cursor]
+    if not data:
+        # An empty DataFrame has no "age" column, so sort_values would raise.
+        warnings.warn(f"{collection_name}: no documents found, skipping")
+        return
+
+    df = pd.DataFrame(data).sort_values(by="age")
+
+    for method in ["kmeans", "equal"]:
+        basedic = BasicArgument(
+            councilorType=councilorType,
+            is_elected=is_elected,
+            level=level,
+            method=method,
+        )
+        cluster(df, N, basedic, clean_flag=True)
+
+
 def main(N=5):
-    run("sdName", ["[당선][시도의원].xlsx", "[당선][광역의원비례대표].xlsx"])
-    run("sdName", ["[후보][시도의원].xlsx", "[후보][광역의원비례대표].xlsx"])
-    run("sdName", ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"])
-    run("sdName", ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"])
-    run("wiwName", ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"])
-    run("wiwName", ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"])
+    # Run the sdName passes before the wiwName ones: Sejong was promoted at
+    # some point, so with sdName first it is fine even if sdName data is pushed
+    # out when cluster starts (see the cluster function).
+    run_by_mongo("sdName", is_elected=True, councilorType="metro_councilor", N=N)
+    run_by_mongo("sdName", is_elected=False, councilorType="metro_councilor", N=N)
+    run_by_mongo("sdName", is_elected=True, councilorType="local_councilor", N=N)
+    run_by_mongo("sdName", is_elected=False, councilorType="local_councilor", N=N)
+    run_by_mongo("wiwName", is_elected=True, councilorType="local_councilor", N=N)
+    run_by_mongo("wiwName", is_elected=False, councilorType="local_councilor", N=N)
 
 
 main()