diff --git a/API/__init__.py b/API/__init__.py index 49e0a6d..793d9d7 100644 --- a/API/__init__.py +++ b/API/__init__.py @@ -26,7 +26,7 @@ "4": CouncilType.LOCAL_LEADER, "5": CouncilType.METROPOLITAN_COUNCIL, "6": CouncilType.LOCAL_COUNCIL, - "7": CouncilType.NATIONAL_COUNCIL_GLOBAL, + "7": CouncilType.NATIONAL_COUNCIL, "8": CouncilType.METROPOLITAN_COUNCIL, "9": CouncilType.LOCAL_COUNCIL, } diff --git a/API/utils.py b/API/utils.py index f16738a..57d8865 100644 --- a/API/utils.py +++ b/API/utils.py @@ -43,12 +43,15 @@ def save_to_mongo(data: List[dict], sgTypecode: str, where: str) -> None: # TODO: Support other types of councils if sgTypecode in ["8", "5", "2", "6", "9"]: for entry in data: + if entry["wiwName"] == None: + print(entry) entry["wiwName"] = change_local_name(entry["sdName"], entry["wiwName"]) district_id = get_district_id(entry["sdName"], entry["wiwName"]) if district_id: main_collection.update_one( { + "year": entry["year"], "name": entry["name"], "localId": district_id["localId"], "metroId": district_id["metroId"], @@ -62,8 +65,10 @@ def save_to_mongo(data: List[dict], sgTypecode: str, where: str) -> None: ) elif sgTypecode in ["7"]: for entry in data: + entry["wiwName"] = "전국" main_collection.update_one( { + "year": entry["year"], "name": entry["name"], "localId": 0, "metroId": 0, diff --git a/README.md b/README.md index 62c4c43..3fb757b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # 다양성 평가 리포트 웹사이트 - 뉴웨이즈 + + ## 프로젝트 개요 -프로젝트 이름 다양성 평가 리포트 웹사이트 - 뉴웨이즈 -기간 23 가을-겨울 + +프로젝트 이름 다양성 평가 리포트 웹사이트 - 뉴웨이즈 +기간 23 가을-겨울 ## 설치 및 실행 과정 + 1. 파이썬 가상환경 생성 - - 아래 명령을 실행하여 파이썬 가상환경을 생성합니다. - ```bash - cd ~ && virtualenv newways --python=3.10 - ``` + - 아래 명령을 실행하여 파이썬 가상환경을 생성합니다. + ```bash + cd ~ && virtualenv newways --python=3.10 + ``` 2. 가상환경 활성화 - - 아래 명령을 실행하여 가상환경을 활성화합니다. - ```bash - source ~/newways/bin/activate - ``` + - 아래 명령을 실행하여 가상환경을 활성화합니다. + ```bash + source ~/newways/bin/activate + ``` 3. 레포지토리 클론 - 아래 명령을 실행하여 레포지토리를 클론합니다. - ```bash + ```bash git clone https://github.com/NewWays-TechForImpactKAIST/API-scrap-and-analysis.git ``` 4. 필요한 패키지 설치 - requirements.txt에 명시된 패키지를 설치합니다. ```bash pip install -r requirements.txt - ``` + ``` 5. 환경 변수 설정 - `.env.example` 파일을 복사하여 `.env` 파일을 생성합니다. ```bash cp .env.example .env - ``` - - `.env` 파일을 열어 환경 변수의 값을 필요에 따라 바꾸어줍니다. + ``` + - `.env` 파일을 열어 환경 변수의 값을 필요에 따라 바꾸어줍니다. 6. 예제 코드 실행 - 이 프로젝트는 여러 개의 파이썬 패키지로 구성되어 있습니다. - 각각의 패키지는 독립적으로 실행할 수 있습니다. 단, 실행 시 python -m 옵션(module을 의미)을 사용해야 합니다. - 크롤링 및 데이터베이스 저장 예제 코드를 실행하려면, 아래 명령을 실행합니다. - ```bash - # scrap/local_councils/seoul/junggu.py 파일을 실행합니다. - python -m scrap.local_councils.seoul.junggu - # scrap/examples/database.py 파일을 실행합니다. - python -m scrap.examples.database - ``` \ No newline at end of file + ```bash + # scrap/local_councils/seoul/junggu.py 파일을 실행합니다. + python -m scrap.local_councils.seoul.junggu + # scrap/examples/database.py 파일을 실행합니다. + python -m scrap.examples.database + ``` diff --git a/analysis/diversity_db.py b/analysis/diversity_db.py index fd8fe01..a9934e6 100644 --- a/analysis/diversity_db.py +++ b/analysis/diversity_db.py @@ -60,7 +60,7 @@ def shannon(data, stair=0, opts=True): # ==================================== -def save_to_mongo_local(localId: int, factor: str, stair=0, opts=False) -> None: +def save_to_mongo_local(localId: int, factor: str, stair=0, opts=True) -> None: factor_field = {"age": "age", "gender": "gender", "party": "jdName"} data = [ councilor[factor_field[factor]] @@ -166,7 +166,7 @@ def calculate_age_diversity_rank_history_local() -> None: # ==================================== -def save_to_mongo_metro(metroId: int, factor: str, stair=0, opts=False) -> None: +def save_to_mongo_metro(metroId: int, factor: str, stair=0, opts=True) -> None: factor_field = {"age": "age", "gender": "gender", "party": "jdName"} data = [ councilor[factor_field[factor]] @@ -267,23 +267,115 @@ def calculate_age_diversity_rank_history_metro() -> None: ) -if __name__ == "__main__": - # for localId in range(1, 227): - # save_to_mongo_local(localId, "age", stair=10) - # save_to_mongo_local(localId, "gender") - # save_to_mongo_local(localId, "party") - # calculate_rank_local("age") - # calculate_rank_local("gender") - # calculate_rank_local("party") - # calculate_age_diversity_rank_history_local() - - # for metroId in range(1, 18): - # if metroId in [8, 17]: - # continue - # save_to_mongo_metro(metroId, "age", stair=10) - # save_to_mongo_metro(metroId, "gender") - # save_to_mongo_metro(metroId, "party") - # calculate_rank_metro("age") - # calculate_rank_metro("gender") - # calculate_rank_metro("party") +# ===================================== +# National council diversity statistics +# ===================================== + + +def save_to_mongo_national(factor: str, stair=0, opts=True) -> None: + factor_field = {"age": "age", "gender": "gender", "party": "jdName"} + data = [ + councilor[factor_field[factor]] + for councilor in client["council"]["national_councilor"].find() + ] + [ + councilor[factor_field[factor]] + for councilor in client["council"]["national_councilor_global"].find() + ] + print(f"National {factor}") + print(data) + client["stats"].get_collection("diversity_index").update_one( + {"national": True}, + {"$set": {f"{factor}DiversityIndex": gini_simpson(data, stair, opts)}}, + upsert=True, + ) + + +def calculate_age_diversity_rank_history_national() -> None: + for is_elected in [True, False]: + docs = client["stats"]["age_hist"].find( + { + "councilorType": "national_councilor", + "method": "equal", + "is_elected": is_elected, + } + ) + for doc in docs: + diversity_index = gini_simpson( + [ + group["minAge"] + for group in doc["data"] + for _ in range(group["count"]) + ], + stair=10, + ) + client["stats"]["age_hist"].find_one_and_update( + { + "councilorType": "national_councilor", + "method": "equal", + "is_elected": is_elected, + "year": doc["year"], + }, + {"$set": {"diversityIndex": diversity_index}}, + ) + + years = list({doc["year"] for doc in client["stats"]["age_hist"].find()}) + + for year in years: + result = client["stats"]["age_hist"].aggregate( + [ + { + "$match": { + "councilorType": "national_councilor", + "method": "equal", + "is_elected": is_elected, + "year": year, + } + }, + {"$sort": {"diversityIndex": -1}}, + {"$group": {"_id": "", "items": {"$push": "$$ROOT"}}}, + {"$unwind": {"path": "$items", "includeArrayIndex": "items.rank"}}, + {"$replaceRoot": {"newRoot": "$items"}}, + {"$addFields": {"rank": {"$add": ["$rank", 1]}}}, + ] + ) + for doc in result: + client["stats"]["age_hist"].find_one_and_update( + { + "councilorType": "national_councilor", + "method": "equal", + "is_elected": is_elected, + "year": year, + }, + {"$set": {"diversityRank": int(doc["rank"])}}, + ) + + +def main(): + for localId in range(1, 227): + save_to_mongo_local(localId, "age", stair=10) + save_to_mongo_local(localId, "gender") + save_to_mongo_local(localId, "party") + calculate_rank_local("age") + calculate_rank_local("gender") + calculate_rank_local("party") + calculate_age_diversity_rank_history_local() + + for metroId in range(1, 18): + if metroId in [17]: + continue + save_to_mongo_metro(metroId, "age", stair=10) + save_to_mongo_metro(metroId, "gender") + save_to_mongo_metro(metroId, "party") + calculate_rank_metro("age") + calculate_rank_metro("gender") + calculate_rank_metro("party") calculate_age_diversity_rank_history_metro() + + save_to_mongo_national("age", stair=10) + save_to_mongo_national("gender") + save_to_mongo_national("party") + calculate_age_diversity_rank_history_national() + + +if __name__ == "__main__": + main() diff --git a/analysis/gender_party_hist.py b/analysis/gender_party_hist.py new file mode 100644 index 0000000..6b813e5 --- /dev/null +++ b/analysis/gender_party_hist.py @@ -0,0 +1,354 @@ +# coding=utf-8 +import pandas as pd +import os +import warnings +from db.client import client +from analysis.age.hist_groups import ( + local_to_metro_list, + change_local_name, +) + +# 경고 무시 +warnings.filterwarnings("ignore", category=FutureWarning) + +BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir) + + +# =================================== +# Gender history calculations +# =================================== + + +def gender_hist( + councilor_type: str, level: int, is_elected: bool, filenames: list[str] +): + ## TO-DO: excel말고 mongodb에서 받아오도록 합니다. + assert (councilor_type, level) in [ + ("local_councilor", 2), + ("metro_councilor", 1), + ("national_councilor", 0), + ] + datadir = os.path.join(BASE_DIR, "_data") + df = pd.DataFrame() + + for d in filenames: + df_new = pd.read_excel(os.path.join(datadir, d)) + df = pd.concat([df, df_new]) + + district_db = client["district"] + gender_hist_collection = client["stats"].get_collection("gender_hist") + + df["wiwName"] = df["wiwName"].apply(lambda x: x if isinstance(x, str) else "") + df["sdName"] = df[["sdName", "wiwName"]].apply( + lambda x: local_to_metro_list(*x), axis=1 + ) + df["wiwName"] = df[["sdName", "wiwName"]].apply( + lambda x: change_local_name(*x), axis=1 + ) + + if level == 0: + df = df[["sgId", "name", "gender"]].groupby(by=["sgId", "gender"]).count() + for idx in df.index: + year = int(str(idx[0])[:4]) + print(f"{year=}") + gender_hist_collection.find_one_and_update( + { + "councilorType": "national_councilor", + "is_elected": is_elected, + "level": 0, + "year": year, + }, + {"$set": {idx[1]: int(df["name"][idx])}}, + upsert=True, + ) + + elif level == 1: + df = ( + df[["sgId", "sdName", "name", "gender"]] + .groupby(by=["sgId", "sdName", "gender"]) + .count() + ) + for idx in df.index: + year = int(str(idx[0])[:4]) + print(f"{year=} sdName={idx[1]}") + metroId = district_db.get_collection("metro_district").find_one( + {"sdName": idx[1]} + )["metroId"] + + gender_hist_collection.find_one_and_update( + { + "councilorType": "metro_councilor", + "is_elected": is_elected, + "level": 1, + "metroId": metroId, + "year": year, + }, + {"$set": {idx[2]: int(df["name"][idx])}}, + upsert=True, + ) + + elif level == 2: + df = ( + df[["sgId", "sdName", "wiwName", "name", "gender"]] + .groupby(by=["sgId", "sdName", "wiwName", "gender"]) + .count() + ) + for idx in df.index: + year = int(str(idx[0])[:4]) + print(f"{year=} sdName={idx[1]} wiwName={idx[2]}") + doc = district_db["local_district"].find_one( + { + "sdName": idx[1], + "wiwName": idx[2] if idx[1] != "세종특별자치시" else "세종특별자치시", + } + ) + metroId, localId = doc["metroId"], doc["localId"] + + gender_hist_collection.find_one_and_update( + { + "councilorType": "local_councilor", + "is_elected": is_elected, + "level": 2, + "metroId": metroId, + "localId": localId, + "year": year, + }, + {"$set": {idx[3]: int(df["name"][idx])}}, + upsert=True, + ) + + +def gender_hist_add_zero(): + gender_hist_collection = client["stats"].get_collection("gender_hist") + gender_hist_collection.update_many({"남": {"$exists": False}}, {"$set": {"남": 0}}) + gender_hist_collection.update_many({"여": {"$exists": False}}, {"$set": {"여": 0}}) + + +# =================================== +# Party history calculations +# =================================== + + +def party_hist(councilor_type: str, level: int, is_elected: bool, filenames: list[str]): + ## TO-DO: excel말고 mongodb에서 받아오도록 합니다. + assert (councilor_type, level) in [ + ("local_councilor", 2), + ("metro_councilor", 1), + ("national_councilor", 0), + ] + datadir = os.path.join(BASE_DIR, "_data") + df = pd.DataFrame() + + for d in filenames: + df_new = pd.read_excel(os.path.join(datadir, d)) + df = pd.concat([df, df_new]) + + district_db = client["district"] + party_hist_collection = client["stats"].get_collection("party_hist") + + df["wiwName"] = df["wiwName"].apply(lambda x: x if isinstance(x, str) else "") + df["sdName"] = df[["sdName", "wiwName"]].apply( + lambda x: local_to_metro_list(*x), axis=1 + ) + df["wiwName"] = df[["sdName", "wiwName"]].apply( + lambda x: change_local_name(*x), axis=1 + ) + + if level == 0: + df = df[["sgId", "name", "jdName"]].groupby(by=["sgId", "jdName"]).count() + for idx in df.index: + year = int(str(idx[0])[:4]) + print(f"{year=}") + party_hist_collection.find_one_and_update( + { + "councilorType": "national_councilor", + "is_elected": is_elected, + "level": 0, + "year": year, + }, + {"$set": {idx[1]: int(df["name"][idx])}}, + upsert=True, + ) + + elif level == 1: + df = ( + df[["sgId", "sdName", "name", "jdName"]] + .groupby(by=["sgId", "sdName", "jdName"]) + .count() + ) + for idx in df.index: + year = int(str(idx[0])[:4]) + print(f"{year=} sdName={idx[1]}") + metroId = district_db.get_collection("metro_district").find_one( + {"sdName": idx[1]} + )["metroId"] + + party_hist_collection.find_one_and_update( + { + "councilorType": "metro_councilor", + "is_elected": is_elected, + "level": 1, + "metroId": metroId, + "year": year, + }, + {"$set": {idx[2]: int(df["name"][idx])}}, + upsert=True, + ) + + elif level == 2: + df = ( + df[["sgId", "sdName", "wiwName", "name", "jdName"]] + .groupby(by=["sgId", "sdName", "wiwName", "jdName"]) + .count() + ) + for idx in df.index: + year = int(str(idx[0])[:4]) + print(f"{year=} sdName={idx[1]} wiwName={idx[2]}") + doc = district_db["local_district"].find_one( + { + "sdName": idx[1], + "wiwName": idx[2] if idx[1] != "세종특별자치시" else "세종특별자치시", + } + ) + metroId, localId = doc["metroId"], doc["localId"] + + party_hist_collection.find_one_and_update( + { + "councilorType": "local_councilor", + "is_elected": is_elected, + "level": 2, + "metroId": metroId, + "localId": localId, + "year": year, + }, + {"$set": {idx[3]: int(df["name"][idx])}}, + upsert=True, + ) + + +# =================================================== +# Age history calculations for national councilors +# =================================================== + + +def age_hist_national(is_elected: bool, filenames: list[str]): + datadir = os.path.join(BASE_DIR, "_data") + df = pd.DataFrame() + + for d in filenames: + df_new = pd.read_excel(os.path.join(datadir, d)) + df = pd.concat([df, df_new]) + + df = df[["sgId", "name", "age"]].groupby(by=["sgId", "age"]).count().reset_index() + df["cumsum"] = df.groupby(by="sgId")["name"].cumsum() + df["quintile"] = pd.qcut(df["cumsum"], q=5, labels=[0, 1, 2, 3, 4]) + + total = ( + df[["sgId", "name"]] + .groupby(by=["sgId"]) + .sum() + .rename(columns={"name": "total"}) + ) + first_quintile = ( + df.loc[df["quintile"] == 0] + .groupby(by=["sgId"]) + .max()[["age"]] + .rename(columns={"age": "q1"}) + ) + last_quintile = ( + df.loc[df["quintile"] == 4] + .groupby(by=["sgId"]) + .min()[["age"]] + .rename(columns={"age": "q5"}) + ) + quintiles = pd.concat([total, first_quintile, last_quintile], axis=1) + + hist_data: dict[int, list[dict]] = {} + for _, row in df.iterrows(): + year = int(str(row["sgId"])[:4]) + age = int(row["age"]) + cnt = int(row["name"]) + age_group = int(row["quintile"]) + + if year in hist_data: + hist_data[year].append( + {"minAge": age, "maxAge": age + 1, "count": cnt, "ageGroup": age_group} + ) + else: + hist_data[year] = [ + {"minAge": age, "maxAge": age + 1, "count": cnt, "ageGroup": age_group} + ] + + stat_data: dict[int, dict] = {} + for sgId, row in quintiles.iterrows(): + year = int(str(sgId)[:4]) + stat_data[year] = { + "firstquintile": int(row["q1"]), + "lastquintile": int(row["q5"]), + "population": int(row["total"]), + } + + age_hist_collection = client["stats"].get_collection("age_hist") + for year, docs in hist_data.items(): + age_hist_collection.find_one_and_update( + { + "councilorType": "national_councilor", + "is_elected": is_elected, + "level": 0, + "method": "equal", + "year": year, + }, + {"$set": {"data": docs}}, + upsert=True, + ) + + age_stat_collection = client["stats"].get_collection("age_stat") + for year, doc in stat_data.items(): + age_stat_collection.find_one_and_update( + { + "councilorType": "national_councilor", + "is_elected": is_elected, + "level": 0, + "method": "equal", + "year": year, + }, + {"$set": {"data": [doc]}}, + upsert=True, + ) + + +def main(): + gender_hist( + "local_councilor", 2, True, ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"] + ) + gender_hist( + "local_councilor", 2, False, ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"] + ) + + gender_hist("metro_councilor", 1, True, ["[당선][시도의원].xlsx", "[당선][광역의원비례대표].xlsx"]) + gender_hist("metro_councilor", 1, False, ["[후보][시도의원].xlsx", "[후보][광역의원비례대표].xlsx"]) + + gender_hist("national_councilor", 0, True, ["[당선][국회의원].xlsx"]) + gender_hist("national_councilor", 0, False, ["[후보][국회의원].xlsx"]) + + gender_hist_add_zero() + + party_hist( + "local_councilor", 2, True, ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"] + ) + party_hist( + "local_councilor", 2, False, ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"] + ) + + party_hist("metro_councilor", 1, True, ["[당선][시도의원].xlsx", "[당선][광역의원비례대표].xlsx"]) + party_hist("metro_councilor", 1, False, ["[후보][시도의원].xlsx", "[후보][광역의원비례대표].xlsx"]) + + party_hist("national_councilor", 0, True, ["[당선][국회의원].xlsx"]) + party_hist("national_councilor", 0, False, ["[후보][국회의원].xlsx"]) + + age_hist_national(True, ["[당선][국회의원].xlsx"]) + age_hist_national(False, ["[후보][국회의원].xlsx"]) + + +if __name__ == "__main__": + main() diff --git a/scrap/local_councils/gyeongsang.py b/scrap/local_councils/gyeongsang.py index 73ccec7..693bcbf 100644 --- a/scrap/local_councils/gyeongsang.py +++ b/scrap/local_councils/gyeongsang.py @@ -389,6 +389,27 @@ def scrap_204( return ret_local_councilors(cid, councilors) +def scrap_205( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 영양군""" + # TODO : gzip 문제 생기니, selenium으로 대체 + print(url) + soup = get_soup(url, verify=False) + councilors: List[Councilor] = [] + profile_list = soup.find("div", id="content_box") + for name_tag in profile_list.find_all("h3"): + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" + ul = name_tag.find_next("ul") + li_party = ul.find("li", string="소속정당") + party = li_party.text.split(" : ")[-1].strip() + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) + + def scrap_206( url, cid, diff --git a/scrap/utils/runner_args.json b/scrap/utils/runner_args.json index d03af28..d2dbddf 100644 --- a/scrap/utils/runner_args.json +++ b/scrap/utils/runner_args.json @@ -10,7 +10,7 @@ 88, 97, 103, 107, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 132, 134, 140, 142, 154, 155, 156, 157, 160, 161, 162, 163, 164, 165, 167, 175, 177, 178, 179, 182, 183, 184, 186, 188, 189, 190, 191, 192, 194, - 195, 196, 197, 198, 199, 201, 202, 203, 204, 206, 208, 209, 210, 212, 213, 214, 215, 216, + 195, 196, 197, 198, 199, 201, 202, 203, 204, 205, 206, 208, 209, 210, 212, 213, 214, 215, 216, 217, 218, 219, 220, 222, 223, 224, 226 ], "selenium_basic": [76, 78, 101, 169, 173], diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py index 194bc91..70652e2 100644 --- a/scrap/utils/spreadsheet.py +++ b/scrap/utils/spreadsheet.py @@ -166,6 +166,7 @@ def scrap_all_local_councils() -> None: 202, 203, 204, + 205, 206, 208, 209, @@ -189,7 +190,7 @@ def scrap_all_local_councils() -> None: parse_error_times = 0 timeouts = 0 N = 226 - for n in [204]: + for n in [205]: if n in no_information + error_unsolved: emsg: str = ( (