diff --git a/analysis/age/hist_groups.py b/analysis/age/hist_groups.py index 0a3d476..18722cb 100644 --- a/analysis/age/hist_groups.py +++ b/analysis/age/hist_groups.py @@ -8,6 +8,7 @@ from analysis.age.draw import make_scatterplot, make_hist from db.client import client + def plot_young_and_old(youngest_cluster, oldest_cluster): try: sns.histplot( @@ -65,6 +66,7 @@ def cluster_data(method, n_clst, df): df.loc[df["age"] == min_age, "cluster_label"] = i return df + # 이름이 바뀐 경우 change_city_name = { ("충청남도", "당진군"): "당진시", @@ -75,9 +77,10 @@ def cluster_data(method, n_clst, df): ("인천광역시", "남구"): "미추홀구", } -# +# change_lvl2to1 = {"연기군": "세종특별자치시"} + def change_local_name(sdName, wiwName): """ 1. 만약 '시' 와 '구'가 모두 wiwName에 있다면, '시' 까지만 쓰기 @@ -90,20 +93,23 @@ def change_local_name(sdName, wiwName): """ if (sdName, wiwName) in change_city_name: return change_city_name[(sdName, wiwName)] - if '구' in wiwName and '시' in wiwName: - return wiwName.split('시')[0] + '시' + if "구" in wiwName and "시" in wiwName: + return wiwName.split("시")[0] + "시" else: return wiwName + def local_to_metro_list(sdName, wiwName): """ 구시군에서 광역시/도로 승격한 경우 """ if wiwName in change_lvl2to1: - print('change', wiwName, 'to', change_lvl2to1[wiwName]) + print("change", wiwName, "to", change_lvl2to1[wiwName]) return change_lvl2to1[wiwName] else: return sdName + + def cluster(df, year, n_clst, method, cluster_by, outdir, font_name, folder_name): """구역별 그룹을 만듭니다. df: 데이터프레임 @@ -134,8 +140,12 @@ def cluster(df, year, n_clst, method, cluster_by, outdir, font_name, folder_name # wiwName을 처리합니다 if level == "2level": - df['sdName'] = df[['sdName', 'wiwName']].apply(lambda x: local_to_metro_list(*x), axis=1) - df['wiwName'] = df[['sdName', 'wiwName']].apply(lambda x: change_local_name(*x), axis=1) + df["sdName"] = df[["sdName", "wiwName"]].apply( + lambda x: local_to_metro_list(*x), axis=1 + ) + df["wiwName"] = df[["sdName", "wiwName"]].apply( + lambda x: change_local_name(*x), axis=1 + ) # 데이터프레임에서 시도별로 묶은 후 나이 열만 가져옵니다. df_age = pd.DataFrame(columns=["area", "age"]) for area, df_clst in df.groupby(cluster_by): @@ -181,25 +191,29 @@ def cluster(df, year, n_clst, method, cluster_by, outdir, font_name, folder_name "ageGroup": age_group, } for age, count, age_group in zip( - range(df_clst['age'].min(), df_clst['age'].max() + 1), - df_clst.groupby('age').size(), - df_clst.groupby('age')['cluster_label'].first() + range(df_clst["age"].min(), df_clst["age"].max() + 1), + df_clst.groupby("age").size(), + df_clst.groupby("age")["cluster_label"].first(), ) ] metroname = df_clst["sdName"].iloc[0] metroId = metroIds.find_one({"sdName": metroname})["metroId"] if level == "1level": - print ("sdName is ", metroname) + print("sdName is ", metroname) main_collection.insert_one({"metroId": metroId, "data": data}) elif metroname in change_lvl2to1.values(): - print ("sdName is ", metroname) + print("sdName is ", metroname) lvl1_collection = db[folder_name + "_" + year + "_1level_" + method] lvl1_collection.insert_one({"metroId": metroId, "data": data}) else: localname = df_clst["wiwName"].iloc[0] - print ("sdName is ", metroname, "wiwName is", localname) - localId = localIds.find_one({"sdName": metroname, "wiwName": localname})["localId"] - main_collection.insert_one({"metroId": metroId, "localId": localId, "data": data}) + print("sdName is ", metroname, "wiwName is", localname) + localId = localIds.find_one({"sdName": metroname, "wiwName": localname})[ + "localId" + ] + main_collection.insert_one( + {"metroId": metroId, "localId": localId, "data": data} + ) # # 그리기 # package = ( diff --git a/configurations/secrets.py b/configurations/secrets.py index 3ea0986..5a72cd4 100644 --- a/configurations/secrets.py +++ b/configurations/secrets.py @@ -28,6 +28,7 @@ class OpenDataPortalSecrets: service_key = str(os.getenv("OPEN_DATA_SERICE_KEY") or "") + class EmailSecrets: """ 스크랩 결과 이메일 전송에 필요한 키를 정의합니다. diff --git a/scrap/local_councils/daejeon.py b/scrap/local_councils/daejeon.py index 6cf11db..f92cfab 100644 --- a/scrap/local_councils/daejeon.py +++ b/scrap/local_councils/daejeon.py @@ -112,4 +112,4 @@ def scrap_69(url, cid) -> ScrapResult: party = party_info.find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, jdName=party)) - return ret_local_councilors(cid, councilors) \ No newline at end of file + return ret_local_councilors(cid, councilors) diff --git a/scrap/utils/__init__.py b/scrap/utils/__init__.py index de2d4d7..616f9a7 100644 --- a/scrap/utils/__init__.py +++ b/scrap/utils/__init__.py @@ -1,4 +1,4 @@ """ 크롤링을 실행, 진행결과 알림, 크롤링결과를 mongoDB로 저장하는 기능을 담당하는 모듈입니다. -""" \ No newline at end of file +""" diff --git a/scrap/utils/email_result.py b/scrap/utils/email_result.py index bea063f..cc39b65 100644 --- a/scrap/utils/email_result.py +++ b/scrap/utils/email_result.py @@ -5,21 +5,22 @@ smtp_server = "smtp.gmail.com" smtp_port = 587 + def email_result(emessages): - # 이메일 내용 설정 - subject = "스크래핑 결과" - # 메일 구성 - msg = MIMEText(emessages) - msg['Subject'] = subject - msg['From'] = EmailSecrets.sender_email - msg['To'] = EmailSecrets.receiver_email + # 이메일 내용 설정 + subject = "스크래핑 결과" + # 메일 구성 + msg = MIMEText(emessages) + msg["Subject"] = subject + msg["From"] = EmailSecrets.sender_email + msg["To"] = EmailSecrets.receiver_email - # 이메일 전송 - try: - with smtplib.SMTP(smtp_server, smtp_port) as server: - server.starttls() - server.login(msg['From'], EmailSecrets.password) - server.sendmail(msg['From'], msg['To'], msg.as_string()) - print("이메일이 성공적으로 전송되었습니다.") - except Exception as e: - print(f"이메일 전송 중 오류 발생: {e}") + # 이메일 전송 + try: + with smtplib.SMTP(smtp_server, smtp_port) as server: + server.starttls() + server.login(msg["From"], EmailSecrets.password) + server.sendmail(msg["From"], msg["To"], msg.as_string()) + print("이메일이 성공적으로 전송되었습니다.") + except Exception as e: + print(f"이메일 전송 중 오류 발생: {e}") diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py index 43957f9..f3df8bf 100644 --- a/scrap/utils/spreadsheet.py +++ b/scrap/utils/spreadsheet.py @@ -71,16 +71,53 @@ def main() -> None: 0 ) # 원하는 워크시트 선택 (0은 첫 번째 워크시트입니다.) # TODO - 홈페이지 위 charset=euc-kr 등을 인식해 바로 가져오기. - euc_kr = [6, 13, 16, 31, 72, 88, 112, 134, 154, 157, 163, 165, 167, 176, 181, - 197, 202, 222] + euc_kr = [ + 6, + 13, + 16, + 31, + 72, + 88, + 112, + 134, + 154, + 157, + 163, + 165, + 167, + 176, + 181, + 197, + 202, + 222, + ] special_functions = ( list(range(1, 57)) + [62, 63, 64, 88, 97, 103, 107] + list(range(113, 127)) + [132, 134, 140, 142, 154, 155, 156, 157, 160, 161, 162, 163, 164, 165, 167] + list(range(177, 180)) - + [182, 183, 184, 186, 188, 189, 190, 191, 194, 195, 196, 198, 199, 201, 203, - 206, 208, 209, 210] + + [ + 182, + 183, + 184, + 186, + 188, + 189, + 190, + 191, + 194, + 195, + 196, + 198, + 199, + 201, + 203, + 206, + 208, + 209, + 210, + ] + list(range(212, 221)) + [222, 223, 224, 226] ) @@ -100,19 +137,25 @@ def main() -> None: N = 226 emessages: str = "" enumbers = [] + def add_error(n, msg): nonlocal emessages emsg: str = f"| {n:3} | 오류: {msg}" emessages += emsg enumbers.append(n) + for n in range(1, N + 1): if n in no_information + error_unsolved: emsg: str = ( - "지난번 확인 시, 정당 정보 등이 홈페이지에 없었습니다. \ + ( + "지난번 확인 시, 정당 정보 등이 홈페이지에 없었습니다. \ 다시 확인해보시겠어요?" - if n in no_information - else "함수 구현에 실패한 웹페이지입니다." - ) + " 링크: " + data[n - 1]["URL"] + if n in no_information + else "함수 구현에 실패한 웹페이지입니다." + ) + + " 링크: " + + data[n - 1]["URL"] + ) add_error(n, emsg) continue encoding = "euc-kr" if n in euc_kr else "utf-8" @@ -154,12 +197,15 @@ def add_error(n, msg): add_error(n, emsg) except Exception as e: add_error(n, "기타 오류 - " + str(e)) - emessages = f""" + emessages = ( + f""" 총 실행 횟수: {N} 에러: {enumbers}, 총 {len(enumbers)}회 그 중 '정보 없음' 횟수: {parse_error_times} 타임아웃 횟수: {timeouts} - """ + emessages + """ + + emessages + ) email_result(emessages)