From e22c6b925df909fff05a957d5f2ee178da292e9f Mon Sep 17 00:00:00 2001
From: Re-st
Date: Tue, 21 Nov 2023 15:44:13 +0900
Subject: [PATCH] =?UTF-8?q?[Scrap]=20=EA=B2=B0=EA=B3=BC=20=EC=9D=B4?=
 =?UTF-8?q?=EB=A9=94=EC=9D=BC=EB=A1=9C=20=EC=A0=84=EC=86=A1=EB=90=98?=
 =?UTF-8?q?=EB=8F=84=EB=A1=9D=20=EC=9E=91=EC=84=B1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this section is ignored by `git am`):

* scrap/utils/spreadsheet.py
  - add_error() now terminates every report line with "\n"; previously all
    error messages were concatenated into one line of the e-mail body.
  - When a "special" scraper function is missing, the loop now `continue`s
    after recording the error instead of falling through to the
    `"정보 없음" in result` check with a stale (or unbound) `result`.
  - Multi-line messages use implicit string concatenation instead of
    backslash continuations inside the literal, which leaked the source
    indentation into the e-mailed text (including inside a file path).
  - email_result is imported as scrap.utils.email_result, matching the other
    scrap.* / configurations.* imports.
* New files now end with a newline.
* Comments and docstrings on added lines are written in English; runtime
  strings are unchanged apart from the whitespace fix described above.
* The blob hashes on the `index` lines were not regenerated after these edits.
* Line breaks and indentation of context/removed lines were reconstructed
  from a whitespace-collapsed copy; verify against the tree before applying.

 configurations/secrets.py       |   9 +++
 scrap/local_councils/daejeon.py | 115 ++++++++++++++++++++++++++++++++
 scrap/utils/__init__.py         |   4 ++
 scrap/utils/email_result.py     |  25 +++++++
 scrap/utils/spreadsheet.py      |  97 ++++++++++----------------
 5 files changed, 192 insertions(+), 58 deletions(-)
 create mode 100644 scrap/local_councils/daejeon.py
 create mode 100644 scrap/utils/__init__.py
 create mode 100644 scrap/utils/email_result.py

diff --git a/configurations/secrets.py b/configurations/secrets.py
index 2bcdfc7..3ea0986 100644
--- a/configurations/secrets.py
+++ b/configurations/secrets.py
@@ -27,3 +27,12 @@ class OpenDataPortalSecrets:
     """
 
     service_key = str(os.getenv("OPEN_DATA_SERICE_KEY") or "")
+
+class EmailSecrets:
+    """
+    Defines the keys required to e-mail the scraping results.
+    """
+
+    sender_email = str(os.getenv("SCRAP_SENDER_EMAIL") or "")
+    receiver_email = str(os.getenv("SCRAP_RECEIVER_EMAIL") or "")
+    password = str(os.getenv("SCRAP_EMAIL_PASSWORD") or "")
diff --git a/scrap/local_councils/daejeon.py b/scrap/local_councils/daejeon.py
new file mode 100644
index 0000000..6cf11db
--- /dev/null
+++ b/scrap/local_councils/daejeon.py
@@ -0,0 +1,115 @@
+from scrap.local_councils import *
+
+
+def scrap_65(url, cid) -> ScrapResult:
+    """Daejeon Dong-gu."""
+    soup = get_soup(url, verify=False)
+    councilors: List[Councilor] = []
+
+    # Extract base_url so that the profile links can be scraped
+    parsed_url = urlparse(url)
+    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+    for profile in soup.find_all("dl", class_="profile"):
+        name_tag = profile.find("strong", class_="name")
+        name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
+        party = "정당 정보 없음"
+
+        # Fetch the "view profile" link
+        profile_link = profile.find("a", class_="start")
+        if profile_link:
+            data_uid = profile_link.get("data-uid")
+            if data_uid:
+                profile_url = base_url + f"/kr/member/profile_popup?uid={data_uid}"
+                profile_soup = get_soup(profile_url, verify=False)
+                party_info = profile_soup.find("strong", string="정 당")
+                if (
+                    party_info
+                    and (party_span := party_info.find_next("span")) is not None
+                ):
+                    party = party_span.text
+
+        councilors.append(Councilor(name=name, jdName=party))
+
+    return ret_local_councilors(cid, councilors)
+
+
+def scrap_66(url, cid) -> ScrapResult:
+    """Daejeon Jung-gu."""
+    soup = get_soup(url, verify=False)
+    councilors: List[Councilor] = []
+
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("div", class_="name")
+        name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
+
+        party = "정당 정보 없음"
+        party_info = profile.find("em", string="소속정당")
+        if party_info:
+            party = party_info.find_next("span").get_text(strip=True)
+        councilors.append(Councilor(name=name, jdName=party))
+
+    return ret_local_councilors(cid, councilors)
+
+
+def scrap_67(
+    url,
+    cid,
+) -> ScrapResult:
+    """Daejeon Seo-gu."""
+    soup = get_soup(url, verify=False)
+    councilors: List[Councilor] = []
+
+    for profile in soup.find_all("dl"):
+        name_tag = profile.find("dd", class_="name")
+        name = (
+            name_tag.get_text(strip=True).replace(" 의원", "") if name_tag else "이름 정보 없음"
+        )
+
+        party = "정당 정보 없음"
+        party_info = list(filter(lambda x: "정당" in str(x), profile.find_all("dd")))
+        if party_info:
+            party = party_info[0].get_text(strip=True).replace("정당: ", "")
+
+        councilors.append(Councilor(name=name, jdName=party))
+
+    return ret_local_councilors(cid, councilors)
+
+
+def scrap_68(url, cid) -> ScrapResult:
+    """Daejeon Yuseong-gu."""
+    soup = get_soup(url, verify=False)
+    councilors: List[Councilor] = []
+
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("em", class_="name")
+        # Strip the Hanja given in parentheses (e.g. 김영희(金英姬) -> 김영희)
+        name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음"
+
+        party = "정당 정보 없음"
+        regex_pattern = re.compile(r"정\s*당\s*:", re.IGNORECASE)  # Case-insensitive
+        party_info = profile.find("em", string=regex_pattern)
+        if party_info:
+            party = party_info.find_next("span").get_text(strip=True)
+        councilors.append(Councilor(name=name, jdName=party))
+
+    return ret_local_councilors(cid, councilors)
+
+
+def scrap_69(url, cid) -> ScrapResult:
+    """Daejeon Daedeok-gu."""
+    soup = get_soup(url, verify=False)
+    councilors: List[Councilor] = []
+
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("em", class_="name")
+        name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
+
+        party = "정당 정보 없음"
+        regex_pattern = re.compile(r"정\s*당\s*:", re.IGNORECASE)  # Case-insensitive
+        party_info = profile.find("em", string=regex_pattern)
+        if party_info:
+            party = party_info.find_next("span").get_text(strip=True)
+        councilors.append(Councilor(name=name, jdName=party))
+
+    return ret_local_councilors(cid, councilors)
diff --git a/scrap/utils/__init__.py b/scrap/utils/__init__.py
new file mode 100644
index 0000000..de2d4d7
--- /dev/null
+++ b/scrap/utils/__init__.py
@@ -0,0 +1,4 @@
+"""
+Package responsible for running the crawl, reporting its progress, and
+storing the crawl results in MongoDB.
+"""
diff --git a/scrap/utils/email_result.py b/scrap/utils/email_result.py
new file mode 100644
index 0000000..bea063f
--- /dev/null
+++ b/scrap/utils/email_result.py
@@ -0,0 +1,25 @@
+import smtplib
+from email.mime.text import MIMEText
+from configurations.secrets import EmailSecrets
+
+smtp_server = "smtp.gmail.com"
+smtp_port = 587
+
+def email_result(emessages):
+    """Send emessages as a plain-text e-mail; failures are printed, not raised."""
+    subject = "스크래핑 결과"
+    # Build the message
+    msg = MIMEText(emessages)
+    msg['Subject'] = subject
+    msg['From'] = EmailSecrets.sender_email
+    msg['To'] = EmailSecrets.receiver_email
+
+    # Send the e-mail
+    try:
+        with smtplib.SMTP(smtp_server, smtp_port) as server:
+            server.starttls()
+            server.login(msg['From'], EmailSecrets.password)
+            server.sendmail(msg['From'], msg['To'], msg.as_string())
+        print("이메일이 성공적으로 전송되었습니다.")
+    except Exception as e:
+        print(f"이메일 전송 중 오류 발생: {e}")
diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py
index 2fee572..43957f9 100644
--- a/scrap/utils/spreadsheet.py
+++ b/scrap/utils/spreadsheet.py
@@ -1,3 +1,6 @@
+"""
+Crawls each council using the functions defined in the local_councils folder.
+"""
 import os
 import sys
 import gspread
@@ -20,6 +23,7 @@ from scrap.local_councils.gyeongsang import *
 
 from scrap.local_councils import *
 from requests.exceptions import Timeout
+from scrap.utils.email_result import email_result
 
 # 구글로부터 권한을 요청할 어플리케이션 목록
 # 변경 시 token.json 삭제 후 재인증 필요
@@ -67,60 +71,22 @@ def main() -> None:
         0
     )  # 원하는 워크시트 선택 (0은 첫 번째 워크시트입니다.)
     # TODO - 홈페이지 위 charset=euc-kr 등을 인식해 바로 가져오기.
-    euc_kr = [
-        6,
-        13,
-        16,
-        31,
-        72,
-        88,
-        112,
-        134,
-        154,
-        157,
-        163,
-        165,
-        167,
-        176,
-        181,
-        197,
-        202,
-        222,
-    ]
+    euc_kr = [6, 13, 16, 31, 72, 88, 112, 134, 154, 157, 163, 165, 167, 176, 181,
+              197, 202, 222]
     special_functions = (
         list(range(1, 57))
         + [62, 63, 64, 88, 97, 103, 107]
         + list(range(113, 127))
         + [132, 134, 140, 142, 154, 155, 156, 157, 160, 161, 162, 163, 164, 165, 167]
         + list(range(177, 180))
-        + [
-            182,
-            183,
-            184,
-            186,
-            188,
-            189,
-            190,
-            191,
-            194,
-            195,
-            196,
-            198,
-            199,
-            201,
-            203,
-            206,
-            208,
-            209,
-            210,
-        ]
+        + [182, 183, 184, 186, 188, 189, 190, 191, 194, 195, 196, 198, 199, 201, 203,
+           206, 208, 209, 210]
         + list(range(212, 221))
         + [222, 223, 224, 226]
     )
     selenium_basic = [76, 78, 101, 169, 173, 177]
     no_information = [18, 29, 106, 111, 172, 181, 185, 187, 197, 200, 204, 207]
     error_unsolved = [170, 171]
-    errors = []
     f = open(JSON_PATH, "r")
     args = json.load(f)
     f.close()
@@ -132,16 +98,22 @@ def main() -> None:
     parse_error_times = 0
     timeouts = 0
     N = 226
-    for n in [189]:  # range(1, N + 1):
+    emessages: str = ""
+    enumbers = []
+    def add_error(n, msg):
+        nonlocal emessages
+        emsg: str = f"| {n:3} | 오류: {msg}\n"  # one report line per failure
+        emessages += emsg
+        enumbers.append(n)
+    for n in range(1, N + 1):
         if n in no_information + error_unsolved:
-            error_msg = (
+            emsg: str = (
                 "지난번 확인 시, 정당 정보 등이 홈페이지에 없었습니다. \
                 다시 확인해보시겠어요?"
                 if n in no_information
                 else "함수 구현에 실패한 웹페이지입니다."
-            )
-            print(f"| {n} | 오류: ", error_msg, " 링크 : ", data[n - 1]["URL"])
-            errors.append(n)
+            ) + " 링크: " + data[n - 1]["URL"]
+            add_error(n, emsg)
             continue
         encoding = "euc-kr" if n in euc_kr else "utf-8"
         council_url: str = ""
@@ -160,7 +132,13 @@ def main() -> None:
                         function_to_call(council_url, n, args=council_args).councilors
                     )
                 else:
-                    print("[API/spreadsheet] Error : No function found")
+                    emsg: str = (
+                        "특수 함수를 사용해서 스크랩한다고 명시되어 있는데 "
+                        "함수가 정의되어 있지 않네요. [scrap/utils/spreadsheet.py의 "
+                        "special_functions에 함수 번호를 빼고 다시 시도해 보시겠어요?]"
+                    )
+                    add_error(n, emsg)
+                    continue
             elif n in selenium_basic:
                 result = str(sel_scrap_basic(council_url, n, council_args).councilors)
             else:
@@ -168,20 +146,23 @@ def main() -> None:
                     scrap_basic(council_url, n, council_args, encoding).councilors
                 )
             if "정보 없음" in result:
-                print("정보 없음이 포함되어 있습니다.")
+                emsg = ("스크랩 결과에 '정보 없음'이 포함되어 있습니다. 일부 인명에 "
+                        "대해 스크랩이 실패했다는 뜻이에요. 함수나 인자를 점검해 주세요.")
                 parse_error_times += 1
-                errors.append(n)
-            print(f"| {n} | {result}")
+                add_error(n, emsg)
         except Timeout:
-            print(f"| {n} | 오류: Request to {council_url} timed out.")
+            emsg = f"{council_url}에 시도한 연결이 타임아웃됐어요."
             timeouts += 1
+            add_error(n, emsg)
         except Exception as e:
-            print(f"| {n} | 오류: {e}")
-            errors.append(n)
-            continue  # 에러가 발생하면 다음 반복으로 넘어감
-    print(
-        f"| 총 실행 횟수: {N} | 에러: {errors}, 총 {len(errors)}회 | 그 중 정보 없음 횟수: {parse_error_times} | 타임아웃 횟수: {timeouts} |"
-    )
+            add_error(n, "기타 오류 - " + str(e))
+    emessages = f"""
+    총 실행 횟수: {N}
+    에러: {enumbers}, 총 {len(enumbers)}회
+    그 중 '정보 없음' 횟수: {parse_error_times}
+    타임아웃 횟수: {timeouts}
+    """ + emessages
+    email_result(emessages)
 
 
 if __name__ == "__main__":