From 89bd06d3e0fe4e240ecc6afe5fdf06959964671c Mon Sep 17 00:00:00 2001 From: Re-st Date: Mon, 27 Nov 2023 13:29:24 +0000 Subject: [PATCH] Formatted with black --- scrap/local_councils/basic.py | 19 +++++++++++++---- scrap/local_councils/gyeongsang.py | 33 ++++++++++++++++++------------ scrap/local_councils/jeolla.py | 15 +++++++++++--- scrap/utils/runner.py | 6 +++++- scrap/utils/spreadsheet.py | 21 ++++++++++++------- 5 files changed, 66 insertions(+), 28 deletions(-) diff --git a/scrap/local_councils/basic.py b/scrap/local_councils/basic.py index f36a252..42d688b 100644 --- a/scrap/local_councils/basic.py +++ b/scrap/local_councils/basic.py @@ -151,7 +151,9 @@ def extract_party(string): return None -def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False): +def goto_profilesite( + profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False +): # 의원 프로필에서 프로필보기 링크를 가져옴 parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" @@ -224,7 +226,9 @@ def getpty(profile, element, class_, wrapper_element, wrapper_class_, wrapper_tx raise Exception("[basic.py] 정당 정보 파싱 불가") -def getpty_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False): +def getpty_easy( + profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False +): # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴 if wrapper_element is not None: profile = goto_profilesite( @@ -253,7 +257,9 @@ def sel_getpty_easy( return party -def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8", inner_euckr=False) -> ScrapResult: +def scrap_basic( + url, cid, args: ScrapBasicArgument, encoding="utf-8", inner_euckr=False +) -> ScrapResult: """의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :param cid: 의회 id @@ -293,7 +299,12 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8", inner_euck except Exception as e: try: party = getpty_easy( - profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url, inner_euckr + profile, + args.pty_wrapelt, + args.pty_wrapcls, + args.pty_wraptxt, + url, + inner_euckr, ) except Exception: raise RuntimeError("[basic.py] 의원 정당을 가져오는데 실패했습니다. 이유: " + str(e)) diff --git a/scrap/local_councils/gyeongsang.py b/scrap/local_councils/gyeongsang.py index e1290bc..73ccec7 100644 --- a/scrap/local_councils/gyeongsang.py +++ b/scrap/local_councils/gyeongsang.py @@ -13,6 +13,7 @@ party_keywords = getPartyList() party_keywords.append("무소속") + def scrap_186( url, cid, @@ -124,6 +125,7 @@ def scrap_191( return ret_local_councilors(cid, councilors) + def scrap_192( url, cid, @@ -142,10 +144,10 @@ def scrap_192( base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" profile_url = base_url + profile_link["href"] profile = get_soup(profile_url, verify=False, encoding="euc-kr") - party="" + party = "" for keyword in party_keywords: if keyword in profile.text: - party=keyword + party = keyword break councilors.append(Councilor(name=name, jdName=party)) @@ -225,11 +227,13 @@ def scrap_197( """경상북도 경산시""" soup = get_soup(url, verify=False, encoding="euc-kr") councilors: List[Councilor] = [] - for profile in soup.find_all('div', class_='memberL') + soup.find_all('div', class_='memberR'): - party = profile.find_previous('h4', class_='title').text.strip() - assert(party in party_keywords) - name = profile.find('dt').text.strip() - + for profile in soup.find_all("div", class_="memberL") + soup.find_all( + "div", class_="memberR" + ): + party = profile.find_previous("h4", class_="title").text.strip() + assert party in party_keywords + name = profile.find("dt").text.strip() + councilors.append(Councilor(name=name, jdName=party)) return ret_local_councilors(cid, councilors) @@ -323,15 +327,16 @@ def scrap_202( base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" profile_url = base_url + link profile = get_soup(profile_url, verify=False, encoding="euc-kr") - party="" + party = "" for keyword in party_keywords: if keyword in profile.text: - party=keyword + party = keyword break councilors.append(Councilor(name=name, jdName=party)) return ret_local_councilors(cid, councilors) + def scrap_203( url, cid, @@ -353,6 +358,7 @@ def scrap_203( return ret_local_councilors(cid, councilors) + def scrap_204( url, cid, @@ -369,19 +375,20 @@ def scrap_204( base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" profile_url = base_url + link profile = get_soup(profile_url, verify=False) - link = profile.find('a', text='의원소개', href=True) - profile_url = base_url + link['href'] + link = profile.find("a", text="의원소개", href=True) + profile_url = base_url + link["href"] profile = get_soup(profile_url, verify=False) - party="" + party = "" for keyword in party_keywords: if keyword in profile.text: - party=keyword + party = keyword break councilors.append(Councilor(name=name, jdName=party)) return ret_local_councilors(cid, councilors) + def scrap_206( url, cid, diff --git a/scrap/local_councils/jeolla.py b/scrap/local_councils/jeolla.py index 64e02d4..cc7dee3 100644 --- a/scrap/local_councils/jeolla.py +++ b/scrap/local_councils/jeolla.py @@ -14,6 +14,7 @@ party_keywords = getPartyList() party_keywords.append("무소속") + def scrap_154( url, cid, @@ -337,6 +338,7 @@ def scrap_167( # return ret_local_councilors(cid, councilors) + def scrap_175( url, cid, @@ -346,7 +348,9 @@ def scrap_175( browser = get_selenium(url) councilors: list[Councilor] = [] for profileList in browser.find_elements(By.CSS_SELECTOR, "ul[id='councilList']"): - for profile in profileList.find_elements(By.CSS_SELECTOR, "ul[class='name_51']"): + for profile in profileList.find_elements( + By.CSS_SELECTOR, "ul[class='name_51']" + ): name_tag = profile.find_element(By.TAG_NAME, "li") name = name_tag.text.strip() if name_tag else "이름 정보 없음" @@ -362,6 +366,7 @@ def scrap_175( return ret_local_councilors(cid, councilors) + def scrap_177( url, cid, @@ -393,8 +398,12 @@ def scrap_178( """전라남도 완도군""" browser = get_selenium(url) councilors: list[Councilor] = [] - for profileList in browser.find_elements(By.CSS_SELECTOR, "div[class='congressperson_list']"): - for profile in profileList.find_elements(By.CSS_SELECTOR, "div[class='col-lg-6']"): + for profileList in browser.find_elements( + By.CSS_SELECTOR, "div[class='congressperson_list']" + ): + for profile in profileList.find_elements( + By.CSS_SELECTOR, "div[class='col-lg-6']" + ): name_tag = profile.find_element(By.TAG_NAME, "strong") name = name_tag.text.strip() if name_tag else "이름 정보 없음" profile_link = sel_find(profile, "a", class_="icon_btn") diff --git a/scrap/utils/runner.py b/scrap/utils/runner.py index a21761a..da63e9e 100644 --- a/scrap/utils/runner.py +++ b/scrap/utils/runner.py @@ -106,8 +106,10 @@ def get_records_from_data_source(self, data_source: str): # Helper Functions def is_euc_kr(self, n: int) -> bool: return n in self.runner_args["euc_kr"] + def inner_euckr(self, n: int) -> bool: return n in self.runner_args["inner_euckr"] + def is_special_function(self, n: int) -> bool: return n in self.runner_args["special_functions"] @@ -136,7 +138,9 @@ def run_single(self, cid: int) -> ScrapResult: if self.is_selenium_basic(cid): result = sel_scrap_basic(council_url, cid, council_args) else: - result = scrap_basic(council_url, cid, council_args, encoding, inner_euckr) + result = scrap_basic( + council_url, cid, council_args, encoding, inner_euckr + ) return result diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py index d93ba62..194bc91 100644 --- a/scrap/utils/spreadsheet.py +++ b/scrap/utils/spreadsheet.py @@ -116,6 +116,7 @@ def scrap_all_metro_councils() -> None: # ) # email_result(emessages) + def scrap_all_local_councils() -> None: # TODO - 홈페이지 위 charset=euc-kr 등을 인식해 바로 가져오기. euc_kr = [ @@ -138,7 +139,7 @@ def scrap_all_local_councils() -> None: 202, 222, ] - inner_euckr=[200] + inner_euckr = [200] special_functions = ( list(range(1, 57)) + [62, 63, 64, 88, 97, 103, 107] @@ -153,14 +154,18 @@ def scrap_all_local_councils() -> None: 188, 189, 190, - 191,192, + 191, + 192, 194, 195, - 196,197, + 196, + 197, 198, 199, - 201,202, - 203,204, + 201, + 202, + 203, + 204, 206, 208, 209, @@ -225,7 +230,9 @@ def scrap_all_local_councils() -> None: result = str(sel_scrap_basic(council_url, n, council_args).councilors) else: result = str( - scrap_basic(council_url, n, council_args, encoding, inner_euckr).councilors + scrap_basic( + council_url, n, council_args, encoding, inner_euckr + ).councilors ) if "정보 없음" in result: emsg = "스크랩 결과에 '정보 없음'이 포함되어 있습니다. 일부 인명에\ @@ -241,7 +248,7 @@ def scrap_all_local_councils() -> None: except Exception as e: print(e) print(result) - # add_error(n, "기타 오류 - " + str(e)) + # add_error(n, "기타 오류 - " + str(e)) # emessages = ( # f""" # 총 실행 횟수: {N}