diff --git a/scrap/local_councils/__init__.py b/scrap/local_councils/__init__.py
index 6995f2f..efe61ff 100644
--- a/scrap/local_councils/__init__.py
+++ b/scrap/local_councils/__init__.py
@@ -10,7 +10,7 @@
 from typing import List
 
 from db.types import CouncilType, Councilor
-from scrap.utils.types import ScrapResult, ScrapBasicArgument
+from scrap.utils.types import ScrapResult, ScrapBasicArgument, ArgsType
 from scrap.utils.requests import get_soup
 from scrap.utils.utils import getPartyList
 
diff --git a/scrap/local_councils/basic.py b/scrap/local_councils/basic.py
index 97ae9aa..79432dd 100644
--- a/scrap/local_councils/basic.py
+++ b/scrap/local_councils/basic.py
@@ -1,38 +1,61 @@
+from scrap.utils.requests import get_selenium, By
+from selenium.common.exceptions import NoSuchElementException
 from scrap.local_councils import *
 import re
 import requests
 import copy
-from scrap.utils.utils import getPartyList
-from scrap.utils.types import ScrapBasicArgument
+import traceback
 
 regex_pattern = re.compile(r"정\s*\S*\s*당", re.IGNORECASE)  # Case-insensitive
 party_keywords = getPartyList()
 party_keywords.append("무소속")
 
-def find(soup, element, class_):
+def find(soup, element, class_=None):
     if class_ is None:
         return soup.find(element)
     else:
         return soup.find(element, class_)
 
-def find_all(soup, element, class_):
+def findall(soup, element, class_=None):
     if class_ is None:
         return soup.find_all(element)
     else:
         return soup.find_all(element, class_)
 
+def sel_find(driver, element, class_=None):
+    if class_ is None:
+        return driver.find_element(By.TAG_NAME, element)
+    else:
+        return driver.find_element(By.CSS_SELECTOR, f"{element}.{class_}")
+
+def sel_findall(driver, element, class_=None):
+    if class_ is None:
+        return driver.find_elements(By.TAG_NAME, element)
+    else:
+        return driver.find_elements(By.CSS_SELECTOR, f"{element}.{class_}")
 
-def get_profiles(soup, element, class_, memberlistelement, memberlistclass_):
+def getprofiles(soup, element, class_, memberlistelement, memberlistclass_):
     # 의원 목록 사이트에서 의원 프로필을 가져옴
     if memberlistelement is not None:
         try:
-            soup = find_all(soup, memberlistelement, class_=memberlistclass_)[0]
+            soup = findall(soup, memberlistelement, class_=memberlistclass_)[0]
         except Exception:
             raise RuntimeError("[basic.py] 의원 목록 사이트에서 의원 프로필을 가져오는데 실패했습니다.")
-    return find_all(soup, element, class_)
+    return findall(soup, element, class_)
 
+def sel_getprofiles(driver, element, class_, memberlistelement, memberlistclass_):
+    # 의원 목록 사이트에서 의원 프로필을 가져옴
+    if memberlistelement is not None:
+        try:
+            member_list = sel_findall(driver, memberlistelement, memberlistclass_)[0]
+        except Exception:
+            raise RuntimeError("[basic.py] 의원 목록 사이트에서 의원 프로필을 가져오는데 실패했습니다.")
+    else:
+        member_list = driver
+
+    return sel_findall(member_list, element, class_)
 
 def getDataFromAPI(url_format, data_uid, name_id, party_id) -> Councilor:
     # API로부터 의원 정보를 가져옴
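# --- Reviewer sketch, not part of the patch: how sel_find/sel_findall map an
# --- (element, class_) pair onto Selenium locators. Assumes single-word class
# --- names; multi-class strings like "even-grid gap3pct" would need extra joining.
def css_selector(element, class_=None):
    # class_ is None -> By.TAG_NAME lookup; otherwise a "tag.class" CSS selector
    return element if class_ is None else f"{element}.{class_}"

assert css_selector("dl") == "dl"
assert css_selector("div", "profile") == "div.profile"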
@@ -44,21 +67,22 @@ def getDataFromAPI(url_format, data_uid, name_id, party_id) -> Councilor:
     )
 
 
-def get_name(profile, element, class_, wrapper_element, wrapper_class_):
+def getname(profile, element, class_, wrapper_element, wrapper_class_):
     # 의원 프로필에서 의원 이름을 가져옴
     if wrapper_element is not None:
-        profile = find_all(profile, wrapper_element, class_=wrapper_class_)[0]
+        profile = findall(profile, wrapper_element, class_=wrapper_class_)[0]
     name_tag = find(profile, element, class_)
     if name_tag.find("span"):
         name_tag = copy.copy(name_tag)
-        # span 태그 안의 것들을 다 지움
+        # 1. span 태그 안의 것들을 다 지움
         for span in name_tag.find_all("span"):
             span.decompose()
-    for a_tag in name_tag.find_all("a"):  # 인천 서구 등. 안에 '개인홈페이지' 링크가 들음.
+    # 2. a 태그 안의 것들을 다 지움 : 예. 인천 서구, 안에 '개인홈페이지' 링크가 들음.
+    for a_tag in name_tag.find_all("a"):
         a_tag.extract()
     name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
-
-    # name은 길고 그 중 strong태그 안에 이름이 있는 경우. 은평구, 수원시 등.
+    # 3. strong 태그 안에 이름이 있는 경우 : name이 긴지 체크한 뒤 strong 태그 안을 받아옴.
+    # 은평구, 수원시 등.
     if name_tag.strong is not None:
         name = name_tag.strong.get_text(strip=True) if name_tag.strong else "이름 정보 없음"
     name = name.split("(")[0].split(":")[-1].strip()  # 이름 뒷 한자이름, 앞 '이 름:' 제거
@@ -79,6 +103,42 @@ def get_name(profile, element, class_, wrapper_element, wrapper_class_):
             name = maybe_name
     return name
 
+def sel_getname(profile, element, class_, wrapper_element, wrapper_class_):
+    # 의원 프로필에서 의원 이름을 가져옴
+    if wrapper_element is not None:
+        profile = sel_find(profile, wrapper_element, class_=wrapper_class_)
+    name_tag = sel_find(profile, element, class_)
+    # print(name_tag.text)
+    name = name_tag.text.strip() if name_tag else "이름 정보 없음"
+    # strong 태그 안에 이름이 있는 경우 : name이 긴지 체크한 뒤 strong 태그 안을 받아옴.
+    # 은평구, 수원시 등.
+    try:
+        strong_element = sel_find(name_tag, "strong")
+        if strong_element is not None:
+            name = strong_element.text.strip()
+    except NoSuchElementException:
+        pass
+    # print(name+"\n")
+    if name == "":
+        return "231114"
+    name = name.split("(")[0].split(":")[-1].strip()  # 이름 뒷 한자이름, 앞 '이 름:' 제거
+    # TODO : 만약 이름이 우연히 아래 단어를 포함하는 경우를 생각해볼만 함.
+    if len(name) > 3:
+        # 수식어가 이름 앞이나 뒤에 붙어있는 경우
+        for keyword in ["부의장", "의원", "의장"]:  # 119, 강서구 등
+            if keyword in name:
+                name = name.replace(keyword, "").strip()
+        for keyword in party_keywords:
+            if keyword in name:  # 인천 서구 등
+                name = name.replace(keyword, "").strip()
+                break
+        print(name, "is name\n")
+        maybe_name = name.split()[0]  # 이름 뒤에 직책이 따라오는 경우
+        if len(maybe_name) == 1:  # 외자 이름이 띄어쓰기 때문에 분리된 경우
+            name = "".join(name.split()[0:2])
+        else:
+            name = maybe_name
+    return name
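# --- Reviewer sketch, not part of the patch: the name-cleanup steps shared by
# --- getname and sel_getname, condensed into one function. Sample inputs invented.
def clean_name(raw):
    name = raw.split("(")[0].split(":")[-1].strip()  # 한자 꼬리와 '이 름:' 머리 제거
    if len(name) > 3:  # 수식어가 붙은 긴 문자열만 키워드를 벗겨냄
        for keyword in ["부의장", "의원", "의장"]:
            name = name.replace(keyword, "").strip()
        parts = name.split()
        # 외자 이름이 띄어쓰기로 분리된 경우 다시 붙임
        name = "".join(parts[0:2]) if len(parts[0]) == 1 else parts[0]
    return name

assert clean_name("이 름: 홍길동(洪吉童)") == "홍길동"
assert clean_name("김철수 의원") == "김철수"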
 
 def extract_party(string):
     for keyword in party_keywords:
@@ -86,7 +146,6 @@ def extract_party(string):
             return keyword
     return None
 
-
 def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
     # 의원 프로필에서 프로필보기 링크를 가져옴
     parsed_url = urlparse(url)
@@ -94,7 +153,7 @@ def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url)
     # 프로필보기 링크 가져오기
     profile_link = find(profile, wrapper_element, class_=wrapper_class_)
     if wrapper_txt is not None:
-        profile_links = find_all(profile, "a", class_=wrapper_class_)
+        profile_links = findall(profile, "a", class_=wrapper_class_)
         profile_link = [link for link in profile_links if link.text == wrapper_txt][0]
     if profile_link is None:
         raise RuntimeError("[basic.py] 의원 프로필에서 프로필보기 링크를 가져오는데 실패했습니다.")
@@ -107,8 +166,29 @@ def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url)
         raise RuntimeError("[basic.py] '//'가 있진 않나요?", " url: ", profile_url)
     return profile
 
+def sel_goto_profilesite(profile, wrapper_element, wrapper_class_=None, wrapper_txt=None, url=None):
+    # 의원 프로필에서 프로필보기 링크를 가져옴
+    parsed_url = urlparse(url)
+    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+    # 프로필보기 링크 가져오기
+    if wrapper_txt is None:
+        profile_link = sel_find(profile, wrapper_element, class_=wrapper_class_)
+    else:
+        profile_links = sel_findall(profile, wrapper_element, class_=wrapper_class_)
+        profile_link = [link for link in profile_links if link.text == wrapper_txt][0]
+    if profile_link is None:
+        raise RuntimeError("[basic.py] 의원 프로필에서 프로필보기 링크를 가져오는데 실패했습니다.")
+    # print(profile_link.get_attribute("href"))
+
+    # print("clicked")
+    # # 프로필 사이트 로딩을 기다림
+    # try:
+    #     WebDriverWait(profile, 10).until(EC.url_contains(profile_link.get_attribute("href")))  # 10초 동안 기다림
+    # except Exception:
+    #     raise RuntimeError("[basic.py] 프로필 사이트로 이동하는데 실패했습니다.")
+    return get_selenium(profile_link.get_attribute("href"))
 
-def get_party(
+def getpty(
     profile, element, class_, wrapper_element, wrapper_class_, wrapper_txt, url
 ):
     # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴
@@ -118,7 +198,7 @@ def get_party(
     )
     party_pulp_list = list(
         filter(
-            lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_)
+            lambda x: regex_pattern.search(str(x)), findall(profile, element, class_)
         )
     )
     if party_pulp_list == []:
@@ -133,8 +213,7 @@ def get_party(
     else:
         return "[basic.py] 정당 정보 파싱 불가"
 
-
-def get_party_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
+def getpty_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
     # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴
     if wrapper_element is not None:
         profile = goto_profilesite(
@@ -144,18 +223,32 @@ def get_party_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
         )
     party = extract_party(profile.text)
     assert party is not None
     return party
 
+def sel_getpty_easy(profile, wrapper_element, wrapper_class_=None, wrapper_txt=None, url=None):
+    # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴
+    if wrapper_element is not None:
+        page_content = sel_goto_profilesite(
+            profile, wrapper_element, wrapper_class_, wrapper_txt, url
+        ).page_source
+        for keyword in party_keywords:
+            if keyword in page_content:
+                return keyword
+        raise RuntimeError("[basic.py/sel_getpty_easy] Party not found")
+    else:
+        party = extract_party(profile.text)
+        assert party is not None
+        return party
 
 def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapResult:
     """의원 상세약력 스크랩
     :param url: 의원 목록 사이트 url
-    :param n: 의회 id
+    :param cid: 의회 id
     :param encoding: 받아온 soup 인코딩
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
     soup = get_soup(url, verify=False, encoding=encoding)
     councilors: list[Councilor] = []
 
-    profiles = get_profiles(
+    profiles = getprofiles(
         soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls
     )
     print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.")  # 디버깅용.
@@ -163,7 +256,7 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapRe
     for profile in profiles:
         name = party = ""
         try:
-            name = get_name(
+            name = getname(
                 profile,
                 args.name_elt,
                 args.name_cls,
                 args.name_wrapelt,
                 args.name_wrapcls,
             )
         except Exception as e:
             raise RuntimeError("[basic.py] 의원 이름을 가져오는데 실패했습니다. 이유 : " + str(e))
이유 : " + str(e)) try: - party = get_party( + party = getpty( profile, args.pty_elt, args.pty_cls, @@ -184,7 +277,7 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapRe ) except Exception as e: try: - party = get_party_easy( + party = getpty_easy( profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url ) except Exception: @@ -193,6 +286,46 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapRe return ret_local_councilors(cid, councilors) +def sel_scrap_basic(url, cid, args: ScrapBasicArgument) -> ScrapResult: + """의원 상세약력 스크랩 + :param url: 의원 목록 사이트 url + :param cid: 의회 id + :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 + """ + browser = get_selenium(url) + councilors: list[Councilor] = [] + + profiles = sel_getprofiles( + browser, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls + ) + print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. + + for profile in profiles: + name = party = "" + try: + name = sel_getname( + profile, + args.name_elt, + args.name_cls, + args.name_wrapelt, + args.name_wrapcls, + ) + except Exception as e: + traceback.print_exc() + raise RuntimeError("[basic.py/selenium] 의원 이름을 가져오는데 실패했습니다. 이유 : " + str(e)) + if name =="231114": + continue + try: + party = sel_getpty_easy( + profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url + ) + except Exception as e: + traceback.print_exc() + raise RuntimeError("[basic.py/selenium] 의원 정당을 가져오는데 실패했습니다. 이유: " + str(e)) + + councilors.append(Councilor(name, party)) + + return ret_local_councilors(cid, councilors) if __name__ == "__main__": args3 = ScrapBasicArgument( diff --git a/scrap/local_councils/busan.py b/scrap/local_councils/busan.py index 69fe9f1..f56a25a 100644 --- a/scrap/local_councils/busan.py +++ b/scrap/local_councils/busan.py @@ -2,15 +2,14 @@ from scrap.utils.requests import get_selenium, By from scrap.local_councils import * -from scrap.local_councils.basic import ret_local_councilors +from scrap.local_councils.basic import find, findall - -def scrap_26(url, cid) -> ScrapResult: +def scrap_26(url, cid, args: ArgsType = None) -> ScrapResult: """부산 중구""" soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find("div", class_="bbs_blog council").find_all("dl"): + for profile in findall(find(soup, "div", class_="bbs_blog council"), "dl"): name_tag = profile.find_next("dt") name = ( name_tag.get_text(strip=True).split()[-1].strip() @@ -28,7 +27,7 @@ def scrap_26(url, cid) -> ScrapResult: return ret_local_councilors(cid, councilors) -def scrap_27(url, cid) -> ScrapResult: +def scrap_27(url, cid, args: ArgsType = None) -> ScrapResult: """부산 서구""" soup = get_soup(url, verify=False) councilors: list[Councilor] = [] @@ -37,7 +36,7 @@ def scrap_27(url, cid) -> ScrapResult: parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in soup.find_all("div", class_="intro"): + for profile in findall(soup, "div", class_="intro"): name_tag = profile.find_next("span").find_next("span") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" party = "정당 정보 없음" @@ -54,15 +53,15 @@ def scrap_27(url, cid) -> ScrapResult: councilors.append(Councilor(name=name, jdName=party)) - ret_local_councilors(cid, councilors) + return ret_local_councilors(cid, councilors) -def scrap_28(url, cid) -> ScrapResult: +def scrap_28(url, cid, args: ArgsType = None) -> ScrapResult: """부산 동구""" soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for 
diff --git a/scrap/local_councils/busan.py b/scrap/local_councils/busan.py
index 69fe9f1..f56a25a 100644
--- a/scrap/local_councils/busan.py
+++ b/scrap/local_councils/busan.py
@@ -2,15 +2,14 @@
 from scrap.utils.requests import get_selenium, By
 from scrap.local_councils import *
-from scrap.local_councils.basic import ret_local_councilors
+from scrap.local_councils.basic import find, findall
 
-
-def scrap_26(url, cid) -> ScrapResult:
+def scrap_26(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 중구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find("div", class_="bbs_blog council").find_all("dl"):
+    for profile in findall(find(soup, "div", class_="bbs_blog council"), "dl"):
         name_tag = profile.find_next("dt")
         name = (
             name_tag.get_text(strip=True).split()[-1].strip()
@@ -28,7 +27,7 @@ def scrap_26(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_27(url, cid) -> ScrapResult:
+def scrap_27(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 서구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -37,7 +36,7 @@ def scrap_27(url, cid) -> ScrapResult:
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for profile in soup.find_all("div", class_="intro"):
+    for profile in findall(soup, "div", class_="intro"):
         name_tag = profile.find_next("span").find_next("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
         party = "정당 정보 없음"
@@ -54,15 +53,15 @@ def scrap_27(url, cid) -> ScrapResult:
 
         councilors.append(Councilor(name=name, jdName=party))
 
-    ret_local_councilors(cid, councilors)
+    return ret_local_councilors(cid, councilors)
 
 
-def scrap_28(url, cid) -> ScrapResult:
+def scrap_28(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 동구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all("div", class_="council_box"):
+    for profile in findall(soup, "div", class_="council_box"):
         name_tag = profile.find_next("span", class_="n2")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
@@ -73,15 +72,15 @@ def scrap_28(url, cid) -> ScrapResult:
 
         councilors.append(Councilor(name=name, jdName=party))
 
-    ret_local_councilors(cid, councilors)
+    return ret_local_councilors(cid, councilors)
 
 
-def scrap_29(url, cid) -> ScrapResult:
+def scrap_29(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 영도구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all("div", class_="even-grid gap3pct panel1 p01205bg"):
+    for profile in findall(soup, "div", class_="even-grid gap3pct panel1 p01205bg"):
         name_tag = profile.find_next("strong", class_="h1 title")
         name = (
             name_tag.get_text(strip=True).split(" ")[0].strip()
@@ -97,12 +96,12 @@ def scrap_29(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_30(url, cid) -> ScrapResult:
+def scrap_30(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 부산진구"""
     soup = get_soup(url, verify=False).find("ul", class_="mlist")
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all("dl"):
+    for profile in findall(soup, "dl"):
         name_tag = profile.find("dd", class_="name")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
@@ -118,12 +117,12 @@ def scrap_30(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_31(url, cid) -> ScrapResult:
+def scrap_31(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 동래구"""
     soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
 
-    for name_tag in soup.find_all("li", class_="name"):
+    for name_tag in findall(soup, "li", class_="name"):
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
         party = "정당 정보 없음"
@@ -136,12 +135,12 @@ def scrap_31(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_32(url, cid) -> ScrapResult:
+def scrap_32(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 남구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all("dl", class_="profile"):
+    for profile in findall(soup, "dl", class_="profile"):
         name_tag = profile.find("strong")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
@@ -160,12 +159,12 @@ def scrap_32(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_33(url, cid) -> ScrapResult:
+def scrap_33(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 북구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all("dl", class_="info"):
+    for profile in findall(soup, "dl", class_="info"):
         name_tag = profile.find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
@@ -179,7 +178,7 @@ def scrap_33(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_34(url, cid) -> ScrapResult:
+def scrap_34(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 해운대구"""
     soup = get_soup(url, verify=False).find("div", class_="initial_list")
     councilors: list[Councilor] = []
@@ -188,30 +187,31 @@ def scrap_34(url, cid) -> ScrapResult:
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for name_tag in soup.find_all("dd"):
+    for name_tag in findall(soup, "dd"):
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
         # 프로필보기 링크 가져오기
         profile_link = name_tag.find("a")
name_tag.find("a") - if profile_link: - profile_url = base_url + profile_link["href"] - profile_soup = get_soup(profile_url, verify=False) + assert profile_link is not None + profile_url = base_url + profile_link["href"] + profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find("span", string="소속정당") - if party_info and (party_span := party_info.parent) is not None: - party = party_span.text[4:].strip() + party_info = profile_soup.find("span", string="소속정당") + party = "" + if party_info and (party_span := party_info.parent) is not None: + party = party_span.text[4:].strip() councilors.append(Councilor(name=name, jdName=party)) return ret_local_councilors(cid, councilors) -def scrap_35(url, cid) -> ScrapResult: +def scrap_35(url, cid, args: ArgsType = None) -> ScrapResult: """부산 기장군""" soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] - for profile in soup.find_all("ul", class_="wulli bul02"): + for profile in findall(soup, "ul", class_="wulli bul02"): li_tags = profile.find_all("li") name_tag = li_tags[0] @@ -227,12 +227,12 @@ def scrap_35(url, cid) -> ScrapResult: return ret_local_councilors(cid, councilors) -def scrap_36(url, cid) -> ScrapResult: +def scrap_36(url, cid, args: ArgsType = None) -> ScrapResult: """부산 사하구""" soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for district_tag in soup.find_all("div", class_="list_member"): + for district_tag in findall(soup, "div", class_="list_member"): for name_tag in district_tag.find_all("h4", class_="name"): name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" @@ -246,12 +246,12 @@ def scrap_36(url, cid) -> ScrapResult: return ret_local_councilors(cid, councilors) -def scrap_37(url, cid) -> ScrapResult: +def scrap_37(url, cid, args: ArgsType = None) -> ScrapResult: """부산 금정구""" soup = get_soup(url, verify=False).find("div", class_="council_list") councilors: list[Councilor] = [] - for profile in soup.find_all("a"): + for profile in findall(soup, "a"): name_tag = profile.find("span", class_="tit").find("span") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" @@ -259,6 +259,7 @@ def scrap_37(url, cid) -> ScrapResult: profile_soup = get_soup(profile_url, verify=False) party_info = profile_soup.find("span", class_="name", string="정당") + party = "" if party_info and (party_span := party_info.parent) is not None: party = party_span.text[2:].strip() @@ -267,12 +268,12 @@ def scrap_37(url, cid) -> ScrapResult: return ret_local_councilors(cid, councilors) -def scrap_38(url, cid) -> ScrapResult: +def scrap_38(url, cid, args: ArgsType = None) -> ScrapResult: """부산 강서구""" soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile_img in soup.find_all("button", class_="btn_close"): + for profile_img in findall(soup, "button", class_="btn_close"): profile = profile_img.find_next("dl") name_tag = profile.find("dd", class_="name") @@ -290,7 +291,7 @@ def scrap_38(url, cid) -> ScrapResult: return ret_local_councilors(cid, councilors) -def scrap_39(url, cid) -> ScrapResult: +def scrap_39(url, cid, args: ArgsType = None) -> ScrapResult: """부산 연제구""" councilors: list[Councilor] = [] @@ -322,12 +323,12 @@ def scrap_39(url, cid) -> ScrapResult: return ret_local_councilors(cid, councilors) -def scrap_40(url, cid) -> ScrapResult: +def scrap_40(url, cid, args: ArgsType = None) -> ScrapResult: """부산 수영구""" soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_="mem_info"): + 
+    for profile in findall(soup, "div", class_="mem_info"):
         name_tag = profile.find("span", class_="name").find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
@@ -341,12 +342,12 @@ def scrap_40(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_41(url, cid) -> ScrapResult:
+def scrap_41(url, cid, args: ArgsType = None) -> ScrapResult:
     """부산 사상구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for district in soup.find_all("ul", class_="council_list"):
+    for district in findall(soup, "ul", class_="council_list"):
         for profile in district.find_all("li"):
             name_tag = profile.find("span", class_="tit")
             name = (
@@ -362,8 +363,4 @@ def scrap_41(url, cid) -> ScrapResult:
 
             councilors.append(Councilor(name=name, jdName=party))
 
-    return ret_local_councilors(cid, councilors)
-
-
-if __name__ == "__main__":
-    print(scrap_39())
+    return ret_local_councilors(cid, councilors)
\ No newline at end of file
diff --git a/scrap/local_councils/daegu.py b/scrap/local_councils/daegu.py
index 59bac26..dcfc84e 100644
--- a/scrap/local_councils/daegu.py
+++ b/scrap/local_councils/daegu.py
@@ -1,7 +1,7 @@
 from scrap.local_councils import *
 
 
-def scrap_42(url, cid) -> ScrapResult:
+def scrap_42(url, cid, args: ArgsType = None) -> ScrapResult:
     """대구 중구"""
     soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
@@ -22,7 +22,7 @@ def scrap_42(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_43(url, cid) -> ScrapResult:
+def scrap_43(url, cid, args: ArgsType = None) -> ScrapResult:
     """대구 동구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -53,7 +53,7 @@ def scrap_43(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_44(url, cid) -> ScrapResult:
+def scrap_44(url, cid, args: ArgsType = None) -> ScrapResult:
     """대구 서구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -76,7 +76,7 @@ def scrap_44(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_45(url, cid) -> ScrapResult:
+def scrap_45(url, cid, args: ArgsType = None) -> ScrapResult:
     """대구 남구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -97,7 +97,7 @@ def scrap_45(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_46(url, cid) -> ScrapResult:
+def scrap_46(url, cid, args: ArgsType = None) -> ScrapResult:
     """대구 북구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -118,7 +118,7 @@ def scrap_46(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_47(url, cid) -> ScrapResult:
+def scrap_47(url, cid, args: ArgsType = None) -> ScrapResult:
     """대구 수성구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -137,7 +137,7 @@ def scrap_47(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_48(url, cid) -> ScrapResult:
+def scrap_48(url, cid, args: ArgsType = None) -> ScrapResult:
     """대구 달서구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -159,7 +159,7 @@ def scrap_48(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_49(url, cid) -> ScrapResult:
+def scrap_49(url, cid, args: ArgsType = None) -> ScrapResult:
     """대구 달성군"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -191,8 +191,4 @@ def scrap_49(url, cid) -> ScrapResult:
 
         councilors.append(Councilor(name=name, jdName=party))
 
-    return ret_local_councilors(cid, councilors)
-
-
-if __name__ == "__main__":
-    print(scrap_49())
+    return ret_local_councilors(cid, councilors)
\ No newline at end of file
diff --git a/scrap/local_councils/daejeon/daejeon.py b/scrap/local_councils/daejeon/daejeon.py
index 3ccc05d..9b9ad60 100644
--- a/scrap/local_councils/daejeon/daejeon.py
+++ b/scrap/local_councils/daejeon/daejeon.py
@@ -114,8 +114,4 @@ def scrap_69(url, cid) -> ScrapResult:
             party = party_info.find_next("span").get_text(strip=True)
         councilors.append(Councilor(name=name, jdName=party))
 
-    return ret_local_councilors(cid, councilors)
-
-
-if __name__ == "__main__":
-    print(scrap_69())
+    return ret_local_councilors(cid, councilors)
\ No newline at end of file
diff --git a/scrap/local_councils/gangwon.py b/scrap/local_councils/gangwon.py
index 4da2d33..42bf2f7 100644
--- a/scrap/local_councils/gangwon.py
+++ b/scrap/local_councils/gangwon.py
@@ -10,7 +10,7 @@
 
 def scrap_107(
     url,
-    cid,
+    cid, args: ArgsType = None
 ) -> ScrapResult:
     """강원도 원주시"""
     councilors: list[Councilor] = []
@@ -27,6 +27,7 @@ def scrap_107(
             if keyword in name:
                 name = name.replace(keyword, "").strip()
         party_tag = info.find_elements(By.TAG_NAME, "dd")
+        party = ""
         for tag in party_tag:
             party = tag.text.split(" ")[-1]
             if party in party_keywords:
@@ -39,7 +40,7 @@ def scrap_107(
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_113(url, cid, args: ScrapBasicArgument = None) -> ScrapResult:
+def scrap_113(url, cid, args: ArgsType = None) -> ScrapResult:
     """강원도 속초시"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -63,7 +64,7 @@ def scrap_113(url, cid, args: ScrapBasicArgument = None) -> ScrapResult:
 def scrap_114(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 고성군"""
     soup = get_soup(url, verify=False)
@@ -90,7 +91,7 @@ def scrap_114(
 def scrap_115(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 양양군"""
     soup = get_soup(url, verify=False)
@@ -115,7 +116,7 @@ def scrap_115(
 def scrap_116(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 인제군"""
     soup = get_soup(url, verify=False)
@@ -136,7 +137,7 @@ def scrap_116(
 def scrap_117(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 홍천군"""
     soup = get_soup(url, verify=False, encoding="euc-kr")
@@ -162,7 +163,7 @@ def scrap_117(
 def scrap_118(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 횡성군"""
     soup = get_soup(url, verify=False)
@@ -179,11 +180,10 @@ def scrap_118(
 
     return ret_local_councilors(cid, councilors)
 
-
 def scrap_119(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 영월군"""
     base_url = "https://council.yw.go.kr"
@@ -216,7 +216,7 @@ def scrap_119(
 def scrap_120(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 평창군"""
     soup = get_soup(url, verify=False)
@@ -239,7 +239,7 @@ def scrap_120(
 def scrap_121(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 화천군"""
     soup = get_soup(url, verify=False)
@@ -260,7 +260,7 @@ def scrap_121(
 def scrap_122(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 양구군"""
     soup = get_soup(url, verify=False)
@@ -281,7 +281,7 @@ def scrap_122(
 def scrap_123(
     url,
     cid,
-    args: ScrapBasicArgument = None,
+    args: ArgsType = None,
 ) -> ScrapResult:
     """강원도 철원군"""
     soup = get_soup(url, verify=False)
@@ -299,8 +299,4 @@ def scrap_123(
     # TODO
     councilors.append(Councilor(name=name, jdName=party))
 
-    return ret_local_councilors(cid, councilors)
-
-
-if __name__ == "__main__":
-    print(scrap_123())
+    return ret_local_councilors(cid, councilors)
\ No newline at end of file
diff --git a/scrap/local_councils/gwangju.py b/scrap/local_councils/gwangju.py
index 02f8cc0..0c80726 100644
--- a/scrap/local_councils/gwangju.py
+++ b/scrap/local_councils/gwangju.py
@@ -11,7 +11,7 @@
 party_keywords.append("무소속")
 
 
-def scrap_62(url, cid) -> ScrapResult:
+def scrap_62(url, cid, args: ArgsType = None) -> ScrapResult:
     """광주 서구"""
     councilors: list[Councilor] = []
 
@@ -40,6 +40,7 @@ def scrap_62(url, cid) -> ScrapResult:
         )
 
         party_tag = browser.find_elements(By.TAG_NAME, "dd")
+        party = ""
         for tag in party_tag:
             party = tag.text.strip()
             if party in party_keywords:
@@ -55,7 +56,7 @@ def scrap_62(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_63(url, cid) -> ScrapResult:
+def scrap_63(url, cid, args: ArgsType = None) -> ScrapResult:
     """광주 북구"""
     councilors: list[Councilor] = []
 
@@ -79,6 +80,7 @@ def scrap_63(url, cid) -> ScrapResult:
         )
         name = name_tag.text.strip() if name_tag else "이름 정보 없음"
         party_tag = info.find_elements(By.TAG_NAME, "dd")
+        party = ""
         for tag in party_tag:
             party = tag.text.strip()
             if party in party_keywords:
@@ -91,7 +93,7 @@ def scrap_63(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_64(url, cid) -> ScrapResult:
+def scrap_64(url, cid, args: ArgsType = None) -> ScrapResult:
     """광주 광산구"""
     councilors: list[Councilor] = []
 
@@ -118,6 +120,7 @@ def scrap_64(url, cid) -> ScrapResult:
             if keyword in name:
                 name = name.replace(keyword, "").strip()
         party_tag = info.find_elements(By.TAG_NAME, "dd")
+        party = ""
         for tag in party_tag:
             party = tag.text.replace(" ", "")
             if party in party_keywords:
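# --- Reviewer sketch, not part of the patch: why the `party = ""` lines added in
# --- scrap_62/63/64 (and scrap_107 above) matter. Minimal repro with invented data.
def pick_party(dd_texts, party_keywords):
    party = ""  # without this, empty dd_texts would leave `party` unbound below
    for text in dd_texts:
        party = text.strip()
        if party in party_keywords:
            break
    return party if party in party_keywords else "정당 정보 없음"

assert pick_party([], ["무소속"]) == "정당 정보 없음"          # no UnboundLocalError
assert pick_party(["재선", "무소속"], ["무소속"]) == "무소속"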
""" +from scrap.utils.requests import get_selenium, By from scrap.local_councils import * from scrap.local_councils.basic import ( find, regex_pattern, - find_all, + findall, extract_party, - get_name, - get_party_easy, + getname, + getpty_easy, ) +party_keywords = getPartyList() +party_keywords.append("무소속") + +def scrap_76(url, cid, args: ArgsType) -> ScrapResult: + """경기도 성남시""" + assert args is not None + assert args.pf_elt is not None + assert args.pf_cls is not None + assert args.name_elt is not None + assert args.name_cls is not None + assert args.pty_elt is not None + assert args.pty_cls is not None + + councilors: list[Councilor] = [] + + browser = get_selenium(url) + + councilor_infos = browser.find_elements(By.CSS_SELECTOR, args.pf_elt + "[class*='" + args.pf_cls + "']") + + for info in councilor_infos: + name_tag = info.find_element(By.CSS_SELECTOR, args.name_elt + "[class='" + args.name_cls + "']") + name = name_tag.text.strip() if name_tag else "이름 정보 없음" + party_tag = info.find_elements(By.TAG_NAME, args.pty_elt) + party = "" + for tag in party_tag: + party = tag.text.strip() + if party in party_keywords: + break + if party not in party_keywords: + party = "정당 정보 없음" + + councilors.append(Councilor(name, party)) + + return ret_local_councilors(cid, councilors) + def get_profiles_88_103(soup, element, class_, memberlistelement, memberlistclass_): if memberlistelement is not None: @@ -30,7 +66,7 @@ def get_party_88(profile, element, class_, wrapper_element, wrapper_class_, url) profile = get_soup(profile_url, verify=False, encoding="euc-kr") party_pulp_list = list( filter( - lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_) + lambda x: regex_pattern.search(str(x)), findall(profile, element, class_) ) ) if party_pulp_list == []: @@ -56,7 +92,7 @@ def scrap_88(url, cid, args: ScrapBasicArgument) -> ScrapResult: print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. for profile in profiles: - name = get_name( + name = getname( profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls ) party = "" @@ -70,7 +106,7 @@ def scrap_88(url, cid, args: ScrapBasicArgument) -> ScrapResult: url, ) except Exception: - party = get_party_easy( + party = getpty_easy( profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url ) @@ -89,7 +125,7 @@ def get_party_103(profile, element, class_, wrapper_element, wrapper_class_, url profile = get_soup(profile_url, verify=False) party_pulp_list = list( filter( - lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_) + lambda x: regex_pattern.search(str(x)), findall(profile, element, class_) ) ) if party_pulp_list == []: @@ -115,7 +151,7 @@ def scrap_103(url, cid, args: ScrapBasicArgument) -> ScrapResult: print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. 
 
     for profile in profiles:
-        name = get_name(
+        name = getname(
             profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls
         )
         party = ""
@@ -70,7 +106,7 @@ def scrap_88(url, cid, args: ScrapBasicArgument) -> ScrapResult:
                 url,
             )
         except Exception:
-            party = get_party_easy(
+            party = getpty_easy(
                 profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url
             )
 
@@ -89,7 +125,7 @@ def get_party_103(profile, element, class_, wrapper_element, wrapper_class_, url
     profile = get_soup(profile_url, verify=False)
     party_pulp_list = list(
         filter(
-            lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_)
+            lambda x: regex_pattern.search(str(x)), findall(profile, element, class_)
         )
     )
     if party_pulp_list == []:
@@ -115,7 +151,7 @@ def scrap_103(url, cid, args: ScrapBasicArgument) -> ScrapResult:
     print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.")  # 디버깅용.
 
     for profile in profiles:
-        name = get_name(
+        name = getname(
             profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls
         )
         party = get_party_103(
diff --git a/scrap/local_councils/incheon.py b/scrap/local_councils/incheon.py
index 8ffbd4e..5a45f57 100644
--- a/scrap/local_councils/incheon.py
+++ b/scrap/local_councils/incheon.py
@@ -2,15 +2,15 @@
 """
 from scrap.utils.requests import get_selenium, By
 from scrap.local_councils.basic import (
-    get_profiles,
-    get_name,
+    getprofiles,
+    getname,
     find,
     extract_party,
 )
 from scrap.local_councils import *
 
 
-def scrap_50(url, cid) -> ScrapResult:
+def scrap_50(url, cid, args: ArgsType = None) -> ScrapResult:
     """인천 중구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -25,7 +25,7 @@ def scrap_50(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_51(url, cid) -> ScrapResult:
+def scrap_51(url, cid, args: ArgsType = None) -> ScrapResult:
     """인천 동구"""
     browser = get_selenium(url)
     councilors: list[Councilor] = []
@@ -57,7 +57,7 @@ def scrap_51(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_52(url, cid) -> ScrapResult:
+def scrap_52(url, cid, args: ArgsType = None) -> ScrapResult:
     """인천 미추홀구"""
     councilors: list[Councilor] = []
 
@@ -81,7 +81,7 @@ def scrap_52(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_53(url, cid) -> ScrapResult:
+def scrap_53(url, cid, args: ArgsType = None) -> ScrapResult:
     """인천 연수구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -102,7 +102,7 @@ def scrap_53(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_54(url, cid) -> ScrapResult:
+def scrap_54(url, cid, args: ArgsType = None) -> ScrapResult:
     """인천 남동구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -121,7 +121,7 @@ def scrap_54(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_55(url, cid) -> ScrapResult:
+def scrap_55(url, cid, args: ArgsType = None) -> ScrapResult:
     """인천 부평구"""
     browser = get_selenium(url)
     councilors: list[Councilor] = []
@@ -141,7 +141,7 @@ def scrap_55(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_56(url, cid) -> ScrapResult:
+def scrap_56(url, cid, args: ArgsType = None) -> ScrapResult:
     """인천 계양구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -167,13 +167,13 @@ def scrap_57(url, args) -> ScrapResult:
     councilors: list[Councilor] = []
     cid = 57
 
-    profiles = get_profiles(
+    profiles = getprofiles(
         soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls
     )
     print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.")  # 디버깅용.
 
     for profile in profiles:
-        name = get_name(
+        name = getname(
             profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls
         )
 
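# --- Reviewer sketch, not part of the patch: the two-step lookup that
# --- getprofiles performs — narrow to the member-list container, then collect
# --- profile nodes. The HTML snippet is invented.
from bs4 import BeautifulSoup

html = "<ul class='memberList'><li>A</li><li>B</li></ul><li>outside</li>"
soup = BeautifulSoup(html, "html.parser")
container = soup.find_all("ul", "memberList")[0]  # pf_memlistelt / pf_memlistcls
assert len(container.find_all("li")) == 2         # pf_elt; "outside" is excluded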
class_="profile"): name_tag = profile.find("em", class_="name").get_text(strip=True) name = name_tag if name_tag else "이름 정보 없음" name = name.split("(")[0] # 괄호 안에 있는 한자는 제외 @@ -185,7 +185,7 @@ def scrap_164( member_link = profile.find("a", class_="start")["href"] member_soup = get_soup(base_url + member_link) - party_tag = member_soup.find("ul", class_="profile_list") + party_tag = find(member_soup, "ul", class_="profile_list") party = ( party_tag.select_one("li:contains('정 당')").text.replace("정 당:", "").strip() ) @@ -198,18 +198,18 @@ def scrap_164( def scrap_165( url, cid, - args: ScrapBasicArgument = None, + args: ArgsType = None, ) -> ScrapResult: """전라남도 여수시""" soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_="profile"): + for profile in findall(soup, "div", class_="profile"): name_tag = profile.find("li", class_="name").get_text(strip=True) name = name_tag if name_tag else "이름 정보 없음" name = name.split("(")[0] # 괄호 안에 있는 한자는 제외 - party_tag = [li for li in soup.find_all("li") if "소속정당" in li.get_text()] + party_tag = [li for li in findall(soup, "li") if "소속정당" in li.get_text()] party = party_tag[0].get_text() if party_tag else "정당 정보 없음" councilors.append(Councilor(name=name, jdName=party)) @@ -220,13 +220,13 @@ def scrap_165( def scrap_167( url, cid, - args: ScrapBasicArgument = None, + args: ArgsType = None, ) -> ScrapResult: """전라북도 나주시""" soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_="profile"): + for profile in findall(soup, "div", class_="profile"): name_tag = profile.find("dt") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" @@ -237,6 +237,92 @@ def scrap_167( return ret_local_councilors(cid, councilors) +# def goto_profilesite_171(profile, wrapper_element, wrapper_class_, wrapper_txt, url): +# # 프로필보기 링크 가져오기 +# profile_link = find(profile, wrapper_element, class_=wrapper_class_) +# if wrapper_txt is not None: +# profile_links = find_all(profile, "a", class_=wrapper_class_) +# profile_link = [link for link in profile_links if link.text == wrapper_txt][0] +# if profile_link is None: +# raise RuntimeError("[basic.py] 의원 프로필에서 프로필보기 링크를 가져오는데 실패했습니다.") +# profile_url = profile_link["href"] + "/main/" +# print(profile_url) +# try: +# profile = get_soup(profile_url, verify=False) +# except Exception: +# raise RuntimeError("[basic.py] '//'가 있진 않나요?", " url: ", profile_url) +# return profile + +# def get_party_171( +# profile, element, class_, wrapper_element, wrapper_class_, wrapper_txt, url +# ): +# # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴 +# if wrapper_element is not None: +# profile = goto_profilesite_171( +# profile, wrapper_element, wrapper_class_, wrapper_txt, url +# ) +# print(profile.text) +# print(find_all(profile, element, class_)) +# print("hihih") +# party_pulp_list = list( +# filter( +# lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_) +# ) +# ) +# if party_pulp_list == []: +# raise RuntimeError("[basic.py] 정당정보 regex 실패") +# party_pulp = party_pulp_list[0] +# party_string = party_pulp.get_text(strip=True).split(" ")[-1] +# while True: +# if (party := extract_party(party_string)) is not None: +# return party +# if (party_pulp := party_pulp.find_next("span")) is not None: +# party_string = party_pulp.text.strip().split(" ")[-1] +# else: +# return "[basic.py] 정당 정보 파싱 불가" + + +# def scrap_171( +# url, +# cid, +# args: ArgsType = None, +# ) -> ScrapResult: +# """전라남도 곡성군""" +# 
+#     soup = get_soup(url, verify=False)
+#     councilors: list[Councilor] = []
+
+#     profiles = get_profiles(
+#         soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls
+#     )
+#     print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.")  # 디버깅용.
+
+#     for profile in profiles:
+#         name = party = ""
+#         try:
+#             name = get_name(
+#                 profile,
+#                 args.name_elt,
+#                 args.name_cls,
+#                 args.name_wrapelt,
+#                 args.name_wrapcls,
+#             )
+#         except Exception as e:
+#             raise RuntimeError("[basic.py] 의원 이름을 가져오는데 실패했습니다. 이유 : " + str(e))
+#         try:
+#             party = get_party_171(
+#                 profile,
+#                 args.pty_cls,
+#                 args.pty_elt,
+#                 args.pty_wrapelt,
+#                 args.pty_wrapcls,
+#                 args.pty_wraptxt,
+#                 url,
+#             )
+#         except Exception as e:
+#             raise RuntimeError("[basic.py] 의원 정당을 가져오는데 실패했습니다. 이유: " + str(e))
+#         councilors.append(Councilor(name=name, party=party))
+
+#     return ret_local_councilors(cid, councilors)
 
 
 if __name__ == "__main__":
     print(scrap_161("https://www.sunchangcouncil.go.kr/main/contents/lawmaker", 161))
diff --git a/scrap/local_councils/seoul.py b/scrap/local_councils/seoul.py
index 2febe8f..70e5193 100644
--- a/scrap/local_councils/seoul.py
+++ b/scrap/local_councils/seoul.py
@@ -6,9 +6,7 @@
 from scrap.local_councils import *
 
 
-def scrap_1(
-    url,
-    cid,
+def scrap_1(url, cid, args: ArgsType = None
 ) -> ScrapResult:
     """서울 종로구"""
     soup = get_soup(url, verify=False)
@@ -28,7 +26,7 @@ def scrap_1(
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_2(url, cid) -> ScrapResult:
+def scrap_2(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 중구"""
     parliment_soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -57,7 +55,7 @@ def scrap_2(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_3(url, cid) -> ScrapResult:
+def scrap_3(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 용산구"""
     soup = get_soup(url, verify=False)
 
@@ -77,7 +75,7 @@ def scrap_3(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_4(url, cid) -> ScrapResult:
+def scrap_4(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 성동구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -96,7 +94,7 @@ def scrap_4(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_5(url, cid) -> ScrapResult:
+def scrap_5(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 광진구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -117,7 +115,7 @@ def scrap_5(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_6(url, cid) -> ScrapResult:
+def scrap_6(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 동대문구"""
     parliment_soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
@@ -151,7 +149,7 @@ def scrap_6(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_7(url, cid) -> ScrapResult:
+def scrap_7(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 중랑구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -170,7 +168,7 @@ def scrap_7(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_8(url, cid) -> ScrapResult:
+def scrap_8(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 성북구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -191,7 +189,7 @@ def scrap_8(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_9(url, cid) -> ScrapResult:
+def scrap_9(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 강북구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
@@ -214,10 +212,7 @@ def scrap_9(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_10(
-    url,
-    cid,
-) -> ScrapResult:
+def scrap_10(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 도봉구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -238,7 +233,7 @@ def scrap_10(
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_11(url, cid) -> ScrapResult:
+def scrap_11(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 노원구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -259,7 +254,7 @@ def scrap_11(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_12(url, cid) -> ScrapResult:
+def scrap_12(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 은평구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -283,8 +278,7 @@ def scrap_12(url, cid) -> ScrapResult:
 
 
 def scrap_13(
-    url,
-    cid,
+    url, cid, args: ArgsType = None
 ) -> ScrapResult:
     """서울 서대문구"""
     soup = get_soup(url, verify=False, encoding="euc-kr")
@@ -311,7 +305,7 @@ def scrap_13(
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_14(url, cid) -> ScrapResult:
+def scrap_14(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 마포구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -330,7 +324,7 @@ def scrap_14(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_15(url, cid) -> ScrapResult:
+def scrap_15(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 양천구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -363,7 +357,7 @@ def scrap_15(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_16(url, cid) -> ScrapResult:
+def scrap_16(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 강서구"""
     soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
@@ -388,7 +382,7 @@ def scrap_16(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_17(url, cid) -> ScrapResult:
+def scrap_17(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 구로구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -411,7 +405,7 @@ def scrap_17(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_18(url, cid) -> ScrapResult:
+def scrap_18(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 금천구"""
     soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
@@ -432,7 +426,7 @@ def scrap_18(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_19(url, cid) -> ScrapResult:
+def scrap_19(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 영등포구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -451,7 +445,7 @@ def scrap_19(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_20(url, cid) -> ScrapResult:
+def scrap_20(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 동작구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -470,7 +464,7 @@ def scrap_20(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_21(url, cid) -> ScrapResult:
+def scrap_21(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 관악구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -491,7 +485,7 @@ def scrap_21(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_22(url, cid) -> ScrapResult:
+def scrap_22(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 서초구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -512,7 +506,7 @@ def scrap_22(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_23(url, cid) -> ScrapResult:
+def scrap_23(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 강남구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -535,7 +529,7 @@ def scrap_23(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_24(url, cid) -> ScrapResult:
+def scrap_24(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 송파구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
@@ -554,7 +548,7 @@ def scrap_24(url, cid) -> ScrapResult:
     return ret_local_councilors(cid, councilors)
 
 
-def scrap_25(url, cid) -> ScrapResult:
+def scrap_25(url, cid, args: ArgsType = None) -> ScrapResult:
     """서울 강동구"""
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
diff --git a/scrap/utils/scrap_args.json b/scrap/utils/scrap_args.json
index a02f39f..82b4ca6 100644
--- a/scrap/utils/scrap_args.json
+++ b/scrap/utils/scrap_args.json
@@ -135,7 +135,7 @@
         "pf_cls": "profile",
         "name_elt": "em",
         "name_cls": "name",
-        "pty_elt": "em"
+        "pty_elt": "span"
     },
     "77": {
         "pf_elt": "dl",
@@ -296,8 +296,8 @@
     },
     "97": {
         "pf_elt": "li",
-        "pf_memlistelt": "ul",
-        "pf_memlistcls": "memberList",
+        "pf_memlistelt": "class",
+        "pf_memlistcls": "nameMember",
         "name_elt": "strong",
         "pty_elt": "tr",
         "pty_wrapelt": "a"
@@ -607,5 +607,24 @@
         "name_cls": "name",
         "pty_elt": "span",
         "pty_cls": "itemContent"
+    },
+    "171": {
+        "pf_elt": "tr",
+        "pf_memlistelt": "tbody",
+        "name_elt": "td",
+        "name_cls": "AlignCenter",
+        "pty_elt": "p",
+        "pty_cls": "pro_text",
+        "pty_wrapelt": "a",
+        "pty_wrapcls": "cont_btn"
+    },
+    "173": {
+        "pf_elt": "li",
+        "pf_cls": "item_box",
+        "pf_memlistelt": "div",
+        "pf_memlistcls": "submem",
+        "name_elt": "li",
+        "name_cls": "name",
+        "pty_elt": "dl"
     }
 }
\ No newline at end of file
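# --- Reviewer sketch, not part of the patch: how a scrap_args.json entry such as
# --- the new "171" presumably becomes a ScrapBasicArgument (keyword expansion is
# --- an assumption; only the fields shown are taken from the JSON above).
import json
from scrap.utils.types import ScrapBasicArgument

raw = json.loads('{"pf_elt": "tr", "pf_memlistelt": "tbody", "name_elt": "td"}')
args_171 = ScrapBasicArgument(**raw)
assert args_171.pf_elt == "tr" and args_171.pf_cls is None  # unset fields stay None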
링크 : ", - data[n - 1]["URL"], + f"| {n} | 오류: ", error_msg, " 링크 : ", + data[n - 1]["URL"] ) errors.append(n) continue encoding = "euc-kr" if n in euc_kr else "utf-8" - result = None + council_url: str = "" try: council_url = data[n - 1]["URL"] council_args = args.get(str(n), None) @@ -107,18 +114,13 @@ def main() -> None: function_name = f"scrap_{n}" if hasattr(sys.modules[__name__], function_name): function_to_call = getattr(sys.modules[__name__], function_name) - if n < 57 or n in [62, 63, 64, 107]: - result = str(function_to_call(council_url, n).councilors) - else: - result = str( - function_to_call( - council_url, n, args=council_args - ).councilors - ) + result = str(function_to_call(council_url, n, args=council_args).councilors) + else: + print("[API/spreadsheet] Error : No function found") + elif n in selenium_basic: + result = str(sel_scrap_basic(council_url, n, council_args).councilors) else: - result = str( - scrap_basic(council_url, n, council_args, encoding).councilors - ) + result = str(scrap_basic(council_url, n, council_args, encoding).councilors) if "정보 없음" in result: print("정보 없음이 포함되어 있습니다.") parse_error_times += 1 diff --git a/scrap/utils/types.py b/scrap/utils/types.py index 7a576b0..3cbb2ba 100644 --- a/scrap/utils/types.py +++ b/scrap/utils/types.py @@ -4,30 +4,6 @@ """ from typing import Optional, List from dataclasses import dataclass -from enum import Enum - - -class CouncilType(str, Enum): - """ - 의회의 종류를 나타내는 열거형입니다. - """ - - LOCAL_COUNCIL = "local_council" - """ - 기초의회 - """ - METRO_COUNCIL = "metropolitan_council" - """ - 광역의회 - """ - - def __str__(self): - """ - JSON으로 직렬화하기 위해 문자열로 변환하는 함수를 오버라이드합니다. - """ - return str(self.value) - - from db.types import CouncilType, Councilor @@ -58,19 +34,19 @@ class ScrapBasicArgument: def __init__( self, - pf_elt: str = None, - pf_cls: str = None, - pf_memlistelt: str = None, - pf_memlistcls: str = None, - name_elt: str = None, - name_cls: str = None, - name_wrapelt: str = None, - name_wrapcls: str = None, - pty_elt: str = None, - pty_cls: str = None, - pty_wrapelt: str = None, - pty_wrapcls: str = None, - pty_wraptxt: str = None, + pf_elt: str | None = None, + pf_cls: str | None = None, + pf_memlistelt: str | None = None, + pf_memlistcls: str | None = None, + name_elt: str | None = None, + name_cls: str | None = None, + name_wrapelt: str | None = None, + name_wrapcls: str | None = None, + pty_elt: str | None = None, + pty_cls: str | None = None, + pty_wrapelt: str | None = None, + pty_wrapcls: str | None = None, + pty_wraptxt: str | None = None, ): """ ScrapBasicArgument 클래스의 생성자입니다. @@ -103,3 +79,5 @@ def __init__( self.pty_wrapelt = pty_wrapelt self.pty_wrapcls = pty_wrapcls self.pty_wraptxt = pty_wraptxt + +ArgsType = Optional[ScrapBasicArgument] \ No newline at end of file