diff --git a/.github/workflows/format-black.yml b/.github/workflows/format-black.yml
new file mode 100644
index 0000000..94e591a
--- /dev/null
+++ b/.github/workflows/format-black.yml
@@ -0,0 +1,16 @@
+name: Format with black
+on: [push, pull_request]
+
+jobs:
+  format:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Format files using the black formatter
+        uses: rickstaa/action-black@v1
+        id: action_black
+        with:
+          black_args: "."
+      - uses: stefanzweifel/git-auto-commit-action@v5
+        with:
+          commit_message: Formatted with black
\ No newline at end of file
diff --git a/API/__init__.py b/API/__init__.py
index 6a8f860..106136f 100644
--- a/API/__init__.py
+++ b/API/__init__.py
@@ -1,3 +1,3 @@
 """
 공공데이터포털 API를 이용한 데이터 수집을 위한 패키지입니다.
-"""
\ No newline at end of file
+"""
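[Editor's note -- not part of the commit.] rickstaa/action-black installs whatever Black release is current when the job runs, so CI formatting can drift from local runs over time. A minimal local pre-check with a pinned Black is sketched below; the version pin and the sample string are illustrative assumptions, not taken from this repository:

```python
# Hypothetical local check mirroring the CI step (assumes: pip install black==23.9.1).
import black

src = 'x = {  "a":1}\n'
print(black.format_str(src, mode=black.Mode()), end="")  # -> x = {"a": 1}
```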
diff --git a/API/candidate.py b/API/candidate.py
index 1e2c311..464da71 100644
--- a/API/candidate.py
+++ b/API/candidate.py
@@ -6,24 +6,24 @@ from configurations.secrets import OpenDataPortalSecrets
 
 BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir)
 
-base_url = 'http://apis.data.go.kr/9760000/PofelcddInfoInqireService/getPofelcddRegistSttusInfoInqire'
+base_url = "http://apis.data.go.kr/9760000/PofelcddInfoInqireService/getPofelcddRegistSttusInfoInqire"
 page_no = 1
 num_of_rows = 10000
 parliamentVote = [20220601, 20230405]
-sgCodes = input("Input the number of sgTypecode: ").split(',')
+sgCodes = input("Input the number of sgTypecode: ").split(",")
 
 data_list = []
 
 for sgId in parliamentVote:
     for code in sgCodes:
         params = {
-            'serviceKey': OpenDataPortalSecrets.service_key,
-            'pageNo': str(page_no),
-            'numOfRows': str(num_of_rows),
-            'sgId': str(sgId),
-            'sgTypecode': str(code),
-            'sggName': '',
-            'sdName': '',
-            'jdName': ''
+            "serviceKey": OpenDataPortalSecrets.service_key,
+            "pageNo": str(page_no),
+            "numOfRows": str(num_of_rows),
+            "sgId": str(sgId),
+            "sgTypecode": str(code),
+            "sggName": "",
+            "sdName": "",
+            "jdName": "",
         }
 
         response = requests.get(base_url, params=params)
@@ -34,56 +34,58 @@
         root = ET.fromstring(response.content)
 
         for item in root.findall(".//item"):
-            sgId = item.find('sgId').text
-            sggName = item.find('sggName').text
-            sdName = item.find('sdName').text
-            wiwName = item.find('wiwName').text
-            giho = item.find('giho').text
-            jdName = item.find('jdName').text
-            name = item.find('name').text
-            hanjaName = item.find('hanjaName').text
-            gender = item.find('gender').text
-            birthday = item.find('birthday').text
-            age = item.find('age').text
-            addr = item.find('addr').text
-            jobId = item.find('jobId').text
-            job = item.find('job').text
-            eduId = item.find('eduId').text
-            edu = item.find('edu').text
-            career1 = item.find('career1').text
-            career2 = item.find('career2').text
-            status = item.find('status').text
+            sgId = item.find("sgId").text
+            sggName = item.find("sggName").text
+            sdName = item.find("sdName").text
+            wiwName = item.find("wiwName").text
+            giho = item.find("giho").text
+            jdName = item.find("jdName").text
+            name = item.find("name").text
+            hanjaName = item.find("hanjaName").text
+            gender = item.find("gender").text
+            birthday = item.find("birthday").text
+            age = item.find("age").text
+            addr = item.find("addr").text
+            jobId = item.find("jobId").text
+            job = item.find("job").text
+            eduId = item.find("eduId").text
+            edu = item.find("edu").text
+            career1 = item.find("career1").text
+            career2 = item.find("career2").text
+            status = item.find("status").text
 
-            data_list.append({
-                'sgId': sgId,
-                'sggName': sggName,
-                'sdName': sdName,
-                'wiwName': wiwName,
-                'giho': giho,
-                'jdName': jdName,
-                'name': name,
-                'hanjaName': hanjaName,
-                'gender': gender,
-                'birthday': birthday,
-                'age': age,
-                'addr': addr,
-                'jobId': jobId,
-                'job': job,
-                'eduId': eduId,
-                'edu': edu,
-                'career1': career1,
-                'career2': career2,
-                'status': status
-            })
+            data_list.append(
+                {
+                    "sgId": sgId,
+                    "sggName": sggName,
+                    "sdName": sdName,
+                    "wiwName": wiwName,
+                    "giho": giho,
+                    "jdName": jdName,
+                    "name": name,
+                    "hanjaName": hanjaName,
+                    "gender": gender,
+                    "birthday": birthday,
+                    "age": age,
+                    "addr": addr,
+                    "jobId": jobId,
+                    "job": job,
+                    "eduId": eduId,
+                    "edu": edu,
+                    "career1": career1,
+                    "career2": career2,
+                    "status": status,
+                }
+            )
 
 # Create a DataFrame from the collected data
 df = pd.DataFrame(data_list)
 
 # Save the DataFrame to an Excel file
-directory_path = os.path.join(BASE_DIR, 'output')
+directory_path = os.path.join(BASE_DIR, "output")
 if not os.path.exists(directory_path):
     os.makedirs(directory_path)
 
-excel_file = '[후보][구시군의회의원].xlsx'
+excel_file = "[후보][구시군의회의원].xlsx"
 df.to_excel(os.path.join(directory_path, excel_file), index=False)
 
-print(f'Data has been saved to {excel_file}')
+print(f"Data has been saved to {excel_file}")
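[Editor's note] The `item.find("...").text` pattern above raises `AttributeError` as soon as one tag is missing from a response item, and the same pattern recurs in API/elected.py below. A hedged alternative using the standard library's `Element.findtext`, which returns a default instead of raising (the field names are the ones this script reads; the empty-string default is an assumption):

```python
import xml.etree.ElementTree as ET

FIELDS = [
    "sgId", "sggName", "sdName", "wiwName", "giho", "jdName", "name",
    "hanjaName", "gender", "birthday", "age", "addr", "jobId", "job",
    "eduId", "edu", "career1", "career2", "status",
]

def parse_item(item: ET.Element) -> dict:
    # findtext returns the default when the tag is absent, instead of raising.
    return {field: item.findtext(field, default="") for field in FIELDS}
```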
diff --git a/API/elected.py b/API/elected.py
index 98311e6..38a6030 100644
--- a/API/elected.py
+++ b/API/elected.py
@@ -6,26 +6,34 @@ from configurations.secrets import OpenDataPortalSecrets
 
 BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir)
 
-base_url = 'http://apis.data.go.kr/9760000/WinnerInfoInqireService2/getWinnerInfoInqire'
-params ={'serviceKey' : OpenDataPortalSecrets.service_key,\
-         'pageNo' : '1', 'numOfRows' : '10', 'sgId' : '20230405', 'sgTypecode' : '2', 'sdName' : '전라북도', 'sggName' : '전주시을', 'jdName' : ''}
+base_url = "http://apis.data.go.kr/9760000/WinnerInfoInqireService2/getWinnerInfoInqire"
+params = {
+    "serviceKey": OpenDataPortalSecrets.service_key,
+    "pageNo": "1",
+    "numOfRows": "10",
+    "sgId": "20230405",
+    "sgTypecode": "2",
+    "sdName": "전라북도",
+    "sggName": "전주시을",
+    "jdName": "",
+}
 
 page_no = 1
 num_of_rows = 10000
 parliamentVote = [20200415, 20210407, 20220601, 20230405]
-sgCodes = input("Input the number of sgTypecode: ").split(',')
+sgCodes = input("Input the number of sgTypecode: ").split(",")
 
 data_list = []
 
 for sgId in parliamentVote:
    for code in sgCodes:
        params = {
-            'serviceKey': OpenDataPortalSecrets.service_key,
-            'pageNo': str(page_no),
-            'numOfRows': str(num_of_rows),
-            'sgId': str(sgId),
-            'sgTypecode': str(code),
-            'sggName': '',
-            'sdName': '',
-            'jdName': ''
+            "serviceKey": OpenDataPortalSecrets.service_key,
+            "pageNo": str(page_no),
+            "numOfRows": str(num_of_rows),
+            "sgId": str(sgId),
+            "sgTypecode": str(code),
+            "sggName": "",
+            "sdName": "",
+            "jdName": "",
        }
 
        response = requests.get(base_url, params=params)
@@ -36,56 +44,58 @@
        root = ET.fromstring(response.content)
 
        for item in root.findall(".//item"):
-            sgId = item.find('sgId').text
-            sggName = item.find('sggName').text
-            sdName = item.find('sdName').text
-            wiwName = item.find('wiwName').text
-            giho = item.find('giho').text
-            jdName = item.find('jdName').text
-            name = item.find('name').text
-            hanjaName = item.find('hanjaName').text
-            gender = item.find('gender').text
-            birthday = item.find('birthday').text
-            age = item.find('age').text
-            addr = item.find('addr').text
-            jobId = item.find('jobId').text
-            job = item.find('job').text
-            eduId = item.find('eduId').text
-            edu = item.find('edu').text
-            career1 = item.find('career1').text
-            career2 = item.find('career2').text
+            sgId = item.find("sgId").text
+            sggName = item.find("sggName").text
+            sdName = item.find("sdName").text
+            wiwName = item.find("wiwName").text
+            giho = item.find("giho").text
+            jdName = item.find("jdName").text
+            name = item.find("name").text
+            hanjaName = item.find("hanjaName").text
+            gender = item.find("gender").text
+            birthday = item.find("birthday").text
+            age = item.find("age").text
+            addr = item.find("addr").text
+            jobId = item.find("jobId").text
+            job = item.find("job").text
+            eduId = item.find("eduId").text
+            edu = item.find("edu").text
+            career1 = item.find("career1").text
+            career2 = item.find("career2").text
            # status = item.find('status').text
 
-            data_list.append({
-                'sgId': sgId,
-                'sggName': sggName,
-                'sdName': sdName,
-                'wiwName': wiwName,
-                'giho': giho,
-                'jdName': jdName,
-                'name': name,
-                'hanjaName': hanjaName,
-                'gender': gender,
-                'birthday': birthday,
-                'age': age,
-                'addr': addr,
-                'jobId': jobId,
-                'job': job,
-                'eduId': eduId,
-                'edu': edu,
-                'career1': career1,
-                'career2': career2,
-                # 'status': status
-            })
+            data_list.append(
+                {
+                    "sgId": sgId,
+                    "sggName": sggName,
+                    "sdName": sdName,
+                    "wiwName": wiwName,
+                    "giho": giho,
+                    "jdName": jdName,
+                    "name": name,
+                    "hanjaName": hanjaName,
+                    "gender": gender,
+                    "birthday": birthday,
+                    "age": age,
+                    "addr": addr,
+                    "jobId": jobId,
+                    "job": job,
+                    "eduId": eduId,
+                    "edu": edu,
+                    "career1": career1,
+                    "career2": career2,
+                    # 'status': status
+                }
+            )
 
 # Create a DataFrame from the collected data
 df = pd.DataFrame(data_list)
 
 # Save the DataFrame to an Excel file
-directory_path = os.path.join(BASE_DIR, 'output')
+directory_path = os.path.join(BASE_DIR, "output")
 if not os.path.exists(directory_path):
     os.makedirs(directory_path)
 
-excel_file = '[당선][구시군의회의원].xlsx'
+excel_file = "[당선][구시군의회의원].xlsx"
 df.to_excel(os.path.join(directory_path, excel_file), index=False)
 
-print(f'Data has been saved to {excel_file}')
\ No newline at end of file
+print(f"Data has been saved to {excel_file}")
diff --git a/API/votecode.py b/API/votecode.py
index b9f7e05..d040987 100644
--- a/API/votecode.py
+++ b/API/votecode.py
@@ -6,32 +6,39 @@ import argparse
 
 parser = argparse.ArgumentParser()
-parser.add_argument('-c', '--code', action='store_true', help='코드를 출력합니다.')
+parser.add_argument("-c", "--code", action="store_true", help="코드를 출력합니다.")
 args = parser.parse_args()
 
 if args.code:
-    print("(0) 대표선거명 (1)대통령,(2)국회의원,(3)시도지사,(4)구시군장,(5)시도의원,\
-          (6)구시군의회의원, (7)국회의원비례대표,(8)광역의원비례대표,(9)기초의원비례대표,(10)교육의원,(11)교육감")
+    print(
+        "(0) 대표선거명 (1)대통령,(2)국회의원,(3)시도지사,(4)구시군장,(5)시도의원,\
+          (6)구시군의회의원, (7)국회의원비례대표,(8)광역의원비례대표,(9)기초의원비례대표,(10)교육의원,(11)교육감"
+    )
 else:
-    print("sgTypecode를 입력하면 해당 sgTypecode와 일치하는 sgId 값을 출력합니다. 여러 개 입력하고 싶으면 ,로 구분해 주세요.")
+    print(
+        "sgTypecode를 입력하면 해당 sgTypecode와 일치하는 sgId 값을 출력합니다. 여러 개 입력하고 싶으면 ,로 구분해 주세요."
+    )
 
-url = 'http://apis.data.go.kr/9760000/CommonCodeService/getCommonSgCodeList'
-params ={'serviceKey' : OpenDataPortalSecrets.service_key,\
-         'pageNo' : '1', 'numOfRows' : '1000'}
+url = "http://apis.data.go.kr/9760000/CommonCodeService/getCommonSgCodeList"
+params = {
+    "serviceKey": OpenDataPortalSecrets.service_key,
+    "pageNo": "1",
+    "numOfRows": "1000",
+}
 
 response = requests.get(url, params=params)
-xml_data = response.content.decode('utf-8')
+xml_data = response.content.decode("utf-8")
 
 # Parse the XML data
 root = ET.fromstring(xml_data)
 
 # Find all elements where sgTypecode is equal to INPUT and extract their sgId values
 sgIds = set()
-for code in input("Input the number of sgTypecode: ").split(','):
-    for item in root.findall(f'.//item[sgTypecode=\"{code}\"]'):
-        sgId_element = item.find('sgId')
+for code in input("Input the number of sgTypecode: ").split(","):
+    for item in root.findall(f'.//item[sgTypecode="{code}"]'):
+        sgId_element = item.find("sgId")
         if sgId_element is not None:
             sgId = sgId_element.text
             sgIds.add(sgId)
 
 # Print the sgId values
 for sgId in sorted(sgIds):
-    print(sgId)
\ No newline at end of file
+    print(sgId)
diff --git a/__init__.py b/__init__.py
index d2639ae..20221e2 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,3 +1,3 @@
 """
 이 파일은 프로젝트 루트 폴더를 패키지로 인식하게 해주는 역할을 합니다.
-"""
\ No newline at end of file
+"""
diff --git a/configurations/__init__.py b/configurations/__init__.py
index e88c5ca..21135bf 100644
--- a/configurations/__init__.py
+++ b/configurations/__init__.py
@@ -1,4 +1,4 @@
 """
 스크립트 실행에 필요한 환경변수를 정의합니다.
 환경변수는 프로젝트 루트 폴더에 .env 파일을 생성하여 불러올 수 있습니다.
-"""
\ No newline at end of file
+"""
diff --git a/configurations/secrets.py b/configurations/secrets.py
index f8adc5b..2bcdfc7 100644
--- a/configurations/secrets.py
+++ b/configurations/secrets.py
@@ -5,22 +5,25 @@ from dotenv import load_dotenv
 
 # .env 파일로부터 환경변수를 불러옵니다.
-load_dotenv(
-    verbose=False,
-    override=False
-)
+load_dotenv(verbose=False, override=False)
+
 
 class MongoDBSecrets:
     """
     MongoDB 연결을 위한 연결 정보를 정의합니다.
     """
-    connection_uri = str(os.getenv("MONGO_CONNECTION_URI") or "mongodb://localhost:27017")
+
+    connection_uri = str(
+        os.getenv("MONGO_CONNECTION_URI") or "mongodb://localhost:27017"
+    )
     """PyMongo 클라이언트에서 데이터베이스 연결에 사용할 연결 uri입니다."""
 
     database_name = str(os.getenv("MONGO_DATABASE") or "local")
     """PyMongo 클라이언트에서 사용할 데이터베이스 이름입니다."""
 
+
 class OpenDataPortalSecrets:
     """
     공공데이터포털(data.go.kr) API 호출에 필요한 서비스 키를 정의합니다.
     """
-    service_key = str(os.getenv("OPEN_DATA_SERICE_KEY") or "")
\ No newline at end of file
+
+    service_key = str(os.getenv("OPEN_DATA_SERICE_KEY") or "")
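[Editor's note] `service_key` silently falls back to an empty string, so a missing `.env` entry (note the env var is literally spelled `OPEN_DATA_SERICE_KEY`) only surfaces later as failing API calls. If fail-fast behavior is wanted, a sketch -- the guard is hypothetical; the current code deliberately tolerates the empty default:

```python
import os

service_key = os.getenv("OPEN_DATA_SERICE_KEY", "")
if not service_key:
    # Hypothetical startup guard, not present in this diff.
    raise RuntimeError("OPEN_DATA_SERICE_KEY is not set; check your .env file")
```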
-""" \ No newline at end of file +""" diff --git a/scrap/examples/database.py b/scrap/examples/database.py index 371b168..d8c3dfe 100644 --- a/scrap/examples/database.py +++ b/scrap/examples/database.py @@ -3,7 +3,12 @@ """ from scrap.utils.database import save_to_database -from scrap.local_councils.seoul import scrap_dongdaemungu, scrap_gwangjingu, scrap_junggu +from scrap.local_councils.seoul import ( + scrap_dongdaemungu, + scrap_gwangjingu, + scrap_junggu, +) + def main() -> None: # 서울시 동대문구의회 크롤링 결과를 데이터베이스에 저장합니다. @@ -13,5 +18,6 @@ def main() -> None: # 서울시 중구의회 크롤링 결과를 데이터베이스에 저장합니다. save_to_database(scrap_junggu()) -if __name__ == '__main__': - main() \ No newline at end of file + +if __name__ == "__main__": + main() diff --git a/scrap/examples/junggu_scrap.py b/scrap/examples/junggu_scrap.py index 7416668..17b74cf 100644 --- a/scrap/examples/junggu_scrap.py +++ b/scrap/examples/junggu_scrap.py @@ -10,23 +10,29 @@ full_url = base_url + link response = requests.get(full_url, verify=False) -soup = BeautifulSoup(response.text, 'html.parser') +soup = BeautifulSoup(response.text, "html.parser") -profiles = soup.find_all('div', class_='profile') +profiles = soup.find_all("div", class_="profile") for profile in profiles: - name = profile.find('em', class_='name').text - party = profile.find('ul', class_='dot').find('li').find_next_sibling('li').find('span').text - + name = profile.find("em", class_="name").text + party = ( + profile.find("ul", class_="dot") + .find("li") + .find_next_sibling("li") + .find("span") + .text + ) + # 프로필보기 링크 가져오기 - profile_link = profile.find('a', class_='start') + profile_link = profile.find("a", class_="start") if profile_link: - profile_url = base_url + profile_link['href'] - + profile_url = base_url + profile_link["href"] + # 프로필 페이지로 이동 profile_response = requests.get(profile_url, verify=False) - profile_soup = BeautifulSoup(profile_response.text, 'html.parser') - + profile_soup = BeautifulSoup(profile_response.text, "html.parser") + # 프로필 페이지에서 원하는 정보를 추출하고 출력 # 여기에서 필요한 정보를 추출하는 방법에 따라 코드를 작성해주세요. 
diff --git a/scrap/local_councils/__init__.py b/scrap/local_councils/__init__.py
index d4f68ed..a4e1fc8 100644
--- a/scrap/local_councils/__init__.py
+++ b/scrap/local_councils/__init__.py
@@ -4,4 +4,4 @@
 """
 from .daejeon import *
 from .ulsan import *
-from .basic import *
\ No newline at end of file
+from .basic import *
diff --git a/scrap/local_councils/basic.py b/scrap/local_councils/basic.py
index 0d52ccb..ec74750 100644
--- a/scrap/local_councils/basic.py
+++ b/scrap/local_councils/basic.py
@@ -7,9 +7,9 @@
 import requests
 import copy
 
-regex_pattern = re.compile(r'정\s*\S*\s*당', re.IGNORECASE)  # Case-insensitive
+regex_pattern = re.compile(r"정\s*\S*\s*당", re.IGNORECASE)  # Case-insensitive
 party_keywords = getPartyList()
-party_keywords.append('무소속')
+party_keywords.append("무소속")
 
 
 def find(soup, element, class_):
@@ -30,10 +30,9 @@ def get_profiles(soup, element, class_, memberlistelement, memberlistclass_):
     # 의원 목록 사이트에서 의원 프로필을 가져옴
     if memberlistelement is not None:
         try:
-            soup = find_all(soup, memberlistelement,
-                            class_=memberlistclass_)[0]
+            soup = find_all(soup, memberlistelement, class_=memberlistclass_)[0]
         except Exception:
-            raise RuntimeError('[basic.py] 의원 목록 사이트에서 의원 프로필을 가져오는데 실패했습니다.')
+            raise RuntimeError("[basic.py] 의원 목록 사이트에서 의원 프로필을 가져오는데 실패했습니다.")
     return find_all(soup, element, class_)
 
 
@@ -41,7 +40,10 @@ def getDataFromAPI(url_format, data_uid, name_id, party_id) -> Councilor:
     # API로부터 의원 정보를 가져옴
     url = url_format.format(data_uid)
     result = requests.get(url).json()
-    return Councilor(name=result[name_id] if result[name_id] else '이름 정보 없음', party=result[party_id] if result[party_id] else '정당 정보 없음')
+    return Councilor(
+        name=result[name_id] if result[name_id] else "이름 정보 없음",
+        party=result[party_id] if result[party_id] else "정당 정보 없음",
+    )
 
 
 def get_name(profile, element, class_, wrapper_element, wrapper_class_):
@@ -49,29 +51,28 @@ def get_name(profile, element, class_, wrapper_element, wrapper_class_):
     # 의원 프로필에서 의원 이름을 가져옴
     if wrapper_element is not None:
         profile = find_all(profile, wrapper_element, class_=wrapper_class_)[0]
     name_tag = find(profile, element, class_)
-    if name_tag.find('span'):
+    if name_tag.find("span"):
         name_tag = copy.copy(name_tag)
         # span 태그 안의 것들을 다 지움
-        for span in name_tag.find_all('span'):
+        for span in name_tag.find_all("span"):
             span.decompose()
     name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
+
     # name은 길고 그 중 strong태그 안에 이름이 있는 경우. 은평구, 수원시 등.
     if name_tag.strong is not None:
-        name = name_tag.strong.get_text(
-            strip=True) if name_tag.strong else "이름 정보 없음"
-    name = name.split('(')[0].split(
-        ':')[-1].strip()  # 이름 뒷 한자이름, 앞 '이 름:' 제거
-    # TODO : 만약 이름이 우연히 국회의장 혹은 김의원박 이라면?
+        name = name_tag.strong.get_text(strip=True) if name_tag.strong else "이름 정보 없음"
+    name = name.split("(")[0].split(":")[-1].strip()  # 이름 뒷 한자이름, 앞 '이 름:' 제거
+    # TODO : 만약 이름이 우연히 아래 단어를 포함하는 경우를 생각해볼만 함.
     if len(name) > 3:  # 수식어가 이름 앞이나 뒤에 붙어있는 경우
-        for keyword in ['부의장', '의원', '의장']:  # 119, 강서구 등
+        for keyword in ["부의장", "의원", "의장"]:  # 119, 강서구 등
             if keyword in name:
-                name = name.replace(keyword, '').strip()
+                name = name.replace(keyword, "").strip()
         for keyword in party_keywords:
             if keyword in name:  # 인천 서구 등
-                name = name.replace(keyword, '').strip()
+                name = name.replace(keyword, "").strip()
                 break
-    name = name.split(' ')[0]  # 이름 뒤에 직책이 따라오는 경우
+    name = name.split(" ")[0]  # 이름 뒤에 직책이 따라오는 경우
     return name
 
 
@@ -89,37 +90,42 @@ def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
     # 프로필보기 링크 가져오기
     profile_link = find(profile, wrapper_element, class_=wrapper_class_)
     if wrapper_txt is not None:
-        profile_links = find_all(profile, 'a', class_=wrapper_class_)
-        profile_link = [
-            link for link in profile_links if link.text == wrapper_txt][0]
+        profile_links = find_all(profile, "a", class_=wrapper_class_)
+        profile_link = [link for link in profile_links if link.text == wrapper_txt][0]
     if profile_link is None:
-        raise RuntimeError('[basic.py] 의원 프로필에서 프로필보기 링크를 가져오는데 실패했습니다.')
+        raise RuntimeError("[basic.py] 의원 프로필에서 프로필보기 링크를 가져오는데 실패했습니다.")
     # if base_url[-1] != '/':
     #     base_url = base_url + '/'
-    profile_url = base_url + profile_link['href']
+    profile_url = base_url + profile_link["href"]
     try:
         profile = get_soup(profile_url, verify=False)
     except Exception:
-        raise RuntimeError('[basic.py] \'//\'가 있진 않나요?', ' url: ', profile_url)
+        raise RuntimeError("[basic.py] '//'가 있진 않나요?", " url: ", profile_url)
     return profile
 
 
-def get_party(profile, element, class_, wrapper_element, wrapper_class_, wrapper_txt, url):
+def get_party(
+    profile, element, class_, wrapper_element, wrapper_class_, wrapper_txt, url
+):
     # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴
     if wrapper_element is not None:
         profile = goto_profilesite(
-            profile, wrapper_element, wrapper_class_, wrapper_txt, url)
+            profile, wrapper_element, wrapper_class_, wrapper_txt, url
+        )
-    party_pulp_list = list(filter(lambda x: regex_pattern.search(
-        str(x)), find_all(profile, element, class_)))
+    party_pulp_list = list(
+        filter(
+            lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_)
+        )
+    )
     if party_pulp_list == []:
-        raise RuntimeError('[basic.py] 정당정보 regex 실패')
+        raise RuntimeError("[basic.py] 정당정보 regex 실패")
     party_pulp = party_pulp_list[0]
-    party_string = party_pulp.get_text(strip=True).split(' ')[-1]
+    party_string = party_pulp.get_text(strip=True).split(" ")[-1]
     while True:
         if (party := extract_party(party_string)) is not None:
             return party
-        if (party_pulp := party_pulp.find_next('span')) is not None:
-            party_string = party_pulp.text.strip().split(' ')[-1]
+        if (party_pulp := party_pulp.find_next("span")) is not None:
+            party_string = party_pulp.text.strip().split(" ")[-1]
         else:
             return "[basic.py] 정당 정보 파싱 불가"
 
 
@@ -128,54 +134,68 @@ def get_party_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
     # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴
     if wrapper_element is not None:
         profile = goto_profilesite(
-            profile, wrapper_element, wrapper_class_, wrapper_txt, url)
+            profile, wrapper_element, wrapper_class_, wrapper_txt, url
+        )
     party = extract_party(profile.text)
-    assert (party is not None)
+    assert party is not None
     return party
 
 
-def scrap_basic(url, cid, args: ScrapBasicArgument, encoding='utf-8') -> ScrapResult:
-    '''의원 상세약력 스크랩
+def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapResult:
+    """의원 상세약력 스크랩
     :param url: 의원 목록 사이트 url
     :param n: 의회 id
     :param encoding: 받아온 soup 인코딩
    :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
-    '''
+    """
     soup = get_soup(url, verify=False, encoding=encoding)
     councilors: list[Councilor] = []
 
-    profiles = get_profiles(soup, args.pf_elt, args.pf_cls,
-                            args.pf_memlistelt, args.pf_memlistcls)
-    print(cid, '번째 의회에는,', len(profiles), '명의 의원이 있습니다.')  # 디버깅용.
+
+    profiles = get_profiles(
+        soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls
+    )
+    print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.")  # 디버깅용.
 
     for profile in profiles:
-        name = party = ''
+        name = party = ""
         try:
-            name = get_name(profile, args.name_elt, args.name_cls,
-                            args.name_wrapelt, args.name_wrapcls)
+            name = get_name(
+                profile,
+                args.name_elt,
+                args.name_cls,
+                args.name_wrapelt,
+                args.name_wrapcls,
+            )
         except Exception as e:
-            raise RuntimeError(
-                '[basic.py] 의원 이름을 가져오는데 실패했습니다. 이유 : ' + str(e))
+            raise RuntimeError("[basic.py] 의원 이름을 가져오는데 실패했습니다. 이유 : " + str(e))
         try:
-            party = get_party(profile, args.pty_elt, args.pty_cls,
-                              args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url)
+            party = get_party(
+                profile,
+                args.pty_elt,
+                args.pty_cls,
+                args.pty_wrapelt,
+                args.pty_wrapcls,
+                args.pty_wraptxt,
+                url,
+            )
         except Exception as e:
             try:
                 party = get_party_easy(
-                    profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url)
+                    profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url
+                )
             except Exception:
-                raise RuntimeError(
-                    '[basic.py] 의원 정당을 가져오는데 실패했습니다. 이유: ' + str(e))
-
+                raise RuntimeError("[basic.py] 의원 정당을 가져오는데 실패했습니다. 이유: " + str(e))
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id=str(cid),
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     args3 = ScrapBasicArgument(
-        pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em')
-    print(scrap_basic('https://www.yscl.go.kr/kr/member/name.do', 3, args3))  # 서울 용산구
+        pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em"
+    )
+    print(scrap_basic("https://www.yscl.go.kr/kr/member/name.do", 3, args3))  # 서울 용산구
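[Editor's note] In basic.py, `re.compile(r"정\s*\S*\s*당", re.IGNORECASE)` carries a `# Case-insensitive` comment, but Hangul has no letter case, so the flag is a no-op here; what the pattern actually buys is tolerance for whitespace and one stray token between 정 and 당. A standalone demonstration (not repository code):

```python
import re

pattern = re.compile(r"정\s*\S*\s*당")  # IGNORECASE dropped: it cannot affect Hangul
for text in ["정당", "정 당", "소속정당:", "정 X 당"]:
    print(text, bool(pattern.search(text)))  # True for all four
```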
diff --git a/scrap/local_councils/busan.py b/scrap/local_councils/busan.py
index e859a82..be10dc3 100644
--- a/scrap/local_councils/busan.py
+++ b/scrap/local_councils/busan.py
@@ -4,7 +4,9 @@
 from scrap.utils.requests import get_soup
 
 
-def scrap_26(url='https://www.bsjunggu.go.kr/council/board/list.junggu?boardId=BBS_0000118&menuCd=DOM_000000503003000000&contentsSid=755&cpath=%2Fcouncil') -> ScrapResult:
+def scrap_26(
+    url="https://www.bsjunggu.go.kr/council/board/list.junggu?boardId=BBS_0000118&menuCd=DOM_000000503003000000&contentsSid=755&cpath=%2Fcouncil",
+) -> ScrapResult:
     """부산시 중구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -13,12 +15,16 @@ def scrap_26(url='https://www.bsjunggu.go.kr/council/board/list.junggu?boardId=B
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find('div', class_='bbs_blog council').find_all('dl'):
-        name_tag = profile.find_next('dt')
-        name = name_tag.get_text(strip=True).split()[-1].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find("div", class_="bbs_blog council").find_all("dl"):
+        name_tag = profile.find_next("dt")
+        name = (
+            name_tag.get_text(strip=True).split()[-1].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = profile.find_next('li')
+        party = "정당 정보 없음"
+        party_info = profile.find_next("li")
         if party_info:
             party = party_info.get_text(strip=True)[3:]
 
@@ -27,11 +33,13 @@ def scrap_26(url='https://www.bsjunggu.go.kr/council/board/list.junggu?boardId=B
     return ScrapResult(
         council_id="busan-junggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_27(url='https://www.bsseogu.go.kr/council/board/list.bsseogu?boardId=BBS_0000097&categoryCode1=8&menuCd=DOM_000000603001000000&contentsSid=785&cpath=%2Fcouncil') -> ScrapResult:
+def scrap_27(
+    url="https://www.bsseogu.go.kr/council/board/list.bsseogu?boardId=BBS_0000097&categoryCode1=8&menuCd=DOM_000000603001000000&contentsSid=785&cpath=%2Fcouncil",
+) -> ScrapResult:
     """부산시 서구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -40,22 +48,22 @@ def scrap_27(url='https://www.bsseogu.go.kr/council/board/list.bsseogu?boardId=B
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    # 프로필 링크 스크랩을 위해 base_url 추출 
+    # 프로필 링크 스크랩을 위해 base_url 추출
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for profile in soup.find_all('div', class_='intro'):
-        name_tag = profile.find_next('span').find_next('span')
+    for profile in soup.find_all("div", class_="intro"):
+        name_tag = profile.find_next("span").find_next("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
-        party = '정당 정보 없음'
+        party = "정당 정보 없음"
 
         # 프로필보기 링크 가져오기
-        profile_link = profile.find('a')
+        profile_link = profile.find("a")
         if profile_link:
-            profile_url = base_url + '/council' + profile_link['href']
+            profile_url = base_url + "/council" + profile_link["href"]
             profile_soup = get_soup(profile_url, verify=False)
 
-            party_info = profile_soup.find('span', string='소속정당')
+            party_info = profile_soup.find("span", string="소속정당")
             if party_info and (party_span := party_info.parent) is not None:
                 party = party_span.text[4:].strip()
 
@@ -64,11 +72,13 @@ def scrap_27(url='https://www.bsseogu.go.kr/council/board/list.bsseogu?boardId=B
     return ScrapResult(
         council_id="busan-seogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_28(url='https://www.bsdonggu.go.kr/council/index.donggu?menuCd=DOM_000000502004000000') -> ScrapResult:
+def scrap_28(
+    url="https://www.bsdonggu.go.kr/council/index.donggu?menuCd=DOM_000000502004000000",
+) -> ScrapResult:
     """부산시 동구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -77,25 +87,25 @@ def scrap_28(url='https://www.bsdonggu.go.kr/council/index.donggu?menuCd=DOM_000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='council_box'):
-        name_tag = profile.find_next('span', class_='n2')
+    for profile in soup.find_all("div", class_="council_box"):
+        name_tag = profile.find_next("span", class_="n2")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find_next('span', class_='n1')
+        party = "정당 정보 없음"
+        party_info = profile.find_next("span", class_="n1")
         if party_info:
-            party = party_info.get_text(strip=True).split('(')[1][:-1].strip()
+            party = party_info.get_text(strip=True).split("(")[1][:-1].strip()
 
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="busan-donggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
    )
 
 
-def scrap_29(url='https://www.yeongdo.go.kr/council/01211/01212.web') -> ScrapResult:
+def scrap_29(url="https://www.yeongdo.go.kr/council/01211/01212.web") -> ScrapResult:
     """부산시 영도구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -104,11 +114,15 @@ def scrap_29(url='https://www.yeongdo.go.kr/council/01211/01212.web') -> ScrapRe
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='even-grid gap3pct panel1 p01205bg'):
-        name_tag = profile.find_next('strong', class_='h1 title')
-        name = name_tag.get_text(strip=True).split(' ')[0].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find_all("div", class_="even-grid gap3pct panel1 p01205bg"):
+        name_tag = profile.find_next("strong", class_="h1 title")
+        name = (
+            name_tag.get_text(strip=True).split(" ")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
+        party = "정당 정보 없음"
         # TODO
 
         councilors.append(Councilor(name=name, party=party))
@@ -116,51 +130,57 @@ def scrap_29(url='https://www.yeongdo.go.kr/council/01211/01212.web') -> ScrapRe
     return ScrapResult(
         council_id="busan-yeongdogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_30(url='https://council.busanjin.go.kr/content/member/member.html') -> ScrapResult:
+def scrap_30(
+    url="https://council.busanjin.go.kr/content/member/member.html",
+) -> ScrapResult:
     """부산시 부산진구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False).find('ul', class_='mlist')
+    soup = get_soup(url, verify=False).find("ul", class_="mlist")
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl'):
-        name_tag = profile.find('dd', class_='name')
+    for profile in soup.find_all("dl"):
+        name_tag = profile.find("dd", class_="name")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find_all('b')[2]
+        party = "정당 정보 없음"
+        party_info = profile.find_all("b")[2]
         if party_info:
-            party = party_info.find_next('span', class_='itemContent').get_text(strip=True)
+            party = party_info.find_next("span", class_="itemContent").get_text(
+                strip=True
+            )
 
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="busan-busanjingu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_31(url='http://council.dongnae.go.kr/source/kr/member/active.html') -> ScrapResult:
+def scrap_31(
+    url="http://council.dongnae.go.kr/source/kr/member/active.html",
+) -> ScrapResult:
     """부산시 동래구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False, encoding='euc-kr')
+    soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
 
-    for name_tag in soup.find_all('li', class_='name'):
+    for name_tag in soup.find_all("li", class_="name"):
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = name_tag.find_next('li').find_next('li')
+        party = "정당 정보 없음"
+        party_info = name_tag.find_next("li").find_next("li")
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -169,11 +189,11 @@ def scrap_31(url='http://council.dongnae.go.kr/source/kr/member/active.html') ->
     return ScrapResult(
         council_id="busan-dongnaegu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_32(url='https://council.bsnamgu.go.kr/kr/member/active') -> ScrapResult:
+def scrap_32(url="https://council.bsnamgu.go.kr/kr/member/active") -> ScrapResult:
     """부산시 남구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -182,25 +202,32 @@ def scrap_32(url='https://council.bsnamgu.go.kr/kr/member/active') -> ScrapResul
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl', class_='profile'):
-        name_tag = profile.find('strong')
+    for profile in soup.find_all("dl", class_="profile"):
+        name_tag = profile.find("strong")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', class_='sbj', string='정 당')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", class_="sbj", string="정 당")
         if party_info:
-            party = party_info.find_next('span', class_='detail').get_text(strip=True).split()[-1].strip()
+            party = (
+                party_info.find_next("span", class_="detail")
+                .get_text(strip=True)
+                .split()[-1]
+                .strip()
+            )
 
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="busan-namgu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_33(url='https://www.bsbukgu.go.kr/council/index.bsbukgu?menuCd=DOM_000000808001001000') -> ScrapResult:
+def scrap_33(
+    url="https://www.bsbukgu.go.kr/council/index.bsbukgu?menuCd=DOM_000000808001001000",
+) -> ScrapResult:
     """부산시 북구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -209,12 +236,12 @@ def scrap_33(url='https://www.bsbukgu.go.kr/council/index.bsbukgu?menuCd=DOM_000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl', class_='info'):
-        name_tag = profile.find('span')
+    for profile in soup.find_all("dl", class_="info"):
+        name_tag = profile.find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', string='소속정당')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", string="소속정당")
         if party_info:
             party = party_info.parent.get_text(strip=True).split()[-1].strip()
 
@@ -223,33 +250,35 @@ def scrap_33(url='https://www.bsbukgu.go.kr/council/index.bsbukgu?menuCd=DOM_000
     return ScrapResult(
         council_id="busan-bukgu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_34(url='https://council.haeundae.go.kr/board/list.do?boardId=BBS_0000096&categoryCode1=08&menuCd=DOM_000000702001001000&contentsSid=330') -> ScrapResult:
+def scrap_34(
+    url="https://council.haeundae.go.kr/board/list.do?boardId=BBS_0000096&categoryCode1=08&menuCd=DOM_000000702001001000&contentsSid=330",
+) -> ScrapResult:
     """부산시 해운대구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False).find('div', class_='initial_list')
+    soup = get_soup(url, verify=False).find("div", class_="initial_list")
     councilors: list[Councilor] = []
 
     # 프로필 링크 스크랩을 위해 base_url 추출
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for name_tag in soup.find_all('dd'):
+    for name_tag in soup.find_all("dd"):
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
         # 프로필보기 링크 가져오기
-        profile_link = name_tag.find('a')
+        profile_link = name_tag.find("a")
         if profile_link:
-            profile_url = base_url + profile_link['href']
+            profile_url = base_url + profile_link["href"]
             profile_soup = get_soup(profile_url, verify=False)
 
-            party_info = profile_soup.find('span', string='소속정당')
+            party_info = profile_soup.find("span", string="소속정당")
             if party_info and (party_span := party_info.parent) is not None:
                 party = party_span.text[4:].strip()
 
@@ -258,26 +287,28 @@ def scrap_34(url='https://council.haeundae.go.kr/board/list.do?boardId=BBS_00000
     return ScrapResult(
         council_id="busan-haeundaegu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_35(url='https://council.gijang.go.kr/source/korean/member/active.html') -> ScrapResult:
+def scrap_35(
+    url="https://council.gijang.go.kr/source/korean/member/active.html",
+) -> ScrapResult:
     """부산시 기장군 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False, encoding='euc-kr')
+    soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('ul', class_='wulli bul02'):
-        li_tags = profile.find_all('li')
+    for profile in soup.find_all("ul", class_="wulli bul02"):
+        li_tags = profile.find_all("li")
 
         name_tag = li_tags[0]
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
+        party = "정당 정보 없음"
         party_info = li_tags[2]
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -287,11 +318,13 @@ def scrap_35(url='https://council.gijang.go.kr/source/korean/member/active.html'
     return ScrapResult(
         council_id="busan-gijanggun",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_36(url='https://www.saha.go.kr/council/congressMember/list03.do?mId=0403000000') -> ScrapResult:
+def scrap_36(
+    url="https://www.saha.go.kr/council/congressMember/list03.do?mId=0403000000",
+) -> ScrapResult:
     """부산시 사하구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -300,12 +333,12 @@ def scrap_36(url='https://www.saha.go.kr/council/congressMember/list03.do?mId=04
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for district_tag in soup.find_all('div', class_='list_member'):
-        for name_tag in district_tag.find_all('h4', class_='name'):
+    for district_tag in soup.find_all("div", class_="list_member"):
+        for name_tag in district_tag.find_all("h4", class_="name"):
             name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-            party = '정당 정보 없음'
-            party_info = name_tag.find_next('span', string='소속당 : ')
+            party = "정당 정보 없음"
+            party_info = name_tag.find_next("span", string="소속당 : ")
             if party_info:
                 party = party_info.parent.get_text(strip=True)[7:].strip()
 
@@ -314,27 +347,29 @@ def scrap_36(url='https://www.saha.go.kr/council/congressMember/list03.do?mId=04
     return ScrapResult(
         council_id="busan-sahagu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_37(url='https://council.geumjeong.go.kr/index.geumj?menuCd=DOM_000000716001000000') -> ScrapResult:
+def scrap_37(
+    url="https://council.geumjeong.go.kr/index.geumj?menuCd=DOM_000000716001000000",
+) -> ScrapResult:
     """부산시 금정구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False).find('div', class_='council_list')
+    soup = get_soup(url, verify=False).find("div", class_="council_list")
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('a'):
-        name_tag = profile.find('span', class_='tit').find('span')
+    for profile in soup.find_all("a"):
+        name_tag = profile.find("span", class_="tit").find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        profile_url = profile['href'][:65] + '1' + profile['href'][66:]
+        profile_url = profile["href"][:65] + "1" + profile["href"][66:]
         profile_soup = get_soup(profile_url, verify=False)
 
-        party_info = profile_soup.find('span', class_='name', string='정당')
+        party_info = profile_soup.find("span", class_="name", string="정당")
         if party_info and (party_span := party_info.parent) is not None:
             party = party_span.text[2:].strip()
 
@@ -343,11 +378,13 @@ def scrap_37(url='https://council.geumjeong.go.kr/index.geumj?menuCd=DOM_0000007
     return ScrapResult(
         council_id="busan-geumjeonggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_38(url='https://www.bsgangseo.go.kr/council/contents.do?mId=0203000000') -> ScrapResult:
+def scrap_38(
+    url="https://www.bsgangseo.go.kr/council/contents.do?mId=0203000000",
+) -> ScrapResult:
     """부산시 강서구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -356,14 +393,16 @@ def scrap_38(url='https://www.bsgangseo.go.kr/council/contents.do?mId=0203000000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile_img in soup.find_all('button', class_='btn_close'):
-        profile = profile_img.find_next('dl')
+    for profile_img in soup.find_all("button", class_="btn_close"):
+        profile = profile_img.find_next("dl")
 
-        name_tag = profile.find('dd', class_='name')
-        name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
+        name_tag = profile.find("dd", class_="name")
+        name = (
+            name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', class_='bold', string='정당 : ')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", class_="bold", string="정당 : ")
         if party_info:
             party = party_info.parent.get_text(strip=True)[5:].strip()
 
@@ -372,11 +411,13 @@ def scrap_38(url='https://www.bsgangseo.go.kr/council/contents.do?mId=0203000000
     return ScrapResult(
         council_id="busan-gangseogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_39(url='https://www.yeonje.go.kr/council/assemblyIntro/list.do?mId=0201000000') -> ScrapResult:
+def scrap_39(
+    url="https://www.yeonje.go.kr/council/assemblyIntro/list.do?mId=0201000000",
+) -> ScrapResult:
     """부산시 연제구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -385,11 +426,11 @@ def scrap_39(url='https://www.yeonje.go.kr/council/assemblyIntro/list.do?mId=020
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl', class_='info'):
-        name_tag = profile.find('span')
+    for profile in soup.find_all("dl", class_="info"):
+        name_tag = profile.find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당정보없음'
+        party = "정당정보없음"
         # TODO
 
@@ -398,11 +439,13 @@ def scrap_39(url='https://www.yeonje.go.kr/council/assemblyIntro/list.do?mId=020
     return ScrapResult(
         council_id="busan-yeonjegu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_40(url='https://www.suyeong.go.kr/council/index.suyeong?menuCd=DOM_000001402001001000&link=success&cpath=%2Fcouncil') -> ScrapResult:
+def scrap_40(
+    url="https://www.suyeong.go.kr/council/index.suyeong?menuCd=DOM_000001402001001000&link=success&cpath=%2Fcouncil",
+) -> ScrapResult:
     """부산시 수영구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -411,12 +454,12 @@ def scrap_40(url='https://www.suyeong.go.kr/council/index.suyeong?menuCd=DOM_000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='mem_info'):
+    for profile in soup.find_all("div", class_="mem_info"):
-        name_tag = profile.find('span', class_='name').find('span')
+        name_tag = profile.find("span", class_="name").find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', string='소속정당 :')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", string="소속정당 :")
         if party_info:
             party = party_info.parent.get_text(strip=True)[6:].strip()
 
@@ -425,11 +468,13 @@ def scrap_40(url='https://www.suyeong.go.kr/council/index.suyeong?menuCd=DOM_000
     return ScrapResult(
         council_id="busan-suyeonggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_41(url='https://www.sasang.go.kr/council/index.sasang?menuCd=DOM_000000202005000000') -> ScrapResult:
+def scrap_41(
+    url="https://www.sasang.go.kr/council/index.sasang?menuCd=DOM_000000202005000000",
+) -> ScrapResult:
     """부산시 사상구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -438,24 +483,28 @@ def scrap_41(url='https://www.sasang.go.kr/council/index.sasang?menuCd=DOM_00000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for district in soup.find_all('ul', class_='council_list'):
-        for profile in district.find_all('li'):
-            name_tag = profile.find('span', class_='tit')
-            name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
-
-            party = '정당 정보 없음'
-            party_info = profile.find('span', class_='con')
+    for district in soup.find_all("ul", class_="council_list"):
+        for profile in district.find_all("li"):
+            name_tag = profile.find("span", class_="tit")
+            name = (
+                name_tag.get_text(strip=True).split()[0].strip()
+                if name_tag
+                else "이름 정보 없음"
+            )
+
+            party = "정당 정보 없음"
+            party_info = profile.find("span", class_="con")
             if party_info:
-                party = party_info.get_text(strip=True).split(']')[0].strip()[1:]
+                party = party_info.get_text(strip=True).split("]")[0].strip()[1:]
 
             councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="busan-sasanggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-if __name__ == '__main__':
-    print(scrap_41())
\ No newline at end of file
+if __name__ == "__main__":
+    print(scrap_41())
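[Editor's note] Several scrapers in busan.py build profile URLs by string concatenation (`base_url + profile_link["href"]`; scrap_27 even splices in an extra "/council" segment), which breaks if an href ever arrives absolute or in an unexpected relative form. `urllib.parse.urljoin` handles both cases; a sketch, with the page URL taken from scrap_27 and the hrefs invented for illustration:

```python
from urllib.parse import urljoin

page = "https://www.bsseogu.go.kr/council/board/list.bsseogu"
print(urljoin(page, "/council/board/view.bsseogu?uid=1"))  # root-relative href
print(urljoin(page, "https://example.org/profile?uid=1"))  # absolute href passes through
```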
diff --git a/scrap/local_councils/daegu.py b/scrap/local_councils/daegu.py
index a11baac..f565e11 100644
--- a/scrap/local_councils/daegu.py
+++ b/scrap/local_councils/daegu.py
@@ -4,21 +4,25 @@
 from scrap.utils.requests import get_soup
 
 
-def scrap_42(url='https://junggucouncil.daegu.kr/source/main03/main01.html?d_th=8') -> ScrapResult:
+def scrap_42(
+    url="https://junggucouncil.daegu.kr/source/main03/main01.html?d_th=8",
+) -> ScrapResult:
     """대전시 중구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False, encoding='euc-kr')
+    soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='profile'):
-        name_tag = profile.find('li', class_='name')
-        name = name_tag.get_text(strip=True).split()[1].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("li", class_="name")
+        name = (
+            name_tag.get_text(strip=True).split()[1].strip() if name_tag else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = name_tag.find_next('li').find_next('li')
+        party = "정당 정보 없음"
+        party_info = name_tag.find_next("li").find_next("li")
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -27,11 +31,13 @@ def scrap_42(url='https://junggucouncil.daegu.kr/source/main03/main01.html?d_th=
     return ScrapResult(
         council_id="daejeon-junggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_43(url='https://www.donggucl.daegu.kr/content/member/member.html') -> ScrapResult:
+def scrap_43(
+    url="https://www.donggucl.daegu.kr/content/member/member.html",
+) -> ScrapResult:
     """대전시 동구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -39,22 +45,26 @@ def scrap_43(url='https://www.donggucl.daegu.kr/content/member/member.html') ->
     """
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
-    
-    # 프로필 링크 스크랩을 위해 base_url 추출 
+
+    # 프로필 링크 스크랩을 위해 base_url 추출
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for name_tag in soup.find_all('dd', class_='name'):
-        name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음"
-        party = '정당 정보 없음'
+    for name_tag in soup.find_all("dd", class_="name"):
+        name = (
+            name_tag.get_text(strip=True).split("(")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
+        party = "정당 정보 없음"
 
-        profile_link = name_tag.find_next('a', class_='abtn_profile')
+        profile_link = name_tag.find_next("a", class_="abtn_profile")
         if profile_link:
-            profile_url = base_url + profile_link['href']
+            profile_url = base_url + profile_link["href"]
             profile_soup = get_soup(profile_url, verify=False)
 
-            party_info = profile_soup.find('th', scope='row', string='소속정당')
-            if party_info and (party_span := party_info.find_next('td')) is not None:
+            party_info = profile_soup.find("th", scope="row", string="소속정당")
+            if party_info and (party_span := party_info.find_next("td")) is not None:
                 party = party_span.get_text(strip=True)
 
         councilors.append(Councilor(name=name, party=party))
@@ -62,11 +72,11 @@ def scrap_43(url='https://www.donggucl.daegu.kr/content/member/member.html') ->
     return ScrapResult(
         council_id="daejeon-donggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_44(url='https://www.dgscouncil.go.kr/kr/member/active') -> ScrapResult:
+def scrap_44(url="https://www.dgscouncil.go.kr/kr/member/active") -> ScrapResult:
     """대전시 서구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -75,12 +85,16 @@ def scrap_44(url='https://www.dgscouncil.go.kr/kr/member/active') -> ScrapResult
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl', class_='profile'):
-        name_tag = profile.find('strong', class_='name')
-        name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find_all("dl", class_="profile"):
+        name_tag = profile.find("strong", class_="name")
+        name = (
+            name_tag.get_text(strip=True).split("(")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = profile.find('li').find_next('li').find_next('li')
+        party = "정당 정보 없음"
+        party_info = profile.find("li").find_next("li").find_next("li")
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -89,11 +103,13 @@ def scrap_44(url='https://www.dgscouncil.go.kr/kr/member/active') -> ScrapResult
     return ScrapResult(
         council_id="daejeon-seogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_45(url='https://nam.daegu.kr/council/index.do?menu_id=00000548') -> ScrapResult:
+def scrap_45(
+    url="https://nam.daegu.kr/council/index.do?menu_id=00000548",
+) -> ScrapResult:
     """대전시 남구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -102,12 +118,14 @@ def scrap_45(url='https://nam.daegu.kr/council/index.do?menu_id=00000548') -> Sc
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='profile'):
-        name_tag = profile.find('span', class_='name2')
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("span", class_="name2")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', class_='name', string='소속정당').find_next('span', class_='name3')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", class_="name", string="소속정당").find_next(
+            "span", class_="name3"
+        )
         if party_info:
             party = party_info.get_text(strip=True)
 
@@ -116,11 +134,11 @@ def scrap_45(url='https://nam.daegu.kr/council/index.do?menu_id=00000548') -> Sc
     return ScrapResult(
         council_id="daejeon-namgu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_46(url='https://bukgucouncil.daegu.kr/kr/member/name.do') -> ScrapResult:
+def scrap_46(url="https://bukgucouncil.daegu.kr/kr/member/name.do") -> ScrapResult:
     """대전시 북구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -129,12 +147,14 @@ def scrap_46(url='https://bukgucouncil.daegu.kr/kr/member/name.do') -> ScrapResu
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='profile'):
-        name_tag = profile.find('em', class_='name')
-        name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("em", class_="name")
+        name = (
+            name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = profile.find('em', string='소속정당 : ').find_next('span')
+        party = "정당 정보 없음"
+        party_info = profile.find("em", string="소속정당 : ").find_next("span")
         if party_info:
             party = party_info.get_text(strip=True)
 
@@ -143,11 +163,13 @@ def scrap_46(url='https://bukgucouncil.daegu.kr/kr/member/name.do') -> ScrapResu
     return ScrapResult(
         council_id="daejeon-bukgu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_47(url='https://suseongcouncil.suseong.kr/ss_council/content/?pos=active&me_code=2010') -> ScrapResult:
+def scrap_47(
+    url="https://suseongcouncil.suseong.kr/ss_council/content/?pos=active&me_code=2010",
+) -> ScrapResult:
     """대전시 수성구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -156,12 +178,12 @@ def scrap_47(url='https://suseongcouncil.suseong.kr/ss_council/content/?pos=acti
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='item'):
-        name_tag = profile.find('p', class_='name').find('span')
+    for profile in soup.find_all("div", class_="item"):
+        name_tag = profile.find("p", class_="name").find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find_all('li')[2].find('span')
+        party = "정당 정보 없음"
+        party_info = profile.find_all("li")[2].find("span")
         if party_info:
             party = party_info.get_text(strip=True)
 
@@ -170,11 +192,13 @@ def scrap_47(url='https://suseongcouncil.suseong.kr/ss_council/content/?pos=acti
     return ScrapResult(
         council_id="daejeon-suseonggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_48(url='https://www.dalseocouncil.daegu.kr/content/member/member.html') -> ScrapResult:
+def scrap_48(
+    url="https://www.dalseocouncil.daegu.kr/content/member/member.html",
+) -> ScrapResult:
     """대전시 달서구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -183,11 +207,15 @@ def scrap_48(url='https://www.dalseocouncil.daegu.kr/content/member/member.html'
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for name_tag in soup.find_all('dd', class_='name'):
-        name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음"
+    for name_tag in soup.find_all("dd", class_="name"):
+        name = (
+            name_tag.get_text(strip=True).split("(")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = name_tag.find_next('span', string='소속정당').parent
+        party = "정당 정보 없음"
+        party_info = name_tag.find_next("span", string="소속정당").parent
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -196,11 +224,13 @@ def scrap_48(url='https://www.dalseocouncil.daegu.kr/content/member/member.html'
     return ScrapResult(
         council_id="daejeon-dalseogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_49(url='https://council.dalseong.go.kr/content/member/member.html') -> ScrapResult:
+def scrap_49(
+    url="https://council.dalseong.go.kr/content/member/member.html",
+) -> ScrapResult:
     """대전시 달성군 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -213,27 +243,35 @@ def scrap_49(url='https://council.dalseong.go.kr/content/member/member.html') ->
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for name_tag in soup.find_all('dd', class_='name'):
-        name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음"
-        party = '정당 정보 없음'
+    for name_tag in soup.find_all("dd", class_="name"):
+        name = (
+            name_tag.get_text(strip=True).split("(")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
+        party = "정당 정보 없음"
 
-        profile_link = name_tag.find_next('a', class_='abtn1')
+        profile_link = name_tag.find_next("a", class_="abtn1")
         if profile_link:
-            profile_url = base_url + profile_link['href']
+            profile_url = base_url + profile_link["href"]
             profile_soup = get_soup(profile_url, verify=False)
 
-            party_info = profile_soup.find('span', class_='item', string='소속정당')
-            if party_info and (party_span := party_info.find_next('span', class_='item_content')) is not None:
+            party_info = profile_soup.find("span", class_="item", string="소속정당")
+            if (
+                party_info
+                and (party_span := party_info.find_next("span", class_="item_content"))
+                is not None
+            ):
                 party = party_span.get_text(strip=True)
-        
+
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="daejeon-dalseonggun",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-if __name__ == '__main__':
-    print(scrap_49())
\ No newline at end of file
+if __name__ == "__main__":
+    print(scrap_49())
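[Review note] Every docstring and council_id in scrap/local_councils/daegu.py reads "대전시 (Daejeon)"/"daejeon-*" even though its URLs (junggucouncil.daegu.kr, nam.daegu.kr, ...) are Daegu councils, and daejeon.py below reuses the same ids such as "daejeon-junggu" and "daejeon-seogu". Black only reformats, so the collision survives this commit. A hypothetical consistency check that would currently fail and flag the mix-up (the expected prefix is illustrative):

```python
# Hypothetical sanity check -- not part of this diff.
from scrap.local_councils import daegu

result = daegu.scrap_42()  # scrapes junggucouncil.daegu.kr
assert result.council_id.startswith("daegu-"), (
    f"daegu.py scraper reports council_id {result.council_id!r}"
)
```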
-18,20 +19,23 @@ def scrap_65(url = 'https://council.donggu.go.kr/kr/member/active') -> ScrapResu parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in soup.find_all('dl', class_='profile'): + for profile in soup.find_all("dl", class_="profile"): name_tag = profile.find("strong", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_link = profile.find('a', class_='start') + profile_link = profile.find("a", class_="start") if profile_link: - data_uid = profile_link.get('data-uid') + data_uid = profile_link.get("data-uid") if data_uid: - profile_url = base_url + f'/kr/member/profile_popup?uid={data_uid}' + profile_url = base_url + f"/kr/member/profile_popup?uid={data_uid}" profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('strong', string='정 당') - if party_info and (party_span := party_info.find_next('span')) is not None: + party_info = profile_soup.find("strong", string="정 당") + if ( + party_info + and (party_span := party_info.find_next("span")) is not None + ): party = party_span.text councilors.append(Councilor(name=name, party=party)) @@ -39,19 +43,20 @@ def scrap_65(url = 'https://council.donggu.go.kr/kr/member/active') -> ScrapResu return ScrapResult( council_id="daejeon-donggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_66(url = 'https://council.djjunggu.go.kr/kr/member/name.do') -> ScrapResult: - '''대전시 중구 페이지에서 의원 상세약력 스크랩 + +def scrap_66(url="https://council.djjunggu.go.kr/kr/member/name.do") -> ScrapResult: + """대전시 중구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): + for profile in soup.find_all("div", class_="profile"): name_tag = profile.find("div", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" @@ -64,24 +69,29 @@ def scrap_66(url = 'https://council.djjunggu.go.kr/kr/member/name.do') -> ScrapR return ScrapResult( council_id="daejeon-junggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_67(url = 'https://www.seogucouncil.daejeon.kr/svc/mbr/MbrPresent.do') -> ScrapResult: - '''대전시 서구 페이지에서 의원 상세약력 스크랩 + +def scrap_67( + url="https://www.seogucouncil.daejeon.kr/svc/mbr/MbrPresent.do", +) -> ScrapResult: + """대전시 서구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('dl'): + for profile in soup.find_all("dl"): name_tag = profile.find("dd", class_="name") - name = name_tag.get_text(strip=True).replace(" 의원", "") if name_tag else "이름 정보 없음" + name = ( + name_tag.get_text(strip=True).replace(" 의원", "") if name_tag else "이름 정보 없음" + ) party = "정당 정보 없음" - party_info = list(filter(lambda x: '정당' in str(x), profile.find_all("dd"))) + party_info = list(filter(lambda x: "정당" in str(x), profile.find_all("dd"))) if party_info: party = party_info[0].get_text(strip=True).replace("정당: ", "") @@ -90,25 +100,26 @@ def scrap_67(url = 'https://www.seogucouncil.daejeon.kr/svc/mbr/MbrPresent.do') return ScrapResult( council_id="daejeon-seogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_68(url = 
'https://yuseonggucouncil.go.kr/page/page02_01_01.php') -> ScrapResult: - '''대전시 유성구 페이지에서 의원 상세약력 스크랩 + +def scrap_68(url="https://yuseonggucouncil.go.kr/page/page02_01_01.php") -> ScrapResult: + """대전시 유성구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): + for profile in soup.find_all("div", class_="profile"): name_tag = profile.find("em", class_="name") # () 안에 있는 한자를 제거 (ex. 김영희(金英姬) -> 김영희) - name = name_tag.get_text(strip=True).split('(')[0] if name_tag else "이름 정보 없음" + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" party = "정당 정보 없음" - regex_pattern = re.compile(r'정\s*당\s*:', re.IGNORECASE) # Case-insensitive + regex_pattern = re.compile(r"정\s*당\s*:", re.IGNORECASE) # Case-insensitive party_info = profile.find("em", string=regex_pattern) if party_info: party = party_info.find_next("span").get_text(strip=True) @@ -117,24 +128,25 @@ def scrap_68(url = 'https://yuseonggucouncil.go.kr/page/page02_01_01.php') -> Sc return ScrapResult( council_id="daejeon-yuseonggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_69(url = 'https://council.daedeok.go.kr/kr/member/name.do') -> ScrapResult: - '''대전시 대덕구 페이지에서 의원 상세약력 스크랩 + +def scrap_69(url="https://council.daedeok.go.kr/kr/member/name.do") -> ScrapResult: + """대전시 대덕구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): + for profile in soup.find_all("div", class_="profile"): name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" party = "정당 정보 없음" - regex_pattern = re.compile(r'정\s*당\s*:', re.IGNORECASE) # Case-insensitive + regex_pattern = re.compile(r"정\s*당\s*:", re.IGNORECASE) # Case-insensitive party_info = profile.find("em", string=regex_pattern) if party_info: party = party_info.find_next("span").get_text(strip=True) @@ -143,8 +155,9 @@ def scrap_69(url = 'https://council.daedeok.go.kr/kr/member/name.do') -> ScrapRe return ScrapResult( council_id="daejeon-daedeokgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -if __name__ == '__main__': - print(scrap_69()) \ No newline at end of file + +if __name__ == "__main__": + print(scrap_69()) diff --git a/scrap/local_councils/gwangju.py b/scrap/local_councils/gwangju.py index b34f872..a162b4a 100644 --- a/scrap/local_councils/gwangju.py +++ b/scrap/local_councils/gwangju.py @@ -2,4 +2,4 @@ """ from scrap.utils.types import CouncilType, Councilor, ScrapResult from scrap.utils.requests import get_soup -from scrap.local_councils.basic import * \ No newline at end of file +from scrap.local_councils.basic import * diff --git a/scrap/local_councils/gyeonggi.py b/scrap/local_councils/gyeonggi.py index 7fc2627..8d22ab0 100644 --- a/scrap/local_councils/gyeonggi.py +++ b/scrap/local_councils/gyeonggi.py @@ -4,66 +4,88 @@ from scrap.utils.requests import get_soup from scrap.local_councils.basic import * + def get_profiles_88(soup, element, class_, memberlistelement, memberlistclass_): # 의원 목록 사이트에서 의원 프로필을 가져옴 if memberlistelement is not None: try: soup = soup.find_all(memberlistelement, id=memberlistclass_)[0] except Exception: - raise RuntimeError('[basic.py] 의원 목록 사이트에서 
의원 프로필을 가져오는데 실패했습니다.') + raise RuntimeError("[basic.py] 의원 목록 사이트에서 의원 프로필을 가져오는데 실패했습니다.") return soup.find_all(element, class_) + def get_party_88(profile, element, class_, wrapper_element, wrapper_class_, url): # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴 if wrapper_element is not None: parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" # 프로필보기 링크 가져오기 - profile_link = find(profile, wrapper_element, class_=wrapper_class_).find('a') - profile_url = base_url + profile_link['href'] - profile = get_soup(profile_url, verify=False, encoding='euc-kr') - party_pulp_list = list(filter(lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_))) - if party_pulp_list == []: raise RuntimeError('[basic.py] 정당정보 regex 실패') + profile_link = find(profile, wrapper_element, class_=wrapper_class_).find("a") + profile_url = base_url + profile_link["href"] + profile = get_soup(profile_url, verify=False, encoding="euc-kr") + party_pulp_list = list( + filter( + lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_) + ) + ) + if party_pulp_list == []: + raise RuntimeError("[basic.py] 정당정보 regex 실패") party_pulp = party_pulp_list[0] - party_string = party_pulp.get_text(strip=True).split(' ')[-1] + party_string = party_pulp.get_text(strip=True).split(" ")[-1] while True: if (party := extract_party(party_string)) is not None: return party - if (party_pulp := party_pulp.find_next('span')) is not None: - party_string = party_pulp.text.strip().split(' ')[-1] + if (party_pulp := party_pulp.find_next("span")) is not None: + party_string = party_pulp.text.strip().split(" ")[-1] else: return "[basic.py] 정당 정보 파싱 불가" + def scrap_88(url, args: ScrapBasicArgument) -> ScrapResult: - '''의원 상세약력 스크랩 + """의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :param args: ScrapBasicArgument 객체 :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ cid = 88 - encoding = 'euc-kr' + encoding = "euc-kr" soup = get_soup(url, verify=False, encoding=encoding) councilors: list[Councilor] = [] party_in_main_page = any(keyword in soup.text for keyword in party_keywords) - profiles = get_profiles_88(soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls) - print(cid, '번째 의회에는,', len(profiles), '명의 의원이 있습니다.') # 디버깅용. + profiles = get_profiles_88( + soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls + ) + print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. 
for profile in profiles: - name = get_name(profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls) - party = '' + name = get_name( + profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls + ) + party = "" try: - party = get_party_88(profile, args.pty_elt, args.pty_cls, args.pty_wrapelt, args.pty_wrapcls, url) + party = get_party_88( + profile, + args.pty_elt, + args.pty_cls, + args.pty_wrapelt, + args.pty_wrapcls, + url, + ) except Exception: - party = get_party_easy(profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url) + party = get_party_easy( + profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id=str(cid), council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) + def get_party_103(profile, element, class_, wrapper_element, wrapper_class_, url): # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴 if wrapper_element is not None: @@ -71,41 +93,53 @@ def get_party_103(profile, element, class_, wrapper_element, wrapper_class_, url base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" # 프로필보기 링크 가져오기 profile_link = profile.find(wrapper_element, class_=wrapper_class_) - profile_url = base_url + '/member/' + profile_link['href'] + profile_url = base_url + "/member/" + profile_link["href"] profile = get_soup(profile_url, verify=False) - party_pulp_list = list(filter(lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_))) - if party_pulp_list == []: raise RuntimeError('[basic.py] 정당정보 regex 실패') + party_pulp_list = list( + filter( + lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_) + ) + ) + if party_pulp_list == []: + raise RuntimeError("[basic.py] 정당정보 regex 실패") party_pulp = party_pulp_list[0] - party_string = party_pulp.get_text(strip=True).split(' ')[-1] + party_string = party_pulp.get_text(strip=True).split(" ")[-1] while True: if (party := extract_party(party_string)) is not None: return party - if (party_pulp := party_pulp.find_next('span')) is not None: - party_string = party_pulp.text.strip().split(' ')[-1] + if (party_pulp := party_pulp.find_next("span")) is not None: + party_string = party_pulp.text.strip().split(" ")[-1] else: return "[basic.py] 정당 정보 파싱 불가" + def scrap_103(url, args: ScrapBasicArgument) -> ScrapResult: - '''의원 상세약력 스크랩 + """의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :param args: ScrapBasicArgument 객체 :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ cid = 103 soup = get_soup(url, verify=False) councilors: list[Councilor] = [] party_in_main_page = any(keyword in soup.text for keyword in party_keywords) - profiles = get_profiles_88(soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls) - print(cid, '번째 의회에는,', len(profiles), '명의 의원이 있습니다.') # 디버깅용. + profiles = get_profiles_88( + soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls + ) + print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. 
for profile in profiles: - name = get_name(profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls) - party = get_party_103(profile, args.pty_elt, args.pty_cls, args.pty_wrapelt, args.pty_wrapcls, url) + name = get_name( + profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls + ) + party = get_party_103( + profile, args.pty_elt, args.pty_cls, args.pty_wrapelt, args.pty_wrapcls, url + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id=str(cid), council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors - ) \ No newline at end of file + councilors=councilors, + ) diff --git a/scrap/local_councils/incheon.py b/scrap/local_councils/incheon.py index 2506579..58384c3 100644 --- a/scrap/local_councils/incheon.py +++ b/scrap/local_councils/incheon.py @@ -4,7 +4,8 @@ from scrap.utils.requests import get_soup from scrap.local_councils.basic import * -def scrap_50(url='https://www.icjg.go.kr/council/cnmi0101c') -> ScrapResult: + +def scrap_50(url="https://www.icjg.go.kr/council/cnmi0101c") -> ScrapResult: """인천시 중구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -13,58 +14,61 @@ def scrap_50(url='https://www.icjg.go.kr/council/cnmi0101c') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for name_tag in soup.find_all('p', class_='name'): - name_tag_str = name_tag.get_text(strip=True).split('[') + for name_tag in soup.find_all("p", class_="name"): + name_tag_str = name_tag.get_text(strip=True).split("[") name = name_tag_str[0].strip() party = name_tag_str[-1][:-1].strip() - + councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="incheon-junggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_51(url='https://council.icdonggu.go.kr/korean/member/active') -> ScrapResult: +def scrap_51(url="https://council.icdonggu.go.kr/korean/member/active") -> ScrapResult: """인천시 동구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - raise Exception('현재 인천시 동구의회 사이트는 SSLV3_ALERT_HANDSHAKE_FAILURE 에러가 발생합니다') + raise Exception("현재 인천시 동구의회 사이트는 SSLV3_ALERT_HANDSHAKE_FAILURE 에러가 발생합니다") # soup = get_soup(url, verify=False) # councilors: list[Councilor] = [] - # # 프로필 링크 스크랩을 위해 base_url 추출 - # parsed_url = urlparse(url) - # base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - # for name_tag in soup.find_all('strong', class_='name'): - # name = name_tag.get_text(strip=True) - # party = '정당 정보 없음' - - # profile_link = name_tag.find_next('a', class_='abtn1') - # if profile_link: - # profile_url = base_url + profile_link['onclick'][13:104] - # profile_soup = get_soup(profile_url, verify=False) - - # party_info = profile_soup.find('span', class_='subject', string='소속정당') - # if party_info and (party_span := party_info.find_next('span', class_='detail')) is not None: - # party = party_span.get_text(strip=True) - - # councilors.append(Councilor(name=name, party=party)) +# # 프로필 링크 스크랩을 위해 base_url 추출 +# parsed_url = urlparse(url) +# base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - # return ScrapResult( - # council_id="incheon-donggu", - # council_type=CouncilType.LOCAL_COUNCIL, - # councilors=councilors - # ) +# for name_tag in soup.find_all('strong', class_='name'): +# name = name_tag.get_text(strip=True) +# party = '정당 정보 없음' + +# profile_link = name_tag.find_next('a', class_='abtn1') +# if profile_link: +# profile_url = base_url + profile_link['onclick'][13:104] +# 
profile_soup = get_soup(profile_url, verify=False) + +# party_info = profile_soup.find('span', class_='subject', string='소속정당') +# if party_info and (party_span := party_info.find_next('span', class_='detail')) is not None: +# party = party_span.get_text(strip=True) +# councilors.append(Councilor(name=name, party=party)) -def scrap_52(url='https://www.michuhol.go.kr/council/introduction/career.asp') -> ScrapResult: +# return ScrapResult( +# council_id="incheon-donggu", +# council_type=CouncilType.LOCAL_COUNCIL, +# councilors=councilors +# ) + + +def scrap_52( + url="https://www.michuhol.go.kr/council/introduction/career.asp", +) -> ScrapResult: """인천시 미추홀구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -72,19 +76,23 @@ def scrap_52(url='https://www.michuhol.go.kr/council/introduction/career.asp') - """ soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - - script = soup.find('div', class_='contents_header').find_next('script').get_text(strip=True) - # TODO + script = ( + soup.find("div", class_="contents_header") + .find_next("script") + .get_text(strip=True) + ) + + # TODO return ScrapResult( council_id="incheon-michuholgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_53(url='https://council.yeonsu.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_53(url="https://council.yeonsu.go.kr/kr/member/name.do") -> ScrapResult: """인천시 연수구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -93,25 +101,27 @@ def scrap_53(url='https://council.yeonsu.go.kr/kr/member/name.do') -> ScrapResul soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('strong') - name = name_tag.get_text(strip=True) if name_tag else '이름 정보 없음' - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당').find_next('span').find_next('span') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("strong") + name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" + + party = "정당 정보 없음" + party_info = ( + profile.find("em", string="소속정당").find_next("span").find_next("span") + ) if party_info: party = party_info.get_text(strip=True) - + councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="incheon-yeonsugu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_54(url='https://council.namdong.go.kr/kr/member/active.do') -> ScrapResult: +def scrap_54(url="https://council.namdong.go.kr/kr/member/active.do") -> ScrapResult: """인천시 남동구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -120,31 +130,31 @@ def scrap_54(url='https://council.namdong.go.kr/kr/member/active.do') -> ScrapRe soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('em', class_='name') - name = name_tag.get_text(strip=True) if name_tag else '이름 정보 없음' - - party = '정당 정보 없음' - party_info = profile.find('em', string='정 당 : ').find_next('span') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") + name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" + + party = "정당 정보 없음" + party_info = profile.find("em", string="정 당 : ").find_next("span") if party_info: party = party_info.get_text(strip=True) - + councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="incheon-namdonggu", 
council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_55(url='https://council.icbp.go.kr/kr/member/active') -> ScrapResult: +def scrap_55(url="https://council.icbp.go.kr/kr/member/active") -> ScrapResult: """인천시 부평구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - raise Exception('현재 인천시 부평구의회 사이트는 SSLV3_ALERT_HANDSHAKE_FAILURE 에러가 발생합니다') + raise Exception("현재 인천시 부평구의회 사이트는 SSLV3_ALERT_HANDSHAKE_FAILURE 에러가 발생합니다") # soup = get_soup(url, verify=False) # councilors: list[Councilor] = [] @@ -152,12 +162,12 @@ def scrap_55(url='https://council.icbp.go.kr/kr/member/active') -> ScrapResult: # for profile in soup.find_all('div', class_='profile'): # name_tag = profile.find('strong', class_='name') # name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else '이름 정보 없음' - + # party = '정당 정보 없음' # party_info = profile.find('strong', string='소속정당').find_next('span') # if party_info: # party = party_info.get_text(strip=True).split()[-1].strip() - + # councilors.append(Councilor(name=name, party=party)) # return ScrapResult( @@ -167,7 +177,9 @@ def scrap_55(url='https://council.icbp.go.kr/kr/member/active') -> ScrapResult: # ) -def scrap_56(url='https://www.gyeyang.go.kr/open_content/council/member/present/present.jsp') -> ScrapResult: +def scrap_56( + url="https://www.gyeyang.go.kr/open_content/council/member/present/present.jsp", +) -> ScrapResult: """인천시 계양구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -176,22 +188,25 @@ def scrap_56(url='https://www.gyeyang.go.kr/open_content/council/member/present/ soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for name_tag in soup.find_all('li', class_='name'): - name = name_tag.get_text(strip=True) if name_tag else '이름 정보 없음' - - party = '정당 정보 없음' - party_info = name_tag.find_next('li').find_next('li').find('span', class_='span_sfont') + for name_tag in soup.find_all("li", class_="name"): + name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" + + party = "정당 정보 없음" + party_info = ( + name_tag.find_next("li").find_next("li").find("span", class_="span_sfont") + ) if party_info: party = party_info.get_text(strip=True) - + councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="incheon-gyeyanggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) + def scrap_57(url, args) -> ScrapResult: """인천시 서구 페이지에서 의원 상세약력 스크랩 @@ -202,23 +217,28 @@ def scrap_57(url, args) -> ScrapResult: councilors: list[Councilor] = [] cid = 57 - profiles = get_profiles(soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls) - print(cid, '번째 의회에는,', len(profiles), '명의 의원이 있습니다.') # 디버깅용. + profiles = get_profiles( + soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls + ) + print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. 
for profile in profiles: - name = get_name(profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls) + name = get_name( + profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls + ) - party = '정당 정보 없음' + party = "정당 정보 없음" party_pulp = find(profile, args.pty_elt, class_=args.pty_cls) - if party_pulp is None: raise AssertionError('[incheon.py] 정당정보 실패') + if party_pulp is None: + raise AssertionError("[incheon.py] 정당정보 실패") party_string = party_pulp.get_text(strip=True) - party_string = party_string.split(' ')[-1].strip() + party_string = party_string.split(" ")[-1].strip() while True: party = extract_party(party_string) if party is not None: break - if (party_pulp := party_pulp.find_next('span')) is not None: - party_string = party_pulp.text.split(' ')[-1] + if (party_pulp := party_pulp.find_next("span")) is not None: + party_string = party_pulp.text.split(" ")[-1] else: raise RuntimeError("[incheon.py] 정당 정보 파싱 불가") @@ -227,8 +247,9 @@ def scrap_57(url, args) -> ScrapResult: return ScrapResult( council_id=str(cid), council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -if __name__ == '__main__': - print(scrap_56()) \ No newline at end of file + +if __name__ == "__main__": + print(scrap_56()) diff --git a/scrap/local_councils/seoul.py b/scrap/local_councils/seoul.py index 82050e4..af74bd7 100644 --- a/scrap/local_councils/seoul.py +++ b/scrap/local_councils/seoul.py @@ -4,39 +4,41 @@ from scrap.utils.requests import get_soup -def scrap_1(url = 'https://bookcouncil.jongno.go.kr/record/recordView.do?key=99784f935fce5c1d7c8c08c2f9e35dda1c0a6128428ecb1a87f87ee2b4e82890ffcf12563e01473f') -> ScrapResult: - '''서울시 종로구 페이지에서 의원 상세약력 스크랩 +def scrap_1( + url="https://bookcouncil.jongno.go.kr/record/recordView.do?key=99784f935fce5c1d7c8c08c2f9e35dda1c0a6128428ecb1a87f87ee2b4e82890ffcf12563e01473f", +) -> ScrapResult: + """서울시 종로구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - - for profile in soup.find_all('div', class_='pop_profile'): + + for profile in soup.find_all("div", class_="pop_profile"): info = profile.find("div", class_="info") data_ul = info.find("ul", class_="detail") data_lis = data_ul.find_all("li") name = data_lis[0].find("span").get_text(strip=True) party = data_lis[2].find("span").get_text(strip=True) name = name if name else "이름 정보 없음" - party = party if party else '정당 정보 없음' + party = party if party else "정당 정보 없음" councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-jongno", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_2(url = 'https://02jgnew.council.or.kr/kr/member/active') -> ScrapResult: - '''서울시 중구 페이지에서 의원 상세약력 스크랩 +def scrap_2(url="https://02jgnew.council.or.kr/kr/member/active") -> ScrapResult: + """서울시 중구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ parliment_soup = get_soup(url, verify=False) councilors: list[Councilor] = [] @@ -44,19 +46,19 @@ def scrap_2(url = 'https://02jgnew.council.or.kr/kr/member/active') -> ScrapResu parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in parliment_soup.find_all('div', class_='profile'): + for profile in parliment_soup.find_all("div", class_="profile"): name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if 
name_tag else "이름 정보 없음" - party = '정당 정보 없음' + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_link = profile.find('a', class_='start') + profile_link = profile.find("a", class_="start") if profile_link: - profile_url = base_url + profile_link['href'] + profile_url = base_url + profile_link["href"] profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('em', string='소속정당 : ') - if party_info and (party_span := party_info.find_next('span')) is not None: + party_info = profile_soup.find("em", string="소속정당 : ") + if party_info and (party_span := party_info.find_next("span")) is not None: party = party_span.text councilors.append(Councilor(name=name, party=party)) @@ -64,7 +66,7 @@ def scrap_2(url = 'https://02jgnew.council.or.kr/kr/member/active') -> ScrapResu return ScrapResult( council_id="seoul-junggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -82,7 +84,7 @@ def scrap_3(url="https://www.yscl.go.kr/kr/member/name.do") -> ScrapResult: name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' + party = "정당 정보 없음" party_info = profile.find("em", string="소속정당") if party_info: party = party_info.find_next("span").get_text(strip=True) @@ -92,11 +94,11 @@ def scrap_3(url="https://www.yscl.go.kr/kr/member/name.do") -> ScrapResult: return ScrapResult( council_id="seoul-yongsangu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_4(url='https://sdcouncil.sd.go.kr/kr/member/active2') -> ScrapResult: +def scrap_4(url="https://sdcouncil.sd.go.kr/kr/member/active2") -> ScrapResult: """서울시 성동구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -106,11 +108,11 @@ def scrap_4(url='https://sdcouncil.sd.go.kr/kr/member/active2') -> ScrapResult: councilors: list[Councilor] = [] for profile in soup.find_all("dl", class_="profile"): - name_tag = profile.find('strong', class_='name') + name_tag = profile.find("strong", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find("strong", string='정 당 : ') + party = "정당 정보 없음" + party_info = profile.find("strong", string="정 당 : ") if party_info: party = party_info.find_next("span").get_text(strip=True) @@ -119,11 +121,11 @@ def scrap_4(url='https://sdcouncil.sd.go.kr/kr/member/active2') -> ScrapResult: return ScrapResult( council_id="seoul-seongdonggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_5(url='https://council.gwangjin.go.kr/kr/member/active') -> ScrapResult: +def scrap_5(url="https://council.gwangjin.go.kr/kr/member/active") -> ScrapResult: """서울시 광진구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -132,51 +134,57 @@ def scrap_5(url='https://council.gwangjin.go.kr/kr/member/active') -> ScrapResul soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_=lambda x: x in ('profile', 'profile_none')): - name_tag = profile.find('strong') + for profile in soup.find_all( + "div", class_=lambda x: x in ("profile", "profile_none") + ): + name_tag = profile.find("strong") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find("em", string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").find_next('span').get_text(strip=True) + party = 
party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gwangjingu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_6(url='http://council.ddm.go.kr/citizen/menu1.asp') -> ScrapResult: +def scrap_6(url="http://council.ddm.go.kr/citizen/menu1.asp") -> ScrapResult: """서울시 동대문구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - parliment_soup = get_soup(url, verify=False, encoding='euc-kr') + parliment_soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] # 프로필 링크 스크랩을 위해 base_url 추출 parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in parliment_soup.find_all('div', class_='intro_text tm_lg_6'): - name = profile.find('p', class_='intro_text_title').string.strip().split(' ')[0] - party = '정당 정보 없음' + for profile in parliment_soup.find_all("div", class_="intro_text tm_lg_6"): + name = profile.find("p", class_="intro_text_title").string.strip().split(" ")[0] + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_link = profile.find('a') + profile_link = profile.find("a") if profile_link: - profile_url = base_url + '/assemblyman/greeting/menu02.asp?assembly_id=' + profile_link['href'][1:] - profile_soup = get_soup(profile_url, verify=False, encoding='euc-kr') - - profile_info = profile_soup.find('div', class_='profileTxt') + profile_url = ( + base_url + + "/assemblyman/greeting/menu02.asp?assembly_id=" + + profile_link["href"][1:] + ) + profile_soup = get_soup(profile_url, verify=False, encoding="euc-kr") + + profile_info = profile_soup.find("div", class_="profileTxt") if profile_info: - profile_string = profile_info.get_text().strip().split('\xa0') - idx = profile_string.index('소속정당') + profile_string = profile_info.get_text().strip().split("\xa0") + idx = profile_string.index("소속정당") party = profile_string[idx + 2] councilors.append(Councilor(name=name, party=party)) @@ -184,11 +192,11 @@ def scrap_6(url='http://council.ddm.go.kr/citizen/menu1.asp') -> ScrapResult: return ScrapResult( council_id="seoul-dongdaemungu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_7(url='https://council.jungnang.go.kr/kr/member/name2.do') -> ScrapResult: +def scrap_7(url="https://council.jungnang.go.kr/kr/member/name2.do") -> ScrapResult: """서울시 중랑구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -197,25 +205,25 @@ def scrap_7(url='https://council.jungnang.go.kr/kr/member/name2.do') -> ScrapRes soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find("em", string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-jungnanggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_8(url='https://www.sbc.go.kr/kr/member/active.do') -> 
ScrapResult: +def scrap_8(url="https://www.sbc.go.kr/kr/member/active.do") -> ScrapResult: """서울시 성북구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -224,25 +232,27 @@ def scrap_8(url='https://www.sbc.go.kr/kr/member/active.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find("em", string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").get_text(strip=True).split(' ')[-1].strip() + party = ( + party_info.find_next("span").get_text(strip=True).split(" ")[-1].strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-seongbukgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_9(url='https://council.gangbuk.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_9(url="https://council.gangbuk.go.kr/kr/member/name.do") -> ScrapResult: """서울시 강북구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -251,25 +261,31 @@ def scrap_9(url='https://council.gangbuk.go.kr/kr/member/name.do') -> ScrapResul soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gangbukgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_10(url='https://www.council-dobong.seoul.kr/kr/member/active.do') -> ScrapResult: +def scrap_10( + url="https://www.council-dobong.seoul.kr/kr/member/active.do", +) -> ScrapResult: """서울시 도봉구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -278,25 +294,27 @@ def scrap_10(url='https://www.council-dobong.seoul.kr/kr/member/active.do') -> S soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").get_text(strip=True).split(' ')[-1].strip() + party = ( + party_info.find_next("span").get_text(strip=True).split(" ")[-1].strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-dobonggu", 
council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_11(url='https://council.nowon.kr/kr/member/active.do') -> ScrapResult: +def scrap_11(url="https://council.nowon.kr/kr/member/active.do") -> ScrapResult: """서울시 노원구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -305,25 +323,27 @@ def scrap_11(url='https://council.nowon.kr/kr/member/active.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").get_text(strip=True).split(' ')[-1].strip() + party = ( + party_info.find_next("span").get_text(strip=True).split(" ")[-1].strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-nowongu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_12(url='https://council.ep.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_12(url="https://council.ep.go.kr/kr/member/name.do") -> ScrapResult: """서울시 은평구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -332,52 +352,65 @@ def scrap_12(url='https://council.ep.go.kr/kr/member/name.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-eunpyeonggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_13(url='https://www.sdmcouncil.go.kr/source/korean/square/ascending.html') -> ScrapResult: +def scrap_13( + url="https://www.sdmcouncil.go.kr/source/korean/square/ascending.html", +) -> ScrapResult: """서울시 서대문구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False, encoding='euc-kr') + soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] - for profile in soup.find_all('dl', class_='card_desc'): - name_tag = profile.find_next('dt') + for profile in soup.find_all("dl", class_="card_desc"): + name_tag = profile.find_next("dt") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('ul') + party = "정당 정보 없음" + party_info = profile.find("ul") if party_info: - party = party_info.find_next('li').find_next('li').find_next('li').get_text(strip=True).split(' ')[-1].strip() + party = ( + 
party_info.find_next("li") + .find_next("li") + .find_next("li") + .get_text(strip=True) + .split(" ")[-1] + .strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-seodaemungu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_14(url='https://council.mapo.seoul.kr/kr/member/active.do') -> ScrapResult: +def scrap_14(url="https://council.mapo.seoul.kr/kr/member/active.do") -> ScrapResult: """서울시 마포구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -386,25 +419,25 @@ def scrap_14(url='https://council.mapo.seoul.kr/kr/member/active.do') -> ScrapRe soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='wrap'): - name_tag = profile.find_next('div', class_='right') - name = name_tag.find_next('h4').get_text(strip=True) if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="wrap"): + name_tag = profile.find_next("div", class_="right") + name = name_tag.find_next("h4").get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('span', class_='tit', string='소속정당 : ') + party = "정당 정보 없음" + party_info = profile.find("span", class_="tit", string="소속정당 : ") if party_info: - party = party_info.find_next('span', class_='con').get_text(strip=True) + party = party_info.find_next("span", class_="con").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-mapogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_15(url='https://www.ycc.go.kr/kr/member/active') -> ScrapResult: +def scrap_15(url="https://www.ycc.go.kr/kr/member/active") -> ScrapResult: """서울시 양천구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -417,19 +450,23 @@ def scrap_15(url='https://www.ycc.go.kr/kr/member/active') -> ScrapResult: parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_uid = profile.find('a', class_='start')['data-uid'] + profile_uid = profile.find("a", class_="start")["data-uid"] if profile_uid: - profile_url = base_url + '/kr/member/profile_popup?uid=' + profile_uid + profile_url = base_url + "/kr/member/profile_popup?uid=" + profile_uid profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('em', string='소속정당') - if party_info and (party_span := party_info.find_next('span')): + party_info = profile_soup.find("em", string="소속정당") + if party_info and (party_span := party_info.find_next("span")): party = party_span.get_text(strip=True) councilors.append(Councilor(name=name, party=party)) @@ -437,38 +474,44 @@ def scrap_15(url='https://www.ycc.go.kr/kr/member/active') -> ScrapResult: return ScrapResult( council_id="seoul-yangcheongu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_16(url='https://gsc.gangseo.seoul.kr/member/org.asp') -> ScrapResult: +def scrap_16(url="https://gsc.gangseo.seoul.kr/member/org.asp") -> ScrapResult: 
"""서울시 강서구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False, encoding='euc-kr') + soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] - for profile in soup.find_all('ul', class_='mb-15'): - name_tag = profile.find_next('span', class_='fs-18 fw-700') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find_next('span', class_='title').find_next('span', class_='title').find_next('span', class_='title') + for profile in soup.find_all("ul", class_="mb-15"): + name_tag = profile.find_next("span", class_="fs-18 fw-700") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = ( + profile.find_next("span", class_="title") + .find_next("span", class_="title") + .find_next("span", class_="title") + ) if party_info: - party = party_info.find_next('span').get_text(strip=True) + party = party_info.find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gangseogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_17(url='https://www.guroc.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_17(url="https://www.guroc.go.kr/kr/member/name.do") -> ScrapResult: """서울시 구로구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -477,38 +520,46 @@ def scrap_17(url='https://www.guroc.go.kr/kr/member/name.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gurogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_18(url='https://council.geumcheon.go.kr/member/member.asp') -> ScrapResult: +def scrap_18(url="https://council.geumcheon.go.kr/member/member.asp") -> ScrapResult: """서울시 금천구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False, encoding='euc-kr') + soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] - for profile in soup.find_all('li', class_='name'): - name_tag = profile.find_next('strong') - name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("li", class_="name"): + name_tag = profile.find_next("strong") + name = ( + name_tag.get_text(strip=True).split("(")[0].strip() + if name_tag + else "이름 정보 없음" + ) - party = '정당 정보 없음' + party = "정당 정보 없음" # TODO councilors.append(Councilor(name=name, party=party)) @@ -516,11 +567,11 @@ def 
scrap_18(url='https://council.geumcheon.go.kr/member/member.asp') -> ScrapRe return ScrapResult( council_id="seoul-geumcheongu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_19(url='https://www.ydpc.go.kr/kr/member/active.do') -> ScrapResult: +def scrap_19(url="https://www.ydpc.go.kr/kr/member/active.do") -> ScrapResult: """서울시 영등포구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -529,25 +580,25 @@ def scrap_19(url='https://www.ydpc.go.kr/kr/member/active.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당 : ') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당 : ") if party_info: - party = party_info.find_next('span').get_text(strip=True) + party = party_info.find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-yeongdeungpogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_20(url='http://assembly.dongjak.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_20(url="http://assembly.dongjak.go.kr/kr/member/name.do") -> ScrapResult: """서울시 동작구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -556,25 +607,25 @@ def scrap_20(url='http://assembly.dongjak.go.kr/kr/member/name.do') -> ScrapResu soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-dongjakgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_21(url='https://www.ga21c.seoul.kr/kr/member/name.do') -> ScrapResult: +def scrap_21(url="https://www.ga21c.seoul.kr/kr/member/name.do") -> ScrapResult: """서울시 관악구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -583,25 +634,27 @@ def scrap_21(url='https://www.ga21c.seoul.kr/kr/member/name.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').get_text(strip=True).split(' ')[-1].strip() + party = ( + party_info.find_next("span").get_text(strip=True).split(" ")[-1].strip() + ) 
councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gwanakgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_22(url='https://www.sdc.seoul.kr/kr/member/active.do') -> ScrapResult: +def scrap_22(url="https://www.sdc.seoul.kr/kr/member/active.do") -> ScrapResult: """서울시 서초구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -610,25 +663,27 @@ def scrap_22(url='https://www.sdc.seoul.kr/kr/member/active.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당 : ') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당 : ") if party_info: - party = party_info.find_next('span').get_text(strip=True) + party = party_info.find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-seochogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_23(url='https://www.gncouncil.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_23(url="https://www.gncouncil.go.kr/kr/member/name.do") -> ScrapResult: """서울시 강남구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -637,35 +692,39 @@ def scrap_23(url='https://www.gncouncil.go.kr/kr/member/name.do') -> ScrapResult soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gangnamgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_24(url='https://council.songpa.go.kr/kr/member/active.do') -> ScrapResult: +def scrap_24(url="https://council.songpa.go.kr/kr/member/active.do") -> ScrapResult: """서울시 송파구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ # TODO - raise Exception('송파구 의회 사이트는 현재 먹통입니다') + raise Exception("송파구 의회 사이트는 현재 먹통입니다") -def scrap_25(url='https://council.gangdong.go.kr/kr/member/active.do') -> ScrapResult: +def scrap_25(url="https://council.gangdong.go.kr/kr/member/active.do") -> ScrapResult: """서울시 강동구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -674,23 +733,25 @@ def scrap_25(url='https://council.gangdong.go.kr/kr/member/active.do') -> ScrapR soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in 
soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당 : ') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당 : ") if party_info: - party = party_info.find_next('span').get_text(strip=True) + party = party_info.find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gangdonggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -if __name__ == '__main__': - print(scrap_2()) \ No newline at end of file +if __name__ == "__main__": + print(scrap_2()) diff --git a/scrap/local_councils/ulsan.py b/scrap/local_councils/ulsan.py index f2a2219..52bcf31 100644 --- a/scrap/local_councils/ulsan.py +++ b/scrap/local_councils/ulsan.py @@ -5,24 +5,33 @@ from scrap.utils.requests import get_soup import re -regex_pattern = re.compile(r'정\s*\S*\s*당', re.IGNORECASE) # Case-insensitive +regex_pattern = re.compile(r"정\s*\S*\s*당", re.IGNORECASE) # Case-insensitive -def scrap_70(url = 'https://council.junggu.ulsan.kr/content/member/memberName.html') -> ScrapResult: - '''울산시 중구 페이지에서 의원 상세약력 스크랩 + +def scrap_70( + url="https://council.junggu.ulsan.kr/content/member/memberName.html", +) -> ScrapResult: + """울산시 중구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('dl'): + for profile in soup.find_all("dl"): name_tag = profile.find("dd", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - + party = "정당 정보 없음" - party_info = list(filter(lambda x: regex_pattern.search(str(x)), profile.find_all("dd"))) - if party_info and (party_span := party_info[0].find_next('span').find_next('span')) is not None: + party_info = list( + filter(lambda x: regex_pattern.search(str(x)), profile.find_all("dd")) + ) + if ( + party_info + and (party_span := party_info[0].find_next("span").find_next("span")) + is not None + ): party = party_span.text councilors.append(Councilor(name=name, party=party)) @@ -30,25 +39,36 @@ def scrap_70(url = 'https://council.junggu.ulsan.kr/content/member/memberName.ht return ScrapResult( council_id="ulsan-junggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_71(url = 'https://www.namgucouncil.ulsan.kr/content/member/memberName.html') -> ScrapResult: - '''울산시 남구 페이지에서 의원 상세약력 스크랩 + +def scrap_71( + url="https://www.namgucouncil.ulsan.kr/content/member/memberName.html", +) -> ScrapResult: + """울산시 남구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('dl'): + for profile in soup.find_all("dl"): name_tag = profile.find("dd", class_="name") - name = name_tag.get_text(strip=True).replace(" 의원", "") if name_tag else "이름 정보 없음" + name = ( + name_tag.get_text(strip=True).replace(" 의원", "") if name_tag else "이름 정보 없음" + ) party = "정당 정보 없음" - party_info = list(filter(lambda x: regex_pattern.search(str(x)), 
profile.find_all("dd"))) - if party_info and (party_span := party_info[0].find_next('span').find_next('span')) is not None: + party_info = list( + filter(lambda x: regex_pattern.search(str(x)), profile.find_all("dd")) + ) + if ( + party_info + and (party_span := party_info[0].find_next("span").find_next("span")) + is not None + ): party = party_span.text councilors.append(Councilor(name=name, party=party)) @@ -56,65 +76,74 @@ def scrap_71(url = 'https://www.namgucouncil.ulsan.kr/content/member/memberName. return ScrapResult( council_id="ulsan-namgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_72(url = 'https://www.donggu-council.ulsan.kr/source/korean/member/active.html') -> ScrapResult: - '''울산시 동구 페이지에서 의원 상세약력 스크랩 + +def scrap_72( + url="https://www.donggu-council.ulsan.kr/source/korean/member/active.html", +) -> ScrapResult: + """울산시 동구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' - soup = get_soup(url, verify=False, encoding='euc-kr') + """ + soup = get_soup(url, verify=False, encoding="euc-kr") councilors: List[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): + for profile in soup.find_all("div", class_="profile"): name_tag = profile.find("li", class_="name") # () 안에 있는 한자를 제거 (ex. 김영희(金英姬) -> 김영희) - name = name_tag.get_text(strip=True).split('(')[0] if name_tag else "이름 정보 없음" + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" party = "정당 정보 없음" - party_info = list(filter(lambda x: regex_pattern.search(str(x)), profile.find_all("li"))) + party_info = list( + filter(lambda x: regex_pattern.search(str(x)), profile.find_all("li")) + ) if party_info: - party = party_info[0].get_text(strip=True).split(': ')[1] + party = party_info[0].get_text(strip=True).split(": ")[1] councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="ulsan-donggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_73(url = 'https://council.bukgu.ulsan.kr/kr/member/active.do') -> ScrapResult: - '''울산시 북구 페이지에서 의원 상세약력 스크랩 + +def scrap_73(url="https://council.bukgu.ulsan.kr/kr/member/active.do") -> ScrapResult: + """울산시 북구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('dl', class_='profile'): + for profile in soup.find_all("dl", class_="profile"): name_tag = profile.find("strong", class_="name") # () 안에 있는 한자를 제거 (ex. 
김영희(金英姬) -> 김영희) - name = name_tag.get_text(strip=True).split('(')[0] if name_tag else "이름 정보 없음" + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" party = "정당 정보 없음" - party_info = list(filter(lambda x: regex_pattern.search(str(x)), profile.find_all("li"))) + party_info = list( + filter(lambda x: regex_pattern.search(str(x)), profile.find_all("li")) + ) if party_info: - party = party_info[0].get_text(strip=True).split(': ')[1] + party = party_info[0].get_text(strip=True).split(": ")[1] councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="ulsan-bukgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_74(url = 'https://assembly.ulju.ulsan.kr/kr/member/active') -> ScrapResult: - '''울산시 울주군 페이지에서 의원 상세약력 스크랩 + +def scrap_74(url="https://assembly.ulju.ulsan.kr/kr/member/active") -> ScrapResult: + """울산시 울주군 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] @@ -122,18 +151,18 @@ def scrap_74(url = 'https://assembly.ulju.ulsan.kr/kr/member/active') -> ScrapRe parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in soup.find_all('div', class_='profile'): + for profile in soup.find_all("div", class_="profile"): name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_link = profile.find('a', class_='start') + profile_link = profile.find("a", class_="start") if profile_link: - profile_url = base_url + profile_link['href'] + profile_url = base_url + profile_link["href"] profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('em', string=regex_pattern) - if party_info and (party_span := party_info.find_next('span')) is not None: + party_info = profile_soup.find("em", string=regex_pattern) + if party_info and (party_span := party_info.find_next("span")) is not None: party = party_span.text councilors.append(Councilor(name=name, party=party)) @@ -141,8 +170,9 @@ def scrap_74(url = 'https://assembly.ulju.ulsan.kr/kr/member/active') -> ScrapRe return ScrapResult( council_id="ulsan_uljugun", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -if __name__ == '__main__': - print(scrap_70()) \ No newline at end of file + +if __name__ == "__main__": + print(scrap_70()) diff --git a/scrap/metropolitan_council.py b/scrap/metropolitan_council.py index 1c19078..1c31cdf 100644 --- a/scrap/metropolitan_council.py +++ b/scrap/metropolitan_council.py @@ -4,12 +4,14 @@ from scrap.utils.requests import get_soup -def scrap_metro_1(url = 'https://www.smc.seoul.kr/main/memIntro01.do?menuId=001002001001') -> ScrapResult: - '''서울시 페이지에서 의원 상세약력 스크랩 +def scrap_metro_1( + url="https://www.smc.seoul.kr/main/memIntro01.do?menuId=001002001001", +) -> ScrapResult: + """서울시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: list[Councilor] = [] @@ -17,54 +19,60 @@ def scrap_metro_1(url = 'https://www.smc.seoul.kr/main/memIntro01.do?menuId=0010 parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in soup.find_all('input', class_='memLinkk'): - name = profile['value'].strip() if profile else '이름 정보 없음' - party = '정당 정보 없음' + for profile 
in soup.find_all("input", class_="memLinkk"): + name = profile["value"].strip() if profile else "이름 정보 없음" + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_url = base_url + '/home/' + profile['data-url'] + profile_url = base_url + "/home/" + profile["data-url"] profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('div', class_='profile') - if party_info and (party_span := party_info.find('li')) is not None: - party = party_span.find_next('li').get_text(strip=True) + party_info = profile_soup.find("div", class_="profile") + if party_info and (party_span := party_info.find("li")) is not None: + party = party_span.find_next("li").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_2(url = 'https://council.busan.go.kr/council/past02') -> ScrapResult: - '''부산시 페이지에서 의원 상세약력 스크랩 +def scrap_metro_2(url="https://council.busan.go.kr/council/past02") -> ScrapResult: + """부산시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' - soup = get_soup(url, verify=False).find('ul', class_='inmemList') + """ + soup = get_soup(url, verify=False).find("ul", class_="inmemList") councilors: list[Councilor] = [] - for profile in soup.find_all('a', class_='detail'): - name = profile.get_text(strip=True) if profile else '이름 정보 없음' - party = '정당 정보 없음' + for profile in soup.find_all("a", class_="detail"): + name = profile.get_text(strip=True) if profile else "이름 정보 없음" + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_url = profile['href'] + profile_url = profile["href"] profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('ul', class_='vs-list-st-type01') - if party_info and (party_span := party_info.find('li')) is not None: - party = party_span.find_next('li').find_next('li').get_text(strip=True).split()[-1].strip() + party_info = profile_soup.find("ul", class_="vs-list-st-type01") + if party_info and (party_span := party_info.find("li")) is not None: + party = ( + party_span.find_next("li") + .find_next("li") + .get_text(strip=True) + .split()[-1] + .strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="busan", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -82,7 +90,7 @@ def scrap_metro_3(url="https://council.daegu.go.kr/kr/member/active") -> ScrapRe name_tag = profile.find("p", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' + party = "정당 정보 없음" party_info = profile.find("em", string="소속정당") if party_info: party = party_info.find_next("span").get_text(strip=True) @@ -92,7 +100,7 @@ def scrap_metro_3(url="https://council.daegu.go.kr/kr/member/active") -> ScrapRe return ScrapResult( council_id="daegu", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -103,11 +111,11 @@ def scrap_metro_4(url="https://www.icouncil.go.kr/main/member/name.jsp") -> Scra :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('table', class_='data').find('tbody') + soup = get_soup(url, verify=False).find("table", class_="data").find("tbody") councilors: list[Councilor] = [] for profile in soup.find_all("tr"): - columns = profile.find_all('td') + columns = profile.find_all("td") name_tag = columns[0] name = 
name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" @@ -120,7 +128,7 @@ def scrap_metro_4(url="https://www.icouncil.go.kr/main/member/name.jsp") -> Scra return ScrapResult( council_id="incheon", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -131,7 +139,7 @@ def scrap_metro_5(url="https://council.gwangju.go.kr/index.do?PID=029") -> Scrap :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('table', class_='data').find('tbody') + soup = get_soup(url, verify=False).find("table", class_="data").find("tbody") councilors: list[Councilor] = [] # TODO @@ -139,37 +147,41 @@ def scrap_metro_5(url="https://council.gwangju.go.kr/index.do?PID=029") -> Scrap return ScrapResult( council_id="gwangju", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_6(url="https://council.daejeon.go.kr/svc/cmp/MbrListByPhoto.do") -> ScrapResult: +def scrap_metro_6( + url="https://council.daejeon.go.kr/svc/cmp/MbrListByPhoto.do", +) -> ScrapResult: """대전시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('ul', class_='mlist') + soup = get_soup(url, verify=False).find("ul", class_="mlist") councilors: list[Councilor] = [] for profile in soup.find_all("dl"): - name_tag = profile.find('dd', class_='name') - name = name_tag.find('strong').get_text(strip=True) if name_tag else "이름 정보 없음" + name_tag = profile.find("dd", class_="name") + name = name_tag.find("strong").get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = name_tag.find_next('dd').find_next('dd') - party = party_tag.find('i').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = name_tag.find_next("dd").find_next("dd") + party = party_tag.find("i").get_text(strip=True) if party_tag else "정당 정보 없음" councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="daejeon", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_7(url="https://www.council.ulsan.kr/kor/councillor/viewByPerson.do") -> ScrapResult: +def scrap_metro_7( + url="https://www.council.ulsan.kr/kor/councillor/viewByPerson.do", +) -> ScrapResult: """울산시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -179,10 +191,10 @@ def scrap_metro_7(url="https://www.council.ulsan.kr/kor/councillor/viewByPerson. soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for name_tag in soup.find_all("div", class_='name'): + for name_tag in soup.find_all("div", class_="name"): name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = name_tag.find_next('li').find_next('li') + party_tag = name_tag.find_next("li").find_next("li") party = party_tag.get_text(strip=True) if party_tag else "정당 정보 없음" councilors.append(Councilor(name=name, party=party)) @@ -190,57 +202,71 @@ def scrap_metro_7(url="https://www.council.ulsan.kr/kor/councillor/viewByPerson. 
return ScrapResult( council_id="ulsan", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_8(url="https://council.sejong.go.kr/mnu/pom/introductionMemberByName.do") -> ScrapResult: +def scrap_metro_8( + url="https://council.sejong.go.kr/mnu/pom/introductionMemberByName.do", +) -> ScrapResult: """세종시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('ul', class_='ml') + soup = get_soup(url, verify=False).find("ul", class_="ml") councilors: list[Councilor] = [] - for profile in soup.find_all('dl'): - name_tag = profile.find('dd', class_='name') - name = name_tag.find(string=True, recursive=False).strip() if name_tag else "이름 정보 없음" - - party_tag = name_tag.find_next('dd').find_next('dd') - party = party_tag.get_text(strip=True).split()[-1].strip() if party_tag else "정당 정보 없음" + for profile in soup.find_all("dl"): + name_tag = profile.find("dd", class_="name") + name = ( + name_tag.find(string=True, recursive=False).strip() + if name_tag + else "이름 정보 없음" + ) + + party_tag = name_tag.find_next("dd").find_next("dd") + party = ( + party_tag.get_text(strip=True).split()[-1].strip() + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="sejong", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_9(url="https://www.ggc.go.kr/site/main/memberInfo/actvMmbr/list?cp=1&menu=consonant&sortOrder=MI_NAME&sortDirection=ASC") -> ScrapResult: +def scrap_metro_9( + url="https://www.ggc.go.kr/site/main/memberInfo/actvMmbr/list?cp=1&menu=consonant&sortOrder=MI_NAME&sortDirection=ASC", +) -> ScrapResult: """경기도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('div', class_='paging2 clearfix') + soup = get_soup(url, verify=False).find("div", class_="paging2 clearfix") councilors: list[Councilor] = [] - + parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - - for page in soup.find_all('a'): - page_url = base_url + page['href'] - page_soup = get_soup(page_url, verify=False).find('ul', class_='memberList3 clear') - for profile in page_soup.find_all('li', recursive=False): - name_tag = profile.find('p', class_='f22 blue3') + + for page in soup.find_all("a"): + page_url = base_url + page["href"] + page_soup = get_soup(page_url, verify=False).find( + "ul", class_="memberList3 clear" + ) + for profile in page_soup.find_all("li", recursive=False): + name_tag = profile.find("p", class_="f22 blue3") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('li', class_='f15 m0') + party_tag = profile.find("li", class_="f15 m0") party = party_tag.get_text(strip=True) if party_tag else "정당 정보 없음" councilors.append(Councilor(name=name, party=party)) @@ -248,11 +274,13 @@ def scrap_metro_9(url="https://www.ggc.go.kr/site/main/memberInfo/actvMmbr/list? 
return ScrapResult( council_id="gyeonggi", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_10(url="https://council.chungbuk.kr/kr/member/active.do") -> ScrapResult: +def scrap_metro_10( + url="https://council.chungbuk.kr/kr/member/active.do", +) -> ScrapResult: """충청북도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -262,23 +290,31 @@ def scrap_metro_10(url="https://council.chungbuk.kr/kr/member/active.do") -> Scr soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('em', class_='name') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) - party_tag = profile.find('em', string='소속정당') - party = party_tag.find_next('span').find_next('span').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("em", string="소속정당") + party = ( + party_tag.find_next("span").find_next("span").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="chungbuk", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_11(url="https://council.chungnam.go.kr/kr/member/name.do") -> ScrapResult: +def scrap_metro_11( + url="https://council.chungnam.go.kr/kr/member/name.do", +) -> ScrapResult: """충청남도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -288,23 +324,31 @@ def scrap_metro_11(url="https://council.chungnam.go.kr/kr/member/name.do") -> Sc soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('em', class_='name') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) - party_tag = profile.find('em', string='소속정당 : ') - party = party_tag.find_next('span').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("em", string="소속정당 : ") + party = ( + party_tag.find_next("span").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="chungnam", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_12(url="https://www.assem.jeonbuk.kr/board/list.do?boardId=2018_assemblyman&searchType=assem_check&keyword=1&menuCd=DOM_000000103001000000&contentsSid=453") -> ScrapResult: +def scrap_metro_12( + url="https://www.assem.jeonbuk.kr/board/list.do?boardId=2018_assemblyman&searchType=assem_check&keyword=1&menuCd=DOM_000000103001000000&contentsSid=453", +) -> ScrapResult: """전라북도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -314,23 +358,29 @@ def scrap_metro_12(url="https://www.assem.jeonbuk.kr/board/list.do?boardId=2018_ soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('li', class_='career'): - name_tag = profile.find('tr', class_='name') + for profile in soup.find_all("li", class_="career"): + name_tag = profile.find("tr", class_="name") 
name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('tr', class_='list1') - party = party_tag.find('td', class_='co2').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("tr", class_="list1") + party = ( + party_tag.find("td", class_="co2").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="jeonbuk", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_13(url="https://www.jnassembly.go.kr/profileHistory.es?mid=a10202010000&cs_daesoo=12") -> ScrapResult: +def scrap_metro_13( + url="https://www.jnassembly.go.kr/profileHistory.es?mid=a10202010000&cs_daesoo=12", +) -> ScrapResult: """전라남도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -340,19 +390,23 @@ def scrap_metro_13(url="https://www.jnassembly.go.kr/profileHistory.es?mid=a1020 soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('tbody'): - name_tag = profile.find('p') + for profile in soup.find_all("tbody"): + name_tag = profile.find("p") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('th', string='소속정당') - party = party_tag.find_next('td', class_='txt_left').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("th", string="소속정당") + party = ( + party_tag.find_next("td", class_="txt_left").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="jeonnam", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -366,23 +420,29 @@ def scrap_metro_14(url="https://council.gb.go.kr/kr/member/name") -> ScrapResult soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('div', class_='name') - name = name_tag.find('strong').get_text(strip=True) if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("div", class_="name") + name = name_tag.find("strong").get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('em', string='소속정당') - party = party_tag.find_next('span').find_next('span').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("em", string="소속정당") + party = ( + party_tag.find_next("span").find_next("span").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="gyeongbuk", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_15(url="https://council.gyeongnam.go.kr/kr/member/active.do") -> ScrapResult: +def scrap_metro_15( + url="https://council.gyeongnam.go.kr/kr/member/active.do", +) -> ScrapResult: """경상남도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -392,19 +452,27 @@ def scrap_metro_15(url="https://council.gyeongnam.go.kr/kr/member/active.do") -> soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('div', class_='name') - name = name_tag.find('strong').get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음" - - party_tag = profile.find('em', class_='ls2', string='정당') - party = 
party_tag.find_next('span').get_text(strip=True) if party_tag else "정당 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("div", class_="name") + name = ( + name_tag.find("strong").get_text(strip=True).split("(")[0].strip() + if name_tag + else "이름 정보 없음" + ) + + party_tag = profile.find("em", class_="ls2", string="정당") + party = ( + party_tag.find_next("span").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="gyeongnam", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -418,23 +486,29 @@ def scrap_metro_16(url="https://council.gangwon.kr/kr/member/name.do") -> ScrapR soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('em', string='소속정당') - party = party_tag.find_next('span').get_text(strip=True).split()[-1].strip() if party_tag else "정당 정보 없음" + party_tag = profile.find("em", string="소속정당") + party = ( + party_tag.find_next("span").get_text(strip=True).split()[-1].strip() + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="gangwon", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_17(url="https://www.council.jeju.kr/cmember/active/name.do") -> ScrapResult: +def scrap_metro_17( + url="https://www.council.jeju.kr/cmember/active/name.do", +) -> ScrapResult: """제주도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -444,7 +518,7 @@ def scrap_metro_17(url="https://www.council.jeju.kr/cmember/active/name.do") -> soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for tag in soup.find_all('p', class_='name'): + for tag in soup.find_all("p", class_="name"): text = tag.get_text(strip=True).split("(") # print(text) name = text[0].strip() @@ -455,10 +529,9 @@ def scrap_metro_17(url="https://www.council.jeju.kr/cmember/active/name.do") -> return ScrapResult( council_id="jeju", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) - -if __name__ == '__main__': - print(scrap_metro_17()) \ No newline at end of file +if __name__ == "__main__": + print(scrap_metro_17()) diff --git a/scrap/national_council.py b/scrap/national_council.py index b058abf..6c4656e 100644 --- a/scrap/national_council.py +++ b/scrap/national_council.py @@ -8,42 +8,43 @@ def scrap_national_council(cd: int) -> ScrapResult: - '''열린국회정보 Open API를 이용해 역대 국회의원 인적사항 스크랩 - _data 폴더에 assembly_api_key.json 파일을 만들어야 하며, - 해당 JSON은 {"key":"(Open API에서 발급받은 인증키)"} 꼴을 가져야 한다. - https://open.assembly.go.kr/portal/data/service/selectAPIServicePage.do/OBL7NF0011935G18076#none - - :param cd: 국회의원 대수. 
제21대 국회의원을 스크랩하고자 하면 21 - :return: 국회의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' - - key_json_path = os.path.join(BASE_DIR, '_data', 'assembly_api_key.json') - if not os.path.exists(key_json_path): - raise Exception('열린국회정보 Open API에 회원가입 후 인증키를 발급받아주세요.\nhttps://open.assembly.go.kr/portal/openapi/openApiDevPage.do') - with open(key_json_path, 'r') as key_json: - assembly_key = json.load(key_json)['key'] - - request_url = f"https://open.assembly.go.kr/portal/openapi/nwvrqwxyaytdsfvhu?KEY={assembly_key}&pSize=500&UNIT_CD={cd + 100000}" - response = requests.get(request_url) - - if response.status_code != 200: - raise Exception(f'Open API 요청에 실패했습니다 (상태 코드 {response.status_code})') - - root = ET.fromstring(response.text) - councilors: list[Councilor] = [] - - for row in root.iter('row'): - councilors.append(Councilor( - name=row.find('HG_NM').text, - party=row.find('POLY_NM').text - )) - - return ScrapResult( - council_id='national', - council_type=CouncilType.NATIONAL_COUNCIL, - councilors=councilors - ) - - -if __name__ == '__main__': - print(scrap_national_council(21)) \ No newline at end of file + """열린국회정보 Open API를 이용해 역대 국회의원 인적사항 스크랩 + _data 폴더에 assembly_api_key.json 파일을 만들어야 하며, + 해당 JSON은 {"key":"(Open API에서 발급받은 인증키)"} 꼴을 가져야 한다. + https://open.assembly.go.kr/portal/data/service/selectAPIServicePage.do/OBL7NF0011935G18076#none + + :param cd: 국회의원 대수. 제21대 국회의원을 스크랩하고자 하면 21 + :return: 국회의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 + """ + + key_json_path = os.path.join(BASE_DIR, "_data", "assembly_api_key.json") + if not os.path.exists(key_json_path): + raise Exception( + "열린국회정보 Open API에 회원가입 후 인증키를 발급받아주세요.\nhttps://open.assembly.go.kr/portal/openapi/openApiDevPage.do" + ) + with open(key_json_path, "r") as key_json: + assembly_key = json.load(key_json)["key"] + + request_url = f"https://open.assembly.go.kr/portal/openapi/nwvrqwxyaytdsfvhu?KEY={assembly_key}&pSize=500&UNIT_CD={cd + 100000}" + response = requests.get(request_url) + + if response.status_code != 200: + raise Exception(f"Open API 요청에 실패했습니다 (상태 코드 {response.status_code})") + + root = ET.fromstring(response.text) + councilors: list[Councilor] = [] + + for row in root.iter("row"): + councilors.append( + Councilor(name=row.find("HG_NM").text, party=row.find("POLY_NM").text) + ) + + return ScrapResult( + council_id="national", + council_type=CouncilType.NATIONAL_COUNCIL, + councilors=councilors, + ) + + +if __name__ == "__main__": + print(scrap_national_council(21)) diff --git a/scrap/utils/database.py b/scrap/utils/database.py index c197c6e..9014802 100644 --- a/scrap/utils/database.py +++ b/scrap/utils/database.py @@ -9,6 +9,7 @@ # 컬렉션은 하나 이상의 문서로 구성됩니다. db = client[str(MongoDBSecrets.database_name)] + def save_to_database(record: ScrapResult): """ 지방의회 크롤링 결과를 데이터베이스에 저장합니다. 
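A quick usage sketch for the scrap_national_council function reformatted above. It assumes the script is run from the repository root (so the relative _data/ path lines up with BASE_DIR) and uses a placeholder key; the {"key": "..."} file shape is the one the function's docstring specifies.

import json
import os

from scrap.national_council import scrap_national_council

# Hypothetical one-time setup: create the key file the scraper reads.
# "YOUR-OPEN-API-KEY" is a placeholder, not a real credential.
key_path = os.path.join("_data", "assembly_api_key.json")
os.makedirs(os.path.dirname(key_path), exist_ok=True)
if not os.path.exists(key_path):
    with open(key_path, "w") as f:
        json.dump({"key": "YOUR-OPEN-API-KEY"}, f)

result = scrap_national_council(21)  # 제21대 국회
print(len(result.councilors), "councilors scraped")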
@@ -25,20 +26,21 @@ def save_to_database(record: ScrapResult): collection.find_one_and_update( {"councilId": record.council_id}, {"$set": dataclasses.asdict(record)}, - upsert=True + upsert=True, ) return True except Exception as e: print(e) return False -if __name__ == "__main__": - test_record = (ScrapResult( + +if __name__ == "__main__": + test_record = ScrapResult( council_id="test-test", council_type=CouncilType.LOCAL_COUNCIL, councilors=[ Councilor(name="김철수", party="국민의힘"), Councilor(name="김영희", party="더불어민주당"), - ] - )) - print(save_to_database(test_record)) \ No newline at end of file + ], + ) + print(save_to_database(test_record)) diff --git a/scrap/utils/requests.py b/scrap/utils/requests.py index 2bf72b6..16a2135 100644 --- a/scrap/utils/requests.py +++ b/scrap/utils/requests.py @@ -8,15 +8,18 @@ from unicodedata import normalize # SSL 인증서 검증 경고 무시 -requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) # type: ignore +requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) # type: ignore # 충청북도 보은군, 강진시에서 타임아웃이 timeout_time = 60 -def get_soup(url: str, additional_headers={}, verify=True, encoding="utf-8") -> BeautifulSoup: + +def get_soup( + url: str, additional_headers={}, verify=True, encoding="utf-8" +) -> BeautifulSoup: """ url을 입력받아 BeautifulSoup 객체를 반환합니다. requests 라이브러리를 사용합니다. 크롤링 결과가 정상적으로 나오지 않을 경우, Selenium 라이브러리를 사용할 수 있습니다. - + :param url: 크롤링할 페이지의 url입니다. :param additional_headers: 추가적으로 포함할 헤더입니다. 딕셔너리 형태로 입력받습니다. :param verify: SSL 인증서 검증 여부입니다. 인증서가 만료된 페이지를 크롤링할 경우 False로 설정합니다. @@ -25,11 +28,13 @@ def get_soup(url: str, additional_headers={}, verify=True, encoding="utf-8") -> # HTTP 요청에 포함해줄 헤더 http_headers = { - "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" } http_headers.update(additional_headers) - - response = requests.get(url, verify=verify, headers=http_headers, timeout=timeout_time) + + response = requests.get( + url, verify=verify, headers=http_headers, timeout=timeout_time + ) response.encoding = encoding - sanitized_response = normalize('NFKC', unescape(response.text)) - return BeautifulSoup(sanitized_response, 'html.parser') \ No newline at end of file + sanitized_response = normalize("NFKC", unescape(response.text)) + return BeautifulSoup(sanitized_response, "html.parser") diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py index 49f4ce6..47da40a 100644 --- a/scrap/utils/spreadsheet.py +++ b/scrap/utils/spreadsheet.py @@ -15,119 +15,402 @@ # 변경 시 token.json 삭제 후 재인증 필요 SCOPES = ["https://www.googleapis.com/auth/spreadsheets"] BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir) + + def google_authorization(): - '''Google Sheets API 활용을 위한 인증 정보 요청 + """Google Sheets API 활용을 위한 인증 정보 요청 credentials.json 파일을 토대로 인증을 요청하되, token.json 파일이 존재할 경우 거기에 저장된 정보 활용 :todo: credentials.json 파일, token.json 파일 값을 환경변수로 설정 - :return: gspread.client.Client 인스턴스''' + :return: gspread.client.Client 인스턴스""" creds = None - token_json_path = os.path.join(BASE_DIR, '_data', 'token.json') + token_json_path = os.path.join(BASE_DIR, "_data", "token.json") # 이미 저장된 인증 정보가 있는지 확인 if os.path.exists(token_json_path): creds = Credentials.from_authorized_user_file(token_json_path, SCOPES) - + # 인증 정보가 없거나 비정상적인 경우 인증 재요청 if not creds or not creds.valid: if creds and creds.expired and 
creds.refresh_token: creds.refresh(Request()) else: - flow= InstalledAppFlow.from_client_secrets_file(os.path.join(BASE_DIR, '_data', 'credentials.json'), SCOPES) + flow = InstalledAppFlow.from_client_secrets_file( + os.path.join(BASE_DIR, "_data", "credentials.json"), SCOPES + ) creds = flow.run_local_server(port=0) - with open(token_json_path, 'w') as token: + with open(token_json_path, "w") as token: token.write(creds.to_json()) return gspread.authorize(creds) + def main() -> None: # Google Sheets API 설정 client: gspread.client.Client = google_authorization() # 스프레드시트 열기 - link = 'https://docs.google.com/spreadsheets/d/1fBDJjkw8FSN5wXrvos9Q2wDsyItkUtNFGOxUZYE-h0M/edit#gid=1127955905' # T4I-의회목록 + link = "https://docs.google.com/spreadsheets/d/1fBDJjkw8FSN5wXrvos9Q2wDsyItkUtNFGOxUZYE-h0M/edit#gid=1127955905" # T4I-의회목록 spreadsheet: gspread.Spreadsheet = client.open_by_url(link) - worksheet: gspread.Worksheet = spreadsheet.get_worksheet(0) # 원하는 워크시트 선택 (0은 첫 번째 워크시트입니다.) - # TODO - 홈페이지 위 charset=euc-kr 등을 인식해 바로 가져오기. + worksheet: gspread.Worksheet = spreadsheet.get_worksheet( + 0 + ) # 원하는 워크시트 선택 (0은 첫 번째 워크시트입니다.) + # TODO - 홈페이지 위 charset=euc-kr 등을 인식해 바로 가져오기. euc_kr = [6, 13, 16, 31, 72, 88, 112, 154, 157, 163, 167, 181, 197, 202] special_functions = list(range(1, 57)) + [57, 88, 103] args = { - 2 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 3 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), + 2: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 3: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), # 인천 - 57 : ScrapBasicArgument(pf_elt='div', pf_cls='box', name_elt='p', name_cls='mem_tit2', pty_elt='p', pty_cls='mem_tit2'), - 58 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 59 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), + 57: ScrapBasicArgument( + pf_elt="div", + pf_cls="box", + name_elt="p", + name_cls="mem_tit2", + pty_elt="p", + pty_cls="mem_tit2", + ), + 58: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 59: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="em", + ), # 광주 - 60 : ScrapBasicArgument(pf_elt='div', pf_cls='content', name_elt='h5', pty_wrapelt='a', pty_elt='li'), - 61 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), + 60: ScrapBasicArgument( + pf_elt="div", pf_cls="content", name_elt="h5", pty_wrapelt="a", pty_elt="li" + ), + 61: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), # 62 : TODO! /common/selectCouncilMemberProfile.json 을 어떻게 얻을지.. # 63 : TODO! 홈페이지 터짐 # 64 : TODO! /common/selectCouncilMemberProfile.json 을 어떻게 얻을지.. 
# 대전 - 65 : ScrapBasicArgument(pf_elt='dl', pf_cls='profile', name_elt='strong', name_cls='name', pty_elt='strong'), - 66 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), - 67 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='member', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 68 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 69 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), + 65: ScrapBasicArgument( + pf_elt="dl", + pf_cls="profile", + name_elt="strong", + name_cls="name", + pty_elt="strong", + ), + 66: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="em", + ), + 67: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="member", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 68: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 69: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), # 울산 - 70 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='memberName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 71 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='memberName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 72 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='li', name_cls='name', pty_elt='li'), - 73 : ScrapBasicArgument(pf_elt='dl', pf_cls='profile', name_elt='strong', name_cls='name', pty_elt='li'), - 74 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_wrapelt='a', pty_wrapcls='start', pty_elt='li'), + 70: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="memberName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 71: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="memberName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 72: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="li", name_cls="name", pty_elt="li" + ), + 73: ScrapBasicArgument( + pf_elt="dl", + pf_cls="profile", + name_elt="strong", + name_cls="name", + pty_elt="li", + ), + 74: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="em", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="start", + pty_elt="li", + ), # 경기 - 75 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), - 76 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 77 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='mbrListByName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 78 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_wrapelt='a', pty_wrapcls='end', pty_elt='li'), - 79 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 80 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 81 : ScrapBasicArgument(pf_memlistelt='div', pf_memlistcls='member_list', pf_elt='dd', name_elt='p', pty_elt='tr'), - 82 : ScrapBasicArgument(pf_memlistelt='div', pf_memlistcls='cts1426_box', pf_elt='div', pf_cls='conbox', name_elt='p', pty_elt='li'), + 75: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + 
name_elt="div", + name_cls="name", + pty_elt="em", + ), + 76: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 77: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="mbrListByName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 78: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="end", + pty_elt="li", + ), + 79: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 80: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 81: ScrapBasicArgument( + pf_memlistelt="div", + pf_memlistcls="member_list", + pf_elt="dd", + name_elt="p", + pty_elt="tr", + ), + 82: ScrapBasicArgument( + pf_memlistelt="div", + pf_memlistcls="cts1426_box", + pf_elt="div", + pf_cls="conbox", + name_elt="p", + pty_elt="li", + ), # 경기 - 동두천 - 83 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_wrapelt='a', pty_wrapcls='start', pty_elt='li'), - 84 : ScrapBasicArgument(pf_elt='div', pf_cls='law_box', name_elt='span', name_cls='name', pty_elt='p'), - 85 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), - 86 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 87 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 88 : ScrapBasicArgument(pf_memlistelt='div', pf_memlistcls='member_list', pf_elt='dl', pf_cls='box', name_elt='span', name_cls='name', pty_wrapelt='p', pty_wrapcls='btn', pty_elt='li'), - 89 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='memberName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='span'), - 90 : ScrapBasicArgument(pf_elt='dl', pf_cls='profile', name_elt='strong', name_cls='name', pty_elt='li'), - # 경기 - 화성 - 91 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='mbr0101', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 92 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='member', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 93 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_wrapelt='a', pty_wrapcls='end', pty_elt='li'), - 94 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='mbrListByName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 95 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='member', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='tr'), - 96 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), - 97 : ScrapBasicArgument(pf_memlistelt='ul', pf_memlistcls='memberList', pf_elt='li', name_elt='strong', pty_wrapelt='a', pty_elt='tr'), - 98 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 99 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 100 : ScrapBasicArgument(pf_elt='div', pf_cls='list', name_elt='h4', name_cls='h0', pty_elt='li'), + 83: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="em", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="start", + pty_elt="li", + ), + 84: ScrapBasicArgument( + pf_elt="div", + pf_cls="law_box", + name_elt="span", + name_cls="name", + pty_elt="p", + ), + 85: 
ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="em", + ), + 86: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 87: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 88: ScrapBasicArgument( + pf_memlistelt="div", + pf_memlistcls="member_list", + pf_elt="dl", + pf_cls="box", + name_elt="span", + name_cls="name", + pty_wrapelt="p", + pty_wrapcls="btn", + pty_elt="li", + ), + 89: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="memberName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="span", + ), + 90: ScrapBasicArgument( + pf_elt="dl", + pf_cls="profile", + name_elt="strong", + name_cls="name", + pty_elt="li", + ), + # 경기 - 화성 + 91: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="mbr0101", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 92: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="member", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 93: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="end", + pty_elt="li", + ), + 94: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="mbrListByName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 95: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="member", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="tr", + ), + 96: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="em", + ), + 97: ScrapBasicArgument( + pf_memlistelt="ul", + pf_memlistcls="memberList", + pf_elt="li", + name_elt="strong", + pty_wrapelt="a", + pty_elt="tr", + ), + 98: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 99: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 100: ScrapBasicArgument( + pf_elt="div", pf_cls="list", name_elt="h4", name_cls="h0", pty_elt="li" + ), # 경기 - 광주 - 101 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 102 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_wrapelt='a', pty_wrapcls='start', pty_elt='li'), - 103 : ScrapBasicArgument(pf_elt='div', pf_cls='col-sm-6', name_elt='h5', name_cls='h5', pty_wrapelt='a', pty_wrapcls='d-inline-block', pty_elt='li'), - 104 : ScrapBasicArgument(pf_elt='div', pf_cls='text_box', name_elt='h3', name_cls='h0', pty_wrapelt='a', pty_wraptxt='누리집', pty_elt='li'), - 105 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), + 101: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 102: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="em", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="start", + pty_elt="li", + ), + 103: ScrapBasicArgument( + pf_elt="div", + pf_cls="col-sm-6", + name_elt="h5", + name_cls="h5", + pty_wrapelt="a", + pty_wrapcls="d-inline-block", + pty_elt="li", + ), + 104: ScrapBasicArgument( + pf_elt="div", + pf_cls="text_box", + name_elt="h3", + name_cls="h0", + pty_wrapelt="a", + pty_wraptxt="누리집", + pty_elt="li", + ), + 105: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", 
name_cls="name", pty_elt="em" + ), # 강원 # 106 : TODO! 정당정보 없음 # TODO! 107이 get_soup에서 실패 중 - HTTPSConnectionPool(host='council.wonju.go.kr', port=443): Max retries exceeded with url: /content/member/memberName.html (Caused by SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1007)'))) - 107 : ScrapBasicArgument(pf_memlistelt='div', pf_memlistcls='content', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='span'), - 108 : ScrapBasicArgument(pf_elt='dl', pf_cls='profile', name_elt='strong', pty_elt='li'), - 109 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='memberName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='span'), - 110 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - # 111 : TODO! 정당 없고 홈페이지는 깨짐 - 112 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 113 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_cls='name', pty_elt='li'), - 115 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='li'), + 107: ScrapBasicArgument( + pf_memlistelt="div", + pf_memlistcls="content", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="span", + ), + 108: ScrapBasicArgument( + pf_elt="dl", pf_cls="profile", name_elt="strong", pty_elt="li" + ), + 109: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="memberName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="span", + ), + 110: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + # 111 : TODO! 정당 없고 홈페이지는 깨짐 + 112: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 113: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_cls="name", pty_elt="li" + ), + 115: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="li", + ), # TODO : 정당이 주석처리되어 있어서 soup가 인식을 못함. 
- 116 : ScrapBasicArgument(pf_elt='div', pf_cls='memberName', name_cls='name',pty_elt='dd'), + 116: ScrapBasicArgument( + pf_elt="div", pf_cls="memberName", name_cls="name", pty_elt="dd" + ), } # 데이터 가져오기 data: list[dict] = worksheet.get_all_records() - result: str = '' + result: str = "" error_times = 0 parse_error_times = 0 @@ -135,19 +418,27 @@ def main() -> None: N = 226 # for n in range (113, 169): for n in range(107, 108): - encoding = 'euc-kr' if n in euc_kr else 'utf-8' + encoding = "euc-kr" if n in euc_kr else "utf-8" try: if n in special_functions: function_name = f"scrap_{n}" if hasattr(sys.modules[__name__], function_name): function_to_call = getattr(sys.modules[__name__], function_name) if n < 57: - result = str(function_to_call(data[n - 1]['상세약력 링크']).councilors) + result = str( + function_to_call(data[n - 1]["상세약력 링크"]).councilors + ) else: - result = str(function_to_call(data[n - 1]['상세약력 링크'], args=args[n]).councilors) + result = str( + function_to_call( + data[n - 1]["상세약력 링크"], args=args[n] + ).councilors + ) else: - result = str(scrap_basic(data[n - 1]['상세약력 링크'], n, args[n], encoding).councilors) - if '정보 없음' in result: + result = str( + scrap_basic(data[n - 1]["상세약력 링크"], n, args[n], encoding).councilors + ) + if "정보 없음" in result: print("정보 없음이 포함되어 있습니다.") parse_error_times += 1 print(result) @@ -158,6 +449,10 @@ def main() -> None: print(f"오류 : [district-{n}] {str(e)}") error_times += 1 continue # 에러가 발생하면 다음 반복으로 넘어감 - print(f"| 총 실행 횟수: {N} | 에러 횟수: {error_times} | 정보 없음 횟수: {parse_error_times} | 타임아웃 횟수: {timeouts} |") -if __name__ == '__main__': + print( + f"| 총 실행 횟수: {N} | 에러 횟수: {error_times} | 정보 없음 횟수: {parse_error_times} | 타임아웃 횟수: {timeouts} |" + ) + + +if __name__ == "__main__": main() diff --git a/scrap/utils/types.py b/scrap/utils/types.py index cf3733f..a6ed4f1 100644 --- a/scrap/utils/types.py +++ b/scrap/utils/types.py @@ -1,4 +1,4 @@ -#coding: utf-8 +# coding: utf-8 """ 의회 크롤링 결과를 나타내는 타입을 정의합니다. """ @@ -6,35 +6,42 @@ from typing import Optional, List from dataclasses import dataclass + class CouncilType(str, Enum): """ 의회의 종류를 나타내는 열거형입니다. """ - LOCAL_COUNCIL = "local_council" + + LOCAL_COUNCIL = "local_council" NATIONAL_COUNCIL = "national_council" METROPOLITAN_COUNCIL = "metropolitan_council" """ 기초의회 """ + def __str__(self): """ JSON으로 직렬화하기 위해 문자열로 변환하는 함수를 오버라이드합니다. """ return str(self.value) + @dataclass class Councilor: """ 의원(이름 및 정당)을 나타내는 타입입니다. """ + name: str party: str + @dataclass class ScrapResult: """ 의회 크롤링 결과를 나타내는 타입입니다. """ + council_id: str """ 의회를 구분하기 위한 문자열입니다. 
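The types reformatted above are plain dataclasses plus a str-backed Enum, which is what lets a scrape result serialize straight to JSON without a custom encoder. A minimal sketch, using only the fields shown in this diff:

import dataclasses
import json

from scrap.utils.types import Councilor, CouncilType, ScrapResult

result = ScrapResult(
    council_id="seoul-gwanakgu",
    council_type=CouncilType.LOCAL_COUNCIL,
    councilors=[Councilor(name="김철수", party="국민의힘")],
)

# Because CouncilType subclasses str, asdict() + json.dumps() emit
# "local_council" rather than "CouncilType.LOCAL_COUNCIL".
print(json.dumps(dataclasses.asdict(result), ensure_ascii=False))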
@@ -50,23 +57,26 @@ class ScrapResult: class ScrapBasicArgument: - ''' + """ scrap_basic에 쓸 argument입니다 - ''' - def __init__(self, - pf_elt: str | None = None, - pf_cls: str | None = None, - pf_memlistelt: str | None = None, - pf_memlistcls: str | None = None, - name_elt: str | None = None, - name_cls: str | None = None, - name_wrapelt: str | None = None, - name_wrapcls: str | None = None, - pty_elt: str | None = None, - pty_cls: str | None = None, - pty_wrapelt: str | None = None, - pty_wrapcls: str | None = None, - pty_wraptxt: str | None = None): + """ + + def __init__( + self, + pf_elt: str | None = None, + pf_cls: str | None = None, + pf_memlistelt: str | None = None, + pf_memlistcls: str | None = None, + name_elt: str | None = None, + name_cls: str | None = None, + name_wrapelt: str | None = None, + name_wrapcls: str | None = None, + pty_elt: str | None = None, + pty_cls: str | None = None, + pty_wrapelt: str | None = None, + pty_wrapcls: str | None = None, + pty_wraptxt: str | None = None, + ): """ ScrapBasicArgument 클래스의 생성자입니다. @@ -97,4 +107,4 @@ def __init__(self, self.pty_cls = pty_cls self.pty_wrapelt = pty_wrapelt self.pty_wrapcls = pty_wrapcls - self.pty_wraptxt = pty_wraptxt \ No newline at end of file + self.pty_wraptxt = pty_wraptxt
diff --git a/scrap/utils/utils.py b/scrap/utils/utils.py index 025ac0e..ef37957 100644 --- a/scrap/utils/utils.py +++ b/scrap/utils/utils.py @@ -1,20 +1,22 @@ from scrap.utils.requests import get_soup + def getPartyList(): """ 중앙선거관리위원회에서 제공하는 정당 목록을 가져옵니다. """ - url = 'https://www.nec.go.kr/site/nec/ex/bbs/List.do?cbIdx=1239' + url = "https://www.nec.go.kr/site/nec/ex/bbs/List.do?cbIdx=1239" soup = get_soup(url) - table = soup.find('table', class_='list type2') + table = soup.find("table", class_="list type2") partyList = [] - for tr in table.find('tbody').find_all('tr'): - td = tr.find_all('td') - if td[0].get_text(strip=True).split("\n")[0] == '시도': + for tr in table.find("tbody").find_all("tr"): + td = tr.find_all("td") + if td[0].get_text(strip=True).split("\n")[0] == "시도": continue # 더불어민주당(민주당, 더민주) 등은 약자가 괄호 안에 있다. partyList.append(td[0].get_text(strip=True).split("\n")[0].split("(")[0]) return partyList -if __name__ == '__main__': - print(getPartyList()) \ No newline at end of file + +if __name__ == "__main__": + print(getPartyList())