-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Scrap] 57-112 중 에러: [76, 78, 97, 101, 106, 111], 총 6회
- Loading branch information
Showing
6 changed files
with
245 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,4 +11,5 @@ pandas==2.1.1 | |
gspread==5.11.2 | ||
pymongo==4.5.0 | ||
python-dotenv==1.0.0 | ||
openpyxl | ||
openpyxl | ||
selenium |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,156 @@ | ||
"""광주광역시를 스크랩. 60-64번째 의회까지 있음. | ||
""" | ||
import os | ||
from selenium import webdriver | ||
from selenium.webdriver.chrome.service import Service | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.chrome.options import Options | ||
|
||
from scrap.utils.types import CouncilType, Councilor, ScrapResult | ||
from scrap.utils.requests import get_soup | ||
from scrap.local_councils.basic import * | ||
from scrap.utils.utils import getPartyList | ||
party_keywords = getPartyList() | ||
party_keywords.append("무소속") | ||
|
||
def scrap_62(
    url="http://www.gjnc.or.kr/main/contents/lawmakerDistrict",
) -> ScrapResult:
    """Scrape councillor names and parties for council 62 (Gwangju Seo-gu).

    Every councillor entry on the list page carries a link that opens a
    detail page in a separate browser window; the party is looked up among
    the ``<dd>`` elements of that detail page.

    NOTE(review): the original docstring says Seo-gu, but the default URL host
    is ``gjnc.or.kr`` - confirm which district this actually targets.

    :param url: URL of the councillor list page
    :return: ScrapResult holding the name and party of every councillor found
    :raises Exception: if no ``chromedriver`` executable is found on PATH
    """
    import shutil  # local import: only used to locate the driver binary

    councilors: list[Councilor] = []

    # shutil.which does the PATH lookup without spawning a shell.
    driver_loc = shutil.which("chromedriver")
    if not driver_loc:
        raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")

    webdriver_service = Service(driver_loc)
    browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    try:
        browser.get(url)

        councilor_infos = browser.find_elements(By.CSS_SELECTOR, "div[class='con']")
        cur_win = browser.current_window_handle

        for info in councilor_infos:
            # find_elements returns an empty list instead of raising, so the
            # "no name" fallback is actually reachable.
            name_tags = info.find_elements(By.TAG_NAME, "strong")
            name = name_tags[0].text.strip() if name_tags else "이름 정보 없음"

            # Open the councillor's detail page and move to the new window.
            homepage_link = info.find_element(By.TAG_NAME, "a")
            homepage_link.click()
            browser.switch_to.window(
                [win for win in browser.window_handles if win != cur_win][0]
            )

            # First <dd> whose text is a known party name; the default also
            # covers pages without any <dd>, where `party` used to be unbound
            # or left over from the previous councillor.
            dd_texts = (
                tag.text.strip() for tag in browser.find_elements(By.TAG_NAME, "dd")
            )
            party = next(
                (text for text in dd_texts if text in party_keywords),
                "정당 정보 없음",
            )

            # Close the detail window and return to the list page.
            browser.close()
            browser.switch_to.window(cur_win)

            councilors.append(Councilor(name, party))
    finally:
        # Always shut the browser down, even if scraping failed midway.
        browser.quit()

    return ScrapResult(
        council_id="62",
        council_type=CouncilType.LOCAL_COUNCIL,
        councilors=councilors,
    )
|
||
def scrap_63(
    url="https://council.bukgu.gwangju.kr/index.do?PID=024",
) -> ScrapResult:
    """Scrape councillor names and parties for council 63 (Gwangju Buk-gu).

    Name and party are both read from the ``ul.info`` entries of the list
    page itself; no detail pages are visited.

    :param url: URL of the councillor list page
    :return: ScrapResult holding the name and party of every councillor found
    :raises Exception: if no ``chromedriver`` executable is found on PATH
    """
    import shutil  # local import: only used to locate the driver binary

    councilors: list[Councilor] = []

    # shutil.which does the PATH lookup without spawning a shell.
    driver_loc = shutil.which("chromedriver")
    if not driver_loc:
        raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")

    webdriver_service = Service(driver_loc)
    browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    try:
        browser.get(url)

        councilor_infos = browser.find_elements(By.CSS_SELECTOR, "ul[class='info']")

        for info in councilor_infos:
            # find_elements returns an empty list instead of raising, so the
            # "no name" fallback is actually reachable.
            name_items = info.find_elements(By.CSS_SELECTOR, "li[class='name']")
            headings = (
                name_items[0].find_elements(By.TAG_NAME, "h5") if name_items else []
            )
            name = headings[0].text.strip() if headings else "이름 정보 없음"

            # First <dd> whose text is a known party name; the default also
            # covers entries without any <dd>, where `party` used to be
            # unbound or left over from the previous councillor.
            dd_texts = (
                tag.text.strip() for tag in info.find_elements(By.TAG_NAME, "dd")
            )
            party = next(
                (text for text in dd_texts if text in party_keywords),
                "정당 정보 없음",
            )

            councilors.append(Councilor(name, party))
    finally:
        # Always shut the browser down, even if scraping failed midway.
        browser.quit()

    return ScrapResult(
        council_id="63",
        council_type=CouncilType.LOCAL_COUNCIL,
        councilors=councilors,
    )
|
||
def scrap_64(
    url="https://gjgc.or.kr/main/contents/lawmaker",
) -> ScrapResult:
    """Scrape councillor names and parties for council 64 (Gwangju Gwangsan-gu).

    Name and party are both read from the ``div.con`` entries of the list
    page itself. Names longer than three characters have the title words
    "부의장", "의원" and "의장" removed.

    :param url: URL of the councillor list page
    :return: ScrapResult holding the name and party of every councillor found
    :raises Exception: if no ``chromedriver`` executable is found on PATH
    """
    import shutil  # local import: only used to locate the driver binary

    councilors: list[Councilor] = []

    # shutil.which does the PATH lookup without spawning a shell.
    driver_loc = shutil.which("chromedriver")
    if not driver_loc:
        raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")

    webdriver_service = Service(driver_loc)
    browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    try:
        browser.get(url)

        councilor_infos = browser.find_elements(By.CSS_SELECTOR, "div[class='con']")

        for info in councilor_infos:
            # find_elements returns an empty list instead of raising, so the
            # "no name" fallback is actually reachable.
            name_tags = info.find_elements(By.TAG_NAME, "strong")
            name = name_tags[0].text.strip() if name_tags else "이름 정보 없음"
            if len(name) > 3:
                # A title is attached before or after the name
                # (original note: council 119, Gangseo-gu, etc.).
                # "부의장" must be removed before "의장", which it contains.
                for keyword in ["부의장", "의원", "의장"]:
                    if keyword in name:
                        name = name.replace(keyword, "").strip()

            # First <dd> whose text, with spaces removed, is a known party
            # name; the default also covers entries without any <dd>, where
            # `party` used to be unbound or left over from the previous
            # councillor.
            dd_texts = (
                tag.text.replace(" ", "")
                for tag in info.find_elements(By.TAG_NAME, "dd")
            )
            party = next(
                (text for text in dd_texts if text in party_keywords),
                "정당 정보 없음",
            )

            councilors.append(Councilor(name, party))
    finally:
        # Always shut the browser down, even if scraping failed midway.
        browser.quit()

    return ScrapResult(
        council_id="64",
        council_type=CouncilType.LOCAL_COUNCIL,
        councilors=councilors,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters