Skip to content

Commit

Permalink
[scrap] JS 기반 스크랩 추가
Browse files Browse the repository at this point in the history
  • Loading branch information
pingpingy1 authored and Re-st committed Nov 13, 2023
1 parent 81ed335 commit 709e115
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 82 deletions.
12 changes: 0 additions & 12 deletions scrap/group_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,6 @@ def scrap_group_heads(

browser = get_selenium(url)

# driver_loc = os.popen("which chromedriver").read().strip()
# if len(driver_loc) == 0:
# raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")

# chrome_options = Options()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--no-sandbox")

# webdriver_service = Service(driver_loc)
# browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
# browser.get(url)

areas = [
tag.text.strip()
for tag in browser.find_element(
Expand Down
18 changes: 2 additions & 16 deletions scrap/local_councils/gangwon.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from scrap.local_councils.basic import *
from scrap.utils.requests import get_selenium, By
from scrap.utils.utils import getPartyList

party_keywords = getPartyList()
Expand All @@ -17,18 +14,7 @@ def scrap_107(
) -> ScrapResult:
"""강원도 원주시"""
councilors: list[Councilor] = []

driver_loc = os.popen("which chromedriver").read().strip()
if len(driver_loc) == 0:
raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")

webdriver_service = Service(driver_loc)
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get(url)
browser = get_selenium(url)

pfs_wrapper = browser.find_element(By.CSS_SELECTOR, "div[id='content']")
councilor_infos = pfs_wrapper.find_elements(By.CSS_SELECTOR, "dl")
Expand Down
75 changes: 36 additions & 39 deletions scrap/local_councils/incheon.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,32 +27,34 @@ def scrap_50(url, cid) -> ScrapResult:

def scrap_51(url, cid) -> ScrapResult:
    """Scrape local councilors of Incheon Dong-gu.

    Loads the member-list page with Selenium (the site renders via JS and
    previously failed plain HTTPS requests with SSLV3_ALERT_HANDSHAKE_FAILURE),
    then opens each member's detail pop-up window to read the party name.

    :param url: member-list page URL.
    :param cid: council id forwarded to ret_local_councilors.
    :return: ScrapResult built from the collected (name, party) pairs.
    """
    browser = get_selenium(url)
    councilors: list[Councilor] = []

    # Remember the list window so we can come back after each detail pop-up.
    cur_win = browser.current_window_handle

    for profile in browser.find_elements(By.CSS_SELECTOR, "dl[class='profile']"):
        # find_element raises when missing, so use find_elements and check
        # emptiness to make the "이름 정보 없음" fallback actually reachable.
        name_tags = profile.find_elements(By.CSS_SELECTOR, "strong[class='name']")
        name = name_tags[0].text.strip() if name_tags else "이름 정보 없음"

        party = "정당 정보 없음"
        links = profile.find_elements(By.TAG_NAME, "a")
        if links:
            # NOTE(review): assumes the click opens the detail page in a NEW
            # window/tab — confirm against the live site.
            links[0].click()
            new_wins = [win for win in browser.window_handles if win != cur_win]
            if new_wins:
                browser.switch_to.window(new_wins[0])
                # Second 'span.detail' on the detail page holds the party name;
                # guard the index so one malformed page doesn't abort the scrape.
                detail_tags = browser.find_elements(
                    By.CSS_SELECTOR, "span[class='detail']"
                )
                if len(detail_tags) > 1 and detail_tags[1].text.strip():
                    party = detail_tags[1].text.strip()
                browser.close()
                browser.switch_to.window(cur_win)

        councilors.append(Councilor(name, party))

    return ret_local_councilors(cid, councilors)


def scrap_52(url, cid) -> ScrapResult:
Expand Down Expand Up @@ -121,27 +123,22 @@ def scrap_54(url, cid) -> ScrapResult:

def scrap_55(url, cid) -> ScrapResult:
    """Scrape local councilors of Incheon Bupyeong-gu.

    Loads the member-list page with Selenium (the site renders via JS and
    previously failed plain HTTPS requests with SSLV3_ALERT_HANDSHAKE_FAILURE)
    and reads name and party from each profile card.

    :param url: member-list page URL.
    :param cid: council id forwarded to ret_local_councilors.
    :return: ScrapResult built from the collected (name, party) pairs.
    """
    browser = get_selenium(url)
    councilors: list[Councilor] = []

    for profile in browser.find_elements(By.CSS_SELECTOR, "dl[class='profile']"):
        # find_element raises when missing, so use find_elements and check
        # emptiness to make the "이름 정보 없음" fallback actually reachable.
        name_tags = profile.find_elements(By.CSS_SELECTOR, "strong[class='name']")
        # First whitespace-separated token is the name (rest is a title suffix).
        name = (
            name_tags[0].text.strip().split()[0].strip()
            if name_tags and name_tags[0].text.strip()
            else "이름 정보 없음"
        )

        party = "정당 정보 없음"
        items = profile.find_elements(By.TAG_NAME, "li")
        # Third <li> carries the party; guard the index and the inner <span>
        # so one malformed card doesn't abort the whole scrape.
        if len(items) > 2:
            spans = items[2].find_elements(By.TAG_NAME, "span")
            if spans and spans[0].text.strip():
                party = spans[0].text.strip().split()[-1].strip()

        councilors.append(Councilor(name, party))

    return ret_local_councilors(cid, councilors)


def scrap_56(url, cid) -> ScrapResult:
Expand Down Expand Up @@ -201,4 +198,4 @@ def scrap_57(url, args) -> ScrapResult:


if __name__ == "__main__":
    # Ad-hoc manual check for the Incheon Dong-gu scraper.
    print(scrap_51("https://council.icdonggu.go.kr/korean/member/active", 51))
43 changes: 28 additions & 15 deletions scrap/local_councils/jeolla.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from scrap.local_councils import *
from scrap.utils.requests import get_selenium, By


def scrap_154(
Expand Down Expand Up @@ -86,24 +87,36 @@ def scrap_157(
return ret_local_councilors(cid, councilors)


def scrap_160(url, cid) -> ScrapResult:
    """Scrape local councilors of Imsil-gun, Jeollabuk-do.

    The page is rendered dynamically with JS, so it is loaded with Selenium.
    The site exposes no party information, so every councilor gets the
    "정당 정보 없음" placeholder.

    :param url: member-list page URL.
    :param cid: council id forwarded to ret_local_councilors.
    :return: ScrapResult built from the collected (name, party) pairs.
    """
    browser = get_selenium(url)
    councilors: list[Councilor] = []

    for profile in browser.find_elements(By.CSS_SELECTOR, "div[class='col-lg-6']"):
        # find_element raises when missing, so use find_elements and check
        # emptiness to make the "이름 정보 없음" fallback actually reachable.
        name_tags = profile.find_elements(By.TAG_NAME, "strong")
        name = name_tags[0].text.strip() if name_tags else "이름 정보 없음"

        councilors.append(Councilor(name, "정당 정보 없음"))

    return ret_local_councilors(cid, councilors)


def scrap_161(url, cid) -> ScrapResult:
    """Scrape local councilors of Sunchang-gun, Jeollabuk-do.

    The page is rendered dynamically with JS, so it is loaded with Selenium.
    Name and party are read from each 'div.con' profile card.

    :param url: member-list page URL.
    :param cid: council id forwarded to ret_local_councilors.
    :return: ScrapResult built from the collected (name, party) pairs.
    """
    browser = get_selenium(url)
    councilors: list[Councilor] = []

    for profile in browser.find_elements(By.CSS_SELECTOR, "div[class='con']"):
        # find_element raises when missing, so use find_elements and check
        # emptiness to make the "이름 정보 없음" fallback actually reachable.
        name_tags = profile.find_elements(By.TAG_NAME, "strong")
        # [:-2] drops a two-character suffix after the name (presumably a
        # title like "의원" — confirm against the live markup).
        name = name_tags[0].text.strip()[:-2].strip() if name_tags else "이름 정보 없음"

        # Second <dd> carries the party; guard the index so one malformed
        # card doesn't abort the whole scrape.
        dds = profile.find_elements(By.TAG_NAME, "dd")
        party = (
            dds[1].text.strip()
            if len(dds) > 1 and dds[1].text.strip()
            else "정당 정보 없음"
        )

        councilors.append(Councilor(name, party))

    return ret_local_councilors(cid, councilors)


def scrap_162(
Expand Down Expand Up @@ -226,4 +239,4 @@ def scrap_167(


if __name__ == "__main__":
    # Ad-hoc manual check for the Sunchang-gun scraper.
    print(scrap_161("https://www.sunchangcouncil.go.kr/main/contents/lawmaker", 161))

0 comments on commit 709e115

Please sign in to comment.