
Commit

[Scrap] Errors among councils 57-112: [76, 78, 97, 101, 106, 111], 6 in total
Re-st committed Nov 6, 2023
1 parent f8ae213 commit e852d45
Showing 6 changed files with 245 additions and 40 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
@@ -11,4 +11,5 @@ pandas==2.1.1
gspread==5.11.2
pymongo==4.5.0
python-dotenv==1.0.0
openpyxl
openpyxl
selenium
8 changes: 7 additions & 1 deletion scrap/local_councils/basic.py
@@ -56,6 +56,8 @@ def get_name(profile, element, class_, wrapper_element, wrapper_class_):
    # Remove everything inside span tags
for span in name_tag.find_all("span"):
span.decompose()
    for a_tag in name_tag.find_all('a'):  # e.g. Incheon Seo-gu: a '개인홈페이지' (personal homepage) link sits inside
a_tag.extract()
name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"

    # Case where name is long and the real name sits inside a strong tag; e.g. Eunpyeong-gu, Suwon-si.
@@ -72,7 +74,11 @@ def get_name(profile, element, class_, wrapper_element, wrapper_class_):
        if keyword in name:  # e.g. Incheon Seo-gu
name = name.replace(keyword, "").strip()
break
    name = name.split(" ")[0]  # case where a title follows the name
    maybe_name = name.split()[0]  # case where a title follows the name
    if len(maybe_name) == 1:  # a one-syllable name that a space split apart
name = "".join(name.split()[0:2])
else:
name = maybe_name
return name


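For clarity, a minimal, self-contained sketch of the heuristic this hunk adds (not from the repo; _strip_title is a hypothetical name): take the first whitespace-separated token, but if that token is a single character, assume a space split the name itself and rejoin the first two tokens.

# Sketch of the new name heuristic; names below are illustrative only.
def _strip_title(name: str) -> str:
    parts = name.split()
    if not parts:
        return name
    if len(parts[0]) == 1 and len(parts) >= 2:
        # e.g. "김 웅 의원" -> "김웅": a space split the name itself
        return "".join(parts[:2])
    return parts[0]  # e.g. "홍길동 의원" -> "홍길동": drop the trailing title

assert _strip_title("홍길동 의원") == "홍길동"
assert _strip_title("김 웅 의원") == "김웅"
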
13 changes: 2 additions & 11 deletions scrap/local_councils/busan.py
@@ -1,22 +1,13 @@
from urllib.parse import urlparse

import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from time import sleep

import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from time import sleep

from scrap.utils.types import CouncilType, Councilor, ScrapResult
from scrap.utils.requests import get_soup

from urllib.parse import urlparse
from time import sleep

def scrap_26(
url="https://www.bsjunggu.go.kr/council/board/list.junggu?boardId=BBS_0000118&menuCd=DOM_000000503003000000&contentsSid=755&cpath=%2Fcouncil",
64 changes: 62 additions & 2 deletions scrap/local_councils/gangwon.py
@@ -1,11 +1,71 @@
from urllib.parse import urlparse
import re
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from scrap.utils.types import CouncilType, Councilor, ScrapResult, ScrapBasicArgument
from scrap.utils.requests import get_soup
from scrap.local_councils.basic import *
from scrap.utils.utils import getPartyList

party_keywords = getPartyList()
party_keywords.append("무소속")  # treat independents ("무소속") as a valid party label

def scrap_107(
url="https://council.wonju.go.kr/content/member/memberName.html",
) -> ScrapResult:
"""강원도 원주시 페이지에서 의원 상세약력 스크랩
:param url: 의원 목록 사이트 url
:return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
"""
councilors: list[Councilor] = []

driver_loc = os.popen("which chromedriver").read().strip()
if len(driver_loc) == 0:
raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")

webdriver_service = Service(driver_loc)
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get(url)

pfs_wrapper = browser.find_element(By.CSS_SELECTOR, "div[id='content']")
councilor_infos = pfs_wrapper.find_elements(By.CSS_SELECTOR, "dl")
for info in councilor_infos:
name_tag = info.find_element(By.CSS_SELECTOR, "dd[class='name']")
name = name_tag.text.split("(")[0].strip() if name_tag else "이름 정보 없음"
if len(name) > 3:
            # a modifier (e.g. a title) is attached before or after the name
            for keyword in ["부의장", "의원", "의장"]:  # e.g. 119, Gangseo-gu
if keyword in name:
name = name.replace(keyword, "").strip()
        party = "정당 정보 없음"  # set the default up front so an empty dd list cannot leave party unbound
        party_tag = info.find_elements(By.TAG_NAME, "dd")
        for tag in party_tag:
            party = tag.text.split(" ")[-1]
            if party in party_keywords:
                break
        if party not in party_keywords:
            party = "정당 정보 없음"

councilors.append(Councilor(name, party))

return ScrapResult(
council_id="107",
council_type=CouncilType.LOCAL_COUNCIL,
councilors=councilors,
)
# 107: ScrapBasicArgument(
# pf_memlistelt="div",
# pf_memlistcls="content",
# pf_elt="dl",
# name_elt="dd",
# name_cls="name",
# pty_elt="span",
# ),
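
Aside: scrap_107 was moved to Selenium because of the DH_KEY_TOO_SMALL SSL failure recorded in scrap/utils/spreadsheet.py below. A possible requests-only workaround — a sketch, assuming requests/urllib3 on OpenSSL 1.1+; LegacySSLAdapter is a hypothetical name — is to lower OpenSSL's security level for this one host:

import ssl

import requests
from requests.adapters import HTTPAdapter

class LegacySSLAdapter(HTTPAdapter):
    """Accept the server's small DH key by dropping OpenSSL to
    security level 1 for connections made through this adapter."""

    def init_poolmanager(self, *args, **kwargs):
        ctx = ssl.create_default_context()
        ctx.set_ciphers("DEFAULT:@SECLEVEL=1")
        kwargs["ssl_context"] = ctx
        return super().init_poolmanager(*args, **kwargs)

session = requests.Session()
session.mount("https://council.wonju.go.kr", LegacySSLAdapter())
# soup could then be built from session.get(url).text instead of driving Chrome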
def scrap_113(
url="https://sokchocl.go.kr/kr/member/active.do", args: ScrapBasicArgument = None
) -> ScrapResult:
155 changes: 153 additions & 2 deletions scrap/local_councils/gwangju.py
@@ -1,5 +1,156 @@
"""광주광역시를 스크랩. 60-64번째 의회까지 있음.
"""
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from scrap.utils.types import CouncilType, Councilor, ScrapResult
from scrap.utils.requests import get_soup
from scrap.local_councils.basic import *
from scrap.utils.utils import getPartyList

party_keywords = getPartyList()
party_keywords.append("무소속")  # treat independents ("무소속") as a valid party label

def scrap_62(
url="http://www.gjnc.or.kr/main/contents/lawmakerDistrict",
) -> ScrapResult:
"""광주시 서구 페이지에서 의원 상세약력 스크랩
:param url: 의원 목록 사이트 url
:return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
"""
councilors: list[Councilor] = []

driver_loc = os.popen("which chromedriver").read().strip()
if len(driver_loc) == 0:
raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")

webdriver_service = Service(driver_loc)
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get(url)

councilor_infos = browser.find_elements(By.CSS_SELECTOR, "div[class='con']")
cur_win = browser.current_window_handle

for info in councilor_infos:
name_tag = info.find_element(By.TAG_NAME, "strong")
name = name_tag.text.strip() if name_tag else "이름 정보 없음"
homepage_link = info.find_element(By.TAG_NAME, "a")
homepage_link.click()
browser.switch_to.window(
[win for win in browser.window_handles if win != cur_win][0]
)

        party = "정당 정보 없음"  # set the default up front so an empty dd list cannot leave party unbound
        party_tag = browser.find_elements(By.TAG_NAME, "dd")
        for tag in party_tag:
            party = tag.text.strip()
            if party in party_keywords:
                break
        if party not in party_keywords:
            party = "정당 정보 없음"

browser.close()
browser.switch_to.window(cur_win)

councilors.append(Councilor(name, party))

return ScrapResult(
council_id="62",
council_type=CouncilType.LOCAL_COUNCIL,
councilors=councilors,
)
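
One caveat in scrap_62 above: switch_to runs immediately after click(), assuming the profile popup is already open, which can race under headless timing. A sketch of a more robust variant using Selenium's explicit waits (reusing browser, cur_win and homepage_link from the function above):

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

homepage_link.click()
# Wait up to 10s for the popup window to actually exist before switching.
WebDriverWait(browser, 10).until(EC.number_of_windows_to_be(2))
new_win = next(win for win in browser.window_handles if win != cur_win)
browser.switch_to.window(new_win)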

def scrap_63(
url="https://council.bukgu.gwangju.kr/index.do?PID=024",
) -> ScrapResult:
"""광주시 북구 페이지에서 의원 상세약력 스크랩
:param url: 의원 목록 사이트 url
:return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
"""
councilors: list[Councilor] = []

driver_loc = os.popen("which chromedriver").read().strip()
if len(driver_loc) == 0:
raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")

webdriver_service = Service(driver_loc)
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get(url)

councilor_infos = browser.find_elements(By.CSS_SELECTOR, "ul[class='info']")

for info in councilor_infos:
name_tag = info.find_element(By.CSS_SELECTOR, "li[class='name']").find_element(By.TAG_NAME, "h5")
name = name_tag.text.strip() if name_tag else "이름 정보 없음"
        party = "정당 정보 없음"  # set the default up front so an empty dd list cannot leave party unbound
        party_tag = info.find_elements(By.TAG_NAME, "dd")
        for tag in party_tag:
            party = tag.text.strip()
            if party in party_keywords:
                break
        if party not in party_keywords:
            party = "정당 정보 없음"

councilors.append(Councilor(name, party))

return ScrapResult(
council_id="63",
council_type=CouncilType.LOCAL_COUNCIL,
councilors=councilors,
)

def scrap_64(
url="https://gjgc.or.kr/main/contents/lawmaker",
) -> ScrapResult:
"""광주시 광산구 페이지에서 의원 상세약력 스크랩
:param url: 의원 목록 사이트 url
:return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
"""
councilors: list[Councilor] = []

driver_loc = os.popen("which chromedriver").read().strip()
if len(driver_loc) == 0:
raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")

webdriver_service = Service(driver_loc)
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get(url)

councilor_infos = browser.find_elements(By.CSS_SELECTOR, "div[class='con']")

for info in councilor_infos:
name_tag = info.find_element(By.TAG_NAME, "strong")
name = name_tag.text.strip() if name_tag else "이름 정보 없음"
if len(name) > 3:
            # a modifier (e.g. a title) is attached before or after the name
            for keyword in ["부의장", "의원", "의장"]:  # e.g. 119, Gangseo-gu
if keyword in name:
name = name.replace(keyword, "").strip()
        party = "정당 정보 없음"  # set the default up front so an empty dd list cannot leave party unbound
        party_tag = info.find_elements(By.TAG_NAME, "dd")
        for tag in party_tag:
            party = tag.text.replace(" ", "")
            if party in party_keywords:
                break
        if party not in party_keywords:
            party = "정당 정보 없음"

councilors.append(Councilor(name, party))

return ScrapResult(
council_id="64",
council_type=CouncilType.LOCAL_COUNCIL,
councilors=councilors,
)
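
The driver setup above is repeated verbatim in scrap_62/63/64 and scrap_107. A shared helper would deduplicate it — a sketch only; get_headless_chrome is a hypothetical name, not in the repo:

import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

def get_headless_chrome() -> webdriver.Chrome:
    """Locate chromedriver on PATH and start a headless Chrome session."""
    driver_loc = os.popen("which chromedriver").read().strip()
    if not driver_loc:
        raise Exception("ChromeDriver를 다운로드한 후 다시 시도해주세요.")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    return webdriver.Chrome(service=Service(driver_loc), options=options)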
42 changes: 19 additions & 23 deletions scrap/utils/spreadsheet.py
@@ -61,10 +61,12 @@ def main() -> None:
euc_kr = [6, 13, 16, 31, 72, 88, 112, 134, 154, 157, 163, 165, 167, 181, 197, 202]
special_functions = (
list(range(1, 57))
+ [57, 88, 103]
+ [62, 63, 64, 88, 103, 107]
+ list(range(113, 127))
+ [132, 134, 140, 142, 154, 155, 156, 157, 160, 161, 162, 163, 164, 165, 167]
)
no_information = [106, 111]
errors = []
args = {
2: ScrapBasicArgument(
pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em"
@@ -75,11 +77,10 @@ def main() -> None:
    # Incheon
57: ScrapBasicArgument(
pf_elt="div",
pf_cls="box",
pf_cls="conbox",
name_elt="p",
name_cls="mem_tit2",
pty_elt="p",
pty_cls="mem_tit2",
name_cls="name",
pty_elt="li",
),
58: ScrapBasicArgument(
pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em"
@@ -98,9 +99,7 @@ def main() -> None:
61: ScrapBasicArgument(
pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em"
),
    # 62 : TODO! How do we get /common/selectCouncilMemberProfile.json..
    # 63 : TODO! Homepage is down
    # 64 : TODO! How do we get /common/selectCouncilMemberProfile.json..
# 62 - 64 : gwangju.py
    # Daejeon
65: ScrapBasicArgument(
pf_elt="dl",
@@ -372,16 +371,8 @@ def main() -> None:
pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em"
),
    # Gangwon
    # 106 : TODO! No party information
    # TODO! 107 fails in get_soup - HTTPSConnectionPool(host='council.wonju.go.kr', port=443): Max retries exceeded with url: /content/member/memberName.html (Caused by SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1007)')))
107: ScrapBasicArgument(
pf_memlistelt="div",
pf_memlistcls="content",
pf_elt="dl",
name_elt="dd",
name_cls="name",
pty_elt="span",
),
    # 106 : no party information
    # 107 : gangwon.py
108: ScrapBasicArgument(
pf_elt="dl", pf_cls="profile", name_elt="strong", pty_elt="li"
),
@@ -625,11 +616,15 @@ def main() -> None:
data: list[dict] = worksheet.get_all_records()
result: str = ""

error_times = 0
parse_error_times = 0
timeouts = 0
N = 226
for n in range(1, 227):
for n in range(57, 113):
        if n in no_information:
            print(f"| {n} | 오류: 지난번 확인 시, 정당 정보 등이 홈페이지에 없었습니다. "
                  "다시 확인해보시겠어요? 링크 : ", data[n - 1]["URL"])
            errors.append(n)
            continue
encoding = "euc-kr" if n in euc_kr else "utf-8"
result = None
try:
Expand All @@ -640,7 +635,7 @@ def main() -> None:
function_name = f"scrap_{n}"
if hasattr(sys.modules[__name__], function_name):
function_to_call = getattr(sys.modules[__name__], function_name)
if n < 57:
if n < 57 or n in [62, 63, 64, 107]:
result = str(function_to_call(council_url).councilors)
else:
result = str(
Expand All @@ -653,16 +648,17 @@ def main() -> None:
if "정보 없음" in result:
print("정보 없음이 포함되어 있습니다.")
parse_error_times += 1
errors.append(n)
print(f"| {n} | {result}")
except Timeout:
print(f"| {n} | 오류: Request to {council_url} timed out.")
timeouts += 1
except Exception as e:
print(f"| {n} | 오류: {e}")
error_times += 1
errors.append(n)
            continue  # on error, move on to the next iteration
print(
f"| 총 실행 횟수: {N} | 에러 횟수: {error_times} | 정보 없음 횟수: {parse_error_times} | 타임아웃 횟수: {timeouts} |"
f"| 총 실행 횟수: {N} | 에러: {errors}, 총 {len(errors)}회 | 그 중 정보 없음 횟수: {parse_error_times} | 타임아웃 횟수: {timeouts} |"
)
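
For reference, the hasattr/getattr dispatch in the loop above reduces to this pattern — a minimal sketch; dispatch is a hypothetical name:

import sys

def dispatch(n: int, url: str):
    """Call the module-level scrap_<n> function for council n, if it exists."""
    fn = getattr(sys.modules[__name__], f"scrap_{n}", None)
    if fn is None:
        raise KeyError(f"no scrap_{n} in this module")
    return fn(url)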


