Merge pull request #370 from sparcs-kaist/refactor/portal-view-crawling
Add crawler to track portal view count for portal articles
retroinspect authored Feb 6, 2024
2 parents 30cd44f + 5c4f19b commit c73f252
Showing 13 changed files with 325 additions and 35 deletions.
1 change: 1 addition & 0 deletions .env.example
@@ -12,3 +12,4 @@ PORTAL_2FA_KEY=/[2-7A-Z]{16}/
DOCKERHUB_USERNAME=
DOCKERHUB_PASSWORD=
SENTRY_DSN=
PORTAL_JSESSIONID=
10 changes: 10 additions & 0 deletions apps/core/management/commands/crawl_portal_view.py
@@ -0,0 +1,10 @@
from django.core.management import BaseCommand

from apps.core.management.scripts.portal_crawler import crawl_view


class Command(BaseCommand):
help = "Crawls the view counts of portal notice articles"
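# Django resolves command names from the module name, so this runs as:
#   python manage.py crawl_portal_view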

def handle(self, *args, **options):
crawl_view()
197 changes: 167 additions & 30 deletions apps/core/management/scripts/portal_crawler.py
@@ -1,8 +1,7 @@
import hashlib
import re
import uuid
from datetime import datetime
from pytz import timezone as pytz_timezone
from datetime import datetime, timedelta

import boto3
import requests
@@ -11,10 +10,13 @@
from django.db import transaction
from django.utils import timezone
from django.utils.translation import gettext
from pytz import timezone as pytz_timezone
from tqdm import tqdm

from apps.core.models import Article
from apps.core.models.portal_view_count import PortalViewCount
from apps.user.models import UserProfile
from ara.log import log
from ara.settings import (
AWS_S3_BUCKET_NAME,
PORTAL_ID,
@@ -40,6 +42,7 @@
BASE_URL = "https://portal.kaist.ac.kr"

KST = pytz_timezone("Asia/Seoul")
PORTAL_NOTICE_BOARD_ID = 1


def _login_kaist_portal():
@@ -48,11 +51,18 @@ def _login_kaist_portal():
f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page=1&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC",
cookies=COOKIES,
)
print("_login_kaist_portal status code: ", response.status_code)
log.info(f"_login_kaist_portal status code: {response.status_code}")
return session


def _get_article(url, session):
def _list_link_to_full_link(link):
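# a portal list link ends in ".../{board_id}/{num}"; rebuild it as the
# read.brd URL the crawler actually fetches (boardId and bltnNo parameters)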
board_id = link.split("/")[-2]
num = link.split("/")[-1]
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
return full_link


def _get_portal_article(url, session):
def _already_hyperlinked(html):
soup = bs(html, "lxml")
tagged_links = []
@@ -92,6 +102,10 @@ def _enable_hyperlink(s):
def _get_new_url_and_save_to_s3(url, session):
if url.startswith("data:") or "." in url.split("/")[-1]: # not a portal image
return url

if url.startswith("/board"):
return f"https://{BASE_URL}/${url}"

enc = hashlib.md5()
enc.update(url.encode())
hash = enc.hexdigest()[:20]
@@ -108,8 +122,12 @@ def _save_portal_image(html, session):
soup = bs(html, "lxml")
for child in soup.find_all("img", {}):
old_url = child.attrs.get("src")
new_url = _get_new_url_and_save_to_s3(old_url, session)
child["src"] = new_url
try:
new_url = _get_new_url_and_save_to_s3(old_url, session)
child["src"] = new_url
except Exception as exc:
log.info(f"failed to rewrite image src: {child}")
raise exc

return str(soup)

@@ -123,18 +141,21 @@ def _save_portal_image(html, session):
.contents[0]
.strip()
)
created_at_str = (
soup.find("th", text="작성일(조회수)")
.findNext("td")
.contents[0]
.strip()
.split("(")[0]
)

created_at_view_count_str = (
soup.find("th", text="작성일(조회수)").findNext("td").contents[0].strip()
)

created_at_str = created_at_view_count_str.split("(")[0]
created_at = (
datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S")
.astimezone(KST)
.astimezone(timezone.utc)
)

view_count_str = created_at_view_count_str.split("(")[1].split(")")[0]
view_count = int(view_count_str)
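# e.g. a cell text of "2024.02.06 10:00:00(123)" (illustrative values)
# parses to view_count == 123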

title = soup.select("table > tbody > tr > td.req_first")[0].contents[0]

trs = soup.select("table > tbody > tr")
@@ -165,13 +186,15 @@ def _save_portal_image(html, session):
"content": html,
"writer": writer,
"created_at": created_at,
"view_count": view_count,
}


def crawl_hour(day=None):
# If today() were used as the parameter default, it would be evaluated only once and cached, so the date would never update
if day is None:
day = timezone.datetime.today().date()
log.info(f"crawl_hour running for day {day}")

session = _login_kaist_portal()

@@ -187,9 +210,9 @@ def _get_board_today(page_num):
dates = soup.select("table > tbody > tr > td:nth-child(5)")

if links:
print("------- portal login success!")
log.info("------- portal login success!")
else:
print("------- portal login failed!")
log.info("------- portal login failed!")

today_date = str(day).replace("-", ".")
for link, date in zip(links, dates):
@@ -219,7 +242,7 @@ def _get_board_today(page_num):

last_portal_article_in_db = (
Article.objects.filter(
parent_board_id=1,
parent_board_id=PORTAL_NOTICE_BOARD_ID,
)
.order_by("-created_at")
.first()
@@ -229,11 +252,8 @@
prev_title = ""

for link in links:
link = link["link"]
board_id = link.split("/")[-2]
num = link.split("/")[-1]
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
info = _get_article(full_link, session)
full_link = _list_link_to_full_link(link["link"])
info = _get_portal_article(full_link, session)

# Since the list is time-ordered, subsequent articles were posted more than an hour ago.

@@ -264,21 +284,25 @@ def _get_board_today(page_num):
)

article = Article(
parent_board_id=1,
parent_board_id=PORTAL_NOTICE_BOARD_ID,
title=info["title"],
content=info["content"],
content_text=info["content_text"],
created_by=user,
created_at=created_at_utc,
url=full_link,
latest_portal_view_count=info["view_count"],
)

new_articles.append(article)

prev_title = article.title

# Compare the last portal article in the DB with the earliest of the articles just crawled
if not new_articles:
log.info("no new articles")
return

earliest_new_article = new_articles[-1]
is_same_day = (
last_portal_article_in_db.created_at.date()
@@ -294,8 +318,21 @@ def _get_board_today(page_num):

created_articles = Article.objects.bulk_create(new_articles)

new_portal_view_counts = []

for article in created_articles:
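# record an initial view-count snapshot row for the newly created article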
portal_view_count = PortalViewCount(
article=article,
view_count=article.latest_portal_view_count,
)
new_portal_view_counts.append(portal_view_count)

PortalViewCount.objects.bulk_create(new_portal_view_counts)

for i in range(len(created_articles)):
print(f"crawled article: {created_articles[i].title}")
log.info(f"crawled article: {created_articles[i].title}")

log.info(f"created {len(created_articles)} articles")


def list_contains_article(articles, article_info):
@@ -327,18 +364,16 @@ def _get_board(page_num):
page_num = 1

while True:
print("page_num:", page_num)
log.info("page_num:", page_num)
links = []
link = _get_board(page_num)
if link:
links.extend(link)

with transaction.atomic():
for link in tqdm(links):
board_id = link.split("/")[-2]
num = link.split("/")[-1]
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
info = _get_article(full_link, session)
full_link = _list_link_to_full_link(link)
info = _get_portal_article(full_link, session)

user_exist = UserProfile.objects.filter(
nickname=info["writer"], is_newara=False
@@ -356,24 +391,126 @@ def _get_board(page_num):
picture="user_profiles/default_pictures/KAIST-logo.png",
)

a, created = Article.objects.get_or_create(
parent_board_id=1, # portal notice board
a, article_created = Article.objects.get_or_create(
parent_board_id=PORTAL_NOTICE_BOARD_ID, # portal notice board
title=info["title"],
content=info["content"],
content_text=info["content_text"],
created_by=user,
url=full_link,
)

if created:
if article_created:
a.created_at = info["created_at"]
a.save()

log.info(f"view_count: {info['view_count']}")

PortalViewCount.objects.update_or_create(
article=a,
view_count=info["view_count"],
)
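# with no defaults argument, both kwargs act as the lookup, so a new row is
# created only when this exact (article, view_count) pair is not yet recorded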

page_num += 1

else:
break


def crawl_view():
"""
update all portal_view_count of portal articles
from a week ago until now
"""
now = timezone.datetime.today().date()
log.info(f"crawl_view running on {now}")

week_ago = (
(datetime.today() - timedelta(days=7)).astimezone(KST).astimezone(timezone.utc)
)
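# the one-week cutoff is normalized to UTC so it can be compared against the
# created_at timestamps stored in the database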

session = _login_kaist_portal()

def _get_board_week(page_num):
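# fetch one list page; returns (rows, stop) where stop=True signals that a
# row older than week_ago was reached, so pagination should end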
board_req = session.get(
f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC",
cookies=COOKIES,
)
soup = bs(board_req.text, "lxml")
table = soup.select(".req_tbl_01")[0]
info_list_per_page = []

for row in table.find("tbody").find_all("tr"):
cells = row.find_all("td")
created_at_str = cells[4].text.strip()
created_at = (
datetime.strptime(created_at_str, "%Y.%m.%d")
.astimezone(KST)
.astimezone(timezone.utc)
)

if week_ago > created_at:
return info_list_per_page, True # stop

info = {
"title": cells[0].text.strip(),
"view_count": int(cells[3].text.strip()),
"link": cells[0].find("a").attrs["href"],
"created_at": created_at,
}

info_list_per_page.append(info)

return info_list_per_page, False

info_list = []
page_num = 1

while True:
info_list_per_page, stop = _get_board_week(page_num)
info_list.extend(info_list_per_page)
if stop:
break

page_num += 1

if len(info_list) == 0:
log.info("no portal notice in a week")
return

articles = Article.objects.filter(
created_at__gte=week_ago, parent_board_id=PORTAL_NOTICE_BOARD_ID
)
article_dict = {}

for a in articles:
article_dict[a.url] = a

new_portal_view_counts = []
updated_articles = []

for info in info_list:
full_link = _list_link_to_full_link(info["link"])

if full_link not in article_dict:
continue

article = article_dict[full_link]

portal_view_count = PortalViewCount(
article=article,
view_count=info["view_count"],
)

new_portal_view_counts.append(portal_view_count)

article.latest_portal_view_count = info["view_count"]
updated_articles.append(article)

Article.objects.bulk_update(updated_articles, ["latest_portal_view_count"])
PortalViewCount.objects.bulk_create(new_portal_view_counts)
log.info(f"crawled view count of {len(new_portal_view_counts)} portal notices")


if __name__ == "__main__":
_login_kaist_portal()
3 changes: 2 additions & 1 deletion apps/core/management/tasks.py
@@ -1,14 +1,15 @@
import time
from collections import defaultdict

from apps.core.management.scripts.portal_crawler import crawl_hour
from apps.core.management.scripts.portal_crawler import crawl_hour, crawl_view
from apps.core.management.scripts.reminder_email_for_reply import send_email
from apps.core.models import BestArticle
from ara import celery_app, redis


@celery_app.task
def crawl_portal():
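# first refresh view counts for the past week, then crawl newly posted notices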
crawl_view()
crawl_hour()

