Merge pull request #370 from sparcs-kaist/refactor/portal-view-crawling
Add crawler to track portal view count for portal articles
retroinspect authored Feb 6, 2024
2 parents 30cd44f + 5c4f19b commit c73f252
Showing 13 changed files with 325 additions and 35 deletions.
1 change: 1 addition & 0 deletions .env.example
@@ -12,3 +12,4 @@ PORTAL_2FA_KEY=/[2-7A-Z]{16}/
DOCKERHUB_USERNAME=
DOCKERHUB_PASSWORD=
SENTRY_DSN=
PORTAL_JSESSIONID=
10 changes: 10 additions & 0 deletions apps/core/management/commands/crawl_portal_view.py
@@ -0,0 +1,10 @@
from django.core.management import BaseCommand

from apps.core.management.scripts.portal_crawler import crawl_view


class Command(BaseCommand):
help = "Crawls the view counts of portal notice articles"
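# Django resolves command names from the module name, so this runs as:
#   python manage.py crawl_portal_view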

def handle(self, *args, **options):
crawl_view()
197 changes: 167 additions & 30 deletions apps/core/management/scripts/portal_crawler.py
@@ -1,8 +1,7 @@
import hashlib
import re
import uuid
from datetime import datetime
from pytz import timezone as pytz_timezone
from datetime import datetime, timedelta

import boto3
import requests
@@ -11,10 +10,13 @@
from django.db import transaction
from django.utils import timezone
from django.utils.translation import gettext
from pytz import timezone as pytz_timezone
from tqdm import tqdm

from apps.core.models import Article
from apps.core.models.portal_view_count import PortalViewCount
from apps.user.models import UserProfile
from ara.log import log
from ara.settings import (
AWS_S3_BUCKET_NAME,
PORTAL_ID,
@@ -40,6 +42,7 @@
BASE_URL = "https://portal.kaist.ac.kr"

KST = pytz_timezone("Asia/Seoul")
PORTAL_NOTICE_BOARD_ID = 1


def _login_kaist_portal():
@@ -48,11 +51,18 @@ def _login_kaist_portal():
f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page=1&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC",
cookies=COOKIES,
)
print("_login_kaist_portal status code: ", response.status_code)
log.info(f"_login_kaist_portal status code: {response.status_code}")
return session


def _get_article(url, session):
def _list_link_to_full_link(link):
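# a portal list link ends in ".../{board_id}/{num}"; rebuild it as the
# read.brd URL the crawler actually fetches (boardId and bltnNo parameters)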
board_id = link.split("/")[-2]
num = link.split("/")[-1]
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
return full_link


def _get_portal_article(url, session):
def _already_hyperlinked(html):
soup = bs(html, "lxml")
tagged_links = []
@@ -92,6 +102,10 @@ def _enable_hyperlink(s):
def _get_new_url_and_save_to_s3(url, session):
if url.startswith("data:") or "." in url.split("/")[-1]: # not a portal image
return url

if url.startswith("/board"):
return f"https://{BASE_URL}/${url}"

enc = hashlib.md5()
enc.update(url.encode())
hash = enc.hexdigest()[:20]
@@ -108,8 +122,12 @@ def _save_portal_image(html, session):
soup = bs(html, "lxml")
for child in soup.find_all("img", {}):
old_url = child.attrs.get("src")
new_url = _get_new_url_and_save_to_s3(old_url, session)
child["src"] = new_url
try:
new_url = _get_new_url_and_save_to_s3(old_url, session)
child["src"] = new_url
except Exception as exc:
log.info(f"failed to rewrite image src: {child}")
raise exc

return str(soup)

@@ -123,18 +141,21 @@ def _save_portal_image(html, session):
.contents[0]
.strip()
)
created_at_str = (
soup.find("th", text="작성일(조회수)")
.findNext("td")
.contents[0]
.strip()
.split("(")[0]
)

created_at_view_count_str = (
soup.find("th", text="작성일(조회수)").findNext("td").contents[0].strip()
)

created_at_str = created_at_view_count_str.split("(")[0]
created_at = (
datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S")
.astimezone(KST)
.astimezone(timezone.utc)
)

view_count_str = created_at_view_count_str.split("(")[1].split(")")[0]
view_count = int(view_count_str)
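# e.g. a cell text of "2024.02.06 10:00:00(123)" (illustrative values)
# parses to view_count == 123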

title = soup.select("table > tbody > tr > td.req_first")[0].contents[0]

trs = soup.select("table > tbody > tr")
@@ -165,13 +186,15 @@ def _save_portal_image(html, session):
"content": html,
"writer": writer,
"created_at": created_at,
"view_count": view_count,
}


def crawl_hour(day=None):
# If today() were used as the parameter default, it would be evaluated only once and cached, so the date would never update
if day is None:
day = timezone.datetime.today().date()
log.info(f"crawl_hour running for day {day}")

session = _login_kaist_portal()

@@ -187,9 +210,9 @@ def _get_board_today(page_num):
dates = soup.select("table > tbody > tr > td:nth-child(5)")

if links:
print("------- portal login success!")
log.info("------- portal login success!")
else:
print("------- portal login failed!")
log.info("------- portal login failed!")

today_date = str(day).replace("-", ".")
for link, date in zip(links, dates):
@@ -219,7 +242,7 @@ def _get_board_today(page_num):

last_portal_article_in_db = (
Article.objects.filter(
parent_board_id=1,
parent_board_id=PORTAL_NOTICE_BOARD_ID,
)
.order_by("-created_at")
.first()
@@ -229,11 +252,8 @@
prev_title = ""

for link in links:
link = link["link"]
board_id = link.split("/")[-2]
num = link.split("/")[-1]
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
info = _get_article(full_link, session)
full_link = _list_link_to_full_link(link["link"])
info = _get_portal_article(full_link, session)

# Since the list is time-ordered, subsequent articles were posted more than an hour ago.

@@ -264,21 +284,25 @@ def _get_board_today(page_num):
)

article = Article(
parent_board_id=1,
parent_board_id=PORTAL_NOTICE_BOARD_ID,
title=info["title"],
content=info["content"],
content_text=info["content_text"],
created_by=user,
created_at=created_at_utc,
url=full_link,
latest_portal_view_count=info["view_count"],
)

new_articles.append(article)

prev_title = article.title

# Compare the last portal article in the DB with the earliest of the articles just crawled
if not new_articles:
log.info("no new articles")
return

earliest_new_article = new_articles[-1]
is_same_day = (
last_portal_article_in_db.created_at.date()
@@ -294,8 +318,21 @@ def _get_board_today(page_num):

created_articles = Article.objects.bulk_create(new_articles)

new_portal_view_counts = []

for article in created_articles:
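# record an initial view-count snapshot row for the newly created article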
portal_view_count = PortalViewCount(
article=article,
view_count=article.latest_portal_view_count,
)
new_portal_view_counts.append(portal_view_count)

PortalViewCount.objects.bulk_create(new_portal_view_counts)

for i in range(len(created_articles)):
print(f"crawled article: {created_articles[i].title}")
log.info(f"crawled article: {created_articles[i].title}")

log.info(f"created {len(created_articles)} articles")


def list_contains_article(articles, article_info):
@@ -327,18 +364,16 @@ def _get_board(page_num):
page_num = 1

while True:
print("page_num:", page_num)
log.info("page_num:", page_num)
links = []
link = _get_board(page_num)
if link:
links.extend(link)

with transaction.atomic():
for link in tqdm(links):
board_id = link.split("/")[-2]
num = link.split("/")[-1]
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
info = _get_article(full_link, session)
full_link = _list_link_to_full_link(link)
info = _get_portal_article(full_link, session)

user_exist = UserProfile.objects.filter(
nickname=info["writer"], is_newara=False
@@ -356,24 +391,126 @@ def _get_board(page_num):
picture="user_profiles/default_pictures/KAIST-logo.png",
)

a, created = Article.objects.get_or_create(
parent_board_id=1, # portal notice board
a, article_created = Article.objects.get_or_create(
parent_board_id=PORTAL_NOTICE_BOARD_ID, # portal notice board
title=info["title"],
content=info["content"],
content_text=info["content_text"],
created_by=user,
url=full_link,
)

if created:
if article_created:
a.created_at = info["created_at"]
a.save()

log.info(f"view_count: {info['view_count']}")

PortalViewCount.objects.update_or_create(
article=a,
view_count=info["view_count"],
)
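# with no defaults argument, both kwargs act as the lookup, so a new row is
# created only when this exact (article, view_count) pair is not yet recorded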

page_num += 1

else:
break


def crawl_view():
"""
update all portal_view_count of portal articles
from a week ago until now
"""
now = timezone.datetime.today().date()
log.info(f"crawl_view running on {now}")

week_ago = (
(datetime.today() - timedelta(days=7)).astimezone(KST).astimezone(timezone.utc)
)
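# the one-week cutoff is normalized to UTC so it can be compared against the
# created_at timestamps stored in the database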

session = _login_kaist_portal()

def _get_board_week(page_num):
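# fetch one list page; returns (rows, stop) where stop=True signals that a
# row older than week_ago was reached, so pagination should end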
board_req = session.get(
f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC",
cookies=COOKIES,
)
soup = bs(board_req.text, "lxml")
table = soup.select(".req_tbl_01")[0]
info_list_per_page = []

for row in table.find("tbody").find_all("tr"):
cells = row.find_all("td")
created_at_str = cells[4].text.strip()
created_at = (
datetime.strptime(created_at_str, "%Y.%m.%d")
.astimezone(KST)
.astimezone(timezone.utc)
)

if week_ago > created_at:
return info_list_per_page, True # stop

info = {
"title": cells[0].text.strip(),
"view_count": int(cells[3].text.strip()),
"link": cells[0].find("a").attrs["href"],
"created_at": created_at,
}

info_list_per_page.append(info)

return info_list_per_page, False

info_list = []
page_num = 1

while True:
info_list_per_page, stop = _get_board_week(page_num)
info_list.extend(info_list_per_page)
if stop:
break

page_num += 1

if len(info_list) == 0:
log.info("no portal notice in a week")
return

articles = Article.objects.filter(
created_at__gte=week_ago, parent_board_id=PORTAL_NOTICE_BOARD_ID
)
article_dict = {}

for a in articles:
article_dict[a.url] = a

new_portal_view_counts = []
updated_articles = []

for info in info_list:
full_link = _list_link_to_full_link(info["link"])

if full_link not in article_dict:
continue

article = article_dict[full_link]

portal_view_count = PortalViewCount(
article=article,
view_count=info["view_count"],
)

new_portal_view_counts.append(portal_view_count)

article.latest_portal_view_count = info["view_count"]
updated_articles.append(article)

Article.objects.bulk_update(updated_articles, ["latest_portal_view_count"])
PortalViewCount.objects.bulk_create(new_portal_view_counts)
log.info(f"crawled view count of {len(new_portal_view_counts)} portal notices")


if __name__ == "__main__":
_login_kaist_portal()
3 changes: 2 additions & 1 deletion apps/core/management/tasks.py
@@ -1,14 +1,15 @@
import time
from collections import defaultdict

from apps.core.management.scripts.portal_crawler import crawl_hour
from apps.core.management.scripts.portal_crawler import crawl_hour, crawl_view
from apps.core.management.scripts.reminder_email_for_reply import send_email
from apps.core.models import BestArticle
from ara import celery_app, redis


@celery_app.task
def crawl_portal():
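# first refresh view counts for the past week, then crawl newly posted notices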
crawl_view()
crawl_hour()

