Refactoring, styling and data output enhancements (#51)
jmyrberg authored May 23, 2020
1 parent 45b534f commit 036ddc0
Showing 23 changed files with 458 additions and 400 deletions.
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.0.1dev22
0.1.0a
24 changes: 16 additions & 8 deletions docs/source/finscraper.rst
@@ -20,14 +20,6 @@ finscraper.extensions module
:undoc-members:
:show-inheritance:

finscraper.http module
----------------------

.. automodule:: finscraper.http
:members:
:undoc-members:
:show-inheritance:

finscraper.middlewares module
-----------------------------

@@ -44,6 +36,14 @@ finscraper.pipelines module
:undoc-members:
:show-inheritance:

finscraper.request module
-------------------------

.. automodule:: finscraper.request
:members:
:undoc-members:
:show-inheritance:

finscraper.settings module
--------------------------

@@ -60,6 +60,14 @@ finscraper.spiders module
:undoc-members:
:show-inheritance:

finscraper.text\_utils module
-----------------------------

.. automodule:: finscraper.text_utils
:members:
:undoc-members:
:show-inheritance:

finscraper.utils module
-----------------------

4 changes: 2 additions & 2 deletions finscraper/extensions.py
@@ -6,7 +6,7 @@
from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured, CloseSpider
from scrapy.exceptions import NotConfigured

from tqdm.auto import tqdm

@@ -15,7 +15,7 @@

class ProgressBar:
"""Scrapy extension thay displays progress bar.
Enabled via ``PROGRESS_BAR_ENABLED`` Scrapy setting.
"""
def __init__(self, crawler):
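
As a quick illustration of the ``PROGRESS_BAR_ENABLED`` setting documented above, here is a minimal sketch (not part of this commit) of how the extension might be wired up in Scrapy settings; the registration entry and priority value are illustrative assumptions:

# Sketch only: enable the progress bar extension via Scrapy settings.
# The priority value 500 is an arbitrary illustrative choice.
SETTINGS = {
    'EXTENSIONS': {'finscraper.extensions.ProgressBar': 500},
    'PROGRESS_BAR_ENABLED': True,
}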
25 changes: 0 additions & 25 deletions finscraper/http.py

This file was deleted.

24 changes: 15 additions & 9 deletions finscraper/middlewares.py
@@ -3,20 +3,21 @@

import logging

import time

from scrapy import signals, Request
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse

from selenium.webdriver.chrome.options import Options

from finscraper.http import SeleniumCallbackRequest
from finscraper.request import SeleniumCallbackRequest
from finscraper.utils import get_chromedriver


class SeleniumCallbackMiddleware:
"""Middleware that processes request with given callback."""
"""Middleware that processes request with given callback.
Headless mode can be disabled via ``DISABLE_HEADLESS`` Scrapy setting.
"""

def __init__(self, settings):
self.settings = settings
@@ -32,23 +33,28 @@ def from_crawler(cls, crawler):

def spider_opened(self, spider):
options = Options()
options.add_argument("--headless")
if not self.settings.get('DISABLE_HEADLESS', False):
options.add_argument("--headless")
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
if self.settings.get('PROGRESS_BAR_ENABLED', True):
options.add_argument('--disable-logging')
for name in ['selenium.webdriver.remote.remote_connection',
'requests', 'urllib3']:
logging.getLogger(name).propagate = False
self.driver = get_chromedriver(options)
try:
self.driver = get_chromedriver(options)
except Exception:
raise NotConfigured('Could not get chromedriver')

def spider_closed(self, spider):
self.driver.close()
if hasattr(self, 'driver'):
self.driver.close()

def process_request(self, request, spider):
if not isinstance(request, SeleniumCallbackRequest):
return None

selenium_callback = request.meta.get('selenium_callback')
if selenium_callback is None:
self.driver.get(request.url)
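
To show how the new ``DISABLE_HEADLESS`` flag fits alongside the middleware, a minimal settings sketch (not part of this commit); the downloader-middleware registration and its priority are assumptions for illustration:

# Sketch only: run Chrome with a visible window instead of headless mode.
# Registering the middleware here is an assumption; the priority 800 is illustrative.
SETTINGS = {
    'DOWNLOADER_MIDDLEWARES': {
        'finscraper.middlewares.SeleniumCallbackMiddleware': 800,
    },
    'DISABLE_HEADLESS': True,
}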
23 changes: 23 additions & 0 deletions finscraper/request.py
@@ -0,0 +1,23 @@
"""Module for custom Scrapy request components."""


from scrapy import Request


class SeleniumCallbackRequest(Request):
"""Process request with given callback using Selenium.
Args:
selenium_callback (func or None, optional): Function that will be
called with the chrome webdriver. The function should take in
parameters (request, spider, driver) and return request, response
or None. If None, driver will be used for fetching the page, and
return is response. Defaults to None.
"""

def __init__(self, *args, selenium_callback=None, **kwargs):
meta = kwargs.pop('meta', {}) or {}
if 'selenium_callback' not in meta:
meta['selenium_callback'] = selenium_callback
new_kwargs = dict(**kwargs, meta=meta)
super(SeleniumCallbackRequest, self).__init__(*args, **new_kwargs)
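
For readers new to this request type, a short usage sketch based on the docstring above: the (request, spider, driver) callback signature and the default fetch-with-driver behavior come from the class itself, while the spider, callback body and URLs are hypothetical:

# Hypothetical usage of SeleniumCallbackRequest with SeleniumCallbackMiddleware.
from scrapy import Spider
from scrapy.http import HtmlResponse

from finscraper.request import SeleniumCallbackRequest


def scroll_to_bottom(request, spider, driver):
    # Invoked by the middleware with the shared Chrome webdriver.
    driver.get(request.url)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    return HtmlResponse(
        driver.current_url,
        body=driver.page_source.encode('utf-8'),
        encoding='utf-8',
        request=request)


class _ExampleSpider(Spider):
    name = 'example'

    def start_requests(self):
        # With selenium_callback=None, the driver simply fetches the page.
        yield SeleniumCallbackRequest('https://example.com', callback=self.parse)
        # With a callback, the middleware hands control of the browser to it.
        yield SeleniumCallbackRequest(
            'https://example.com/feed',
            selenium_callback=scroll_to_bottom,
            callback=self.parse)

    def parse(self, response):
        yield {'url': response.url}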
91 changes: 56 additions & 35 deletions finscraper/scrapy_spiders/demipage.py
@@ -3,19 +3,18 @@

import time

from functools import partial

from scrapy import Item, Field, Selector
from scrapy.crawler import Spider
from scrapy.exceptions import DropItem
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import TakeFirst, Identity, MapCompose, Compose
from scrapy.http import HtmlResponse

from selenium.webdriver.support.wait import WebDriverWait

from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin
from finscraper.utils import strip_join, safe_cast_int, strip_elements, \
drop_empty_elements
from finscraper.text_utils import strip_join, safe_cast_int, strip_elements, \
drop_empty_elements, paragraph_join


class _DemiPageSpider(FollowAndParseItemMixin, Spider):
Expand All @@ -24,14 +23,14 @@ class _DemiPageSpider(FollowAndParseItemMixin, Spider):
follow_link_extractor = LinkExtractor(
allow_domains=('demi.fi'),
allow=(r'.*\/keskustelu[t]*\/.*'),
deny=('\?'),
deny=(r'\?'),
deny_domains=(),
canonicalize=True
)
item_link_extractor = LinkExtractor(
allow_domains=('demi.fi'),
allow=(rf'.*/keskustelu/[A-z0-9\-]+'),
deny=('\?'),
deny=(r'\?'),
deny_domains=(),
restrict_xpaths=['//div[contains(@class, "threadItem")]'],
canonicalize=True
@@ -44,44 +43,66 @@ class _DemiPageSpider(FollowAndParseItemMixin, Spider):

def __init__(self, *args, **kwargs):
"""Fetch comments from demi.fi.
Args:
"""
kwargs['items_selenium_callback'] = None # Enable JS for items
kwargs['items_selenium_callback'] = self._wait_item_page
super(_DemiPageSpider, self).__init__(*args, **kwargs)

@staticmethod
def _wait_item_page(request, spider, driver):
# Wait until number of comments corresponds to numbering
driver.get(request.url)
reply_xpath = '//div[contains(@class, "__reply__")]'
numbering_xpath = (
f'{reply_xpath}//div[contains(@class, "replyNumbering")]')
numbering = driver.find_element_by_xpath(numbering_xpath)
try:
n_comments = int(numbering.text.split('/')[-1])
except Exception:
n_comments = 0
(WebDriverWait(driver, 2, 0.1).until(
lambda d: len(d.find_elements_by_xpath(reply_xpath)) >= n_comments))
return HtmlResponse(
driver.current_url,
body=driver.page_source.encode('utf-8'),
encoding='utf-8',
request=request
)

def _parse_comment(self, comment):
l = ItemLoader(item=_DemiCommentItem(), selector=comment)
l.add_xpath('author',
il = ItemLoader(item=_DemiCommentItem(), selector=comment)
il.add_xpath(
'author',
'//span[contains(@class, "discussionItemAuthor")]//text()')
l.add_xpath('date',
'//span[contains(@class, "__time__")]//text()')
l.add_xpath('quotes', '//blockquote//text()')
l.add_xpath('content', '//p//text()')
l.add_xpath('numbering',
'//div[contains(@class, "replyNumbering")]//text()')
l.add_xpath('likes', '//span[contains(@class, "LikeCount")]//text()')
return l.load_item()

il.add_xpath('date', '//span[contains(@class, "__time__")]//text()')
il.add_xpath('quotes', '//blockquote//text()')
il.add_xpath('content', '//p//text()')
il.add_xpath(
'numbering', '//div[contains(@class, "replyNumbering")]//text()')
il.add_xpath('likes', '//span[contains(@class, "LikeCount")]//text()')
return il.load_item()

def _parse_item(self, resp):
l = ItemLoader(item=_DemiPageItem(), response=resp)
l.add_value('url', resp.url)
l.add_value('time', int(time.time()))
first_reply = l.nested_xpath(
il = ItemLoader(item=_DemiPageItem(), response=resp)
il.add_value('url', resp.url)
il.add_value('time', int(time.time()))
first_reply = il.nested_xpath(
'//div[contains(@class, "firstReplyContainer")]')
first_reply.add_xpath('title',
'//div[contains(@class, "__title__")]//text()')
first_reply.add_xpath('published',
'//span[contains(@class, "__time__")]//text()')
l.add_xpath('author',
first_reply.add_xpath(
'title', '//div[contains(@class, "__title__")]//text()')
first_reply.add_xpath(
'published', '//span[contains(@class, "__time__")]//text()')
il.add_xpath(
'author',
'//span[contains(@class, "discussionItemAuthor")]//text()')

comments = []
comment_xpath = '//div[contains(@class, "__reply__")]'
for comment in resp.xpath(comment_xpath):
comments.append(self._parse_comment(Selector(text=comment.get())))
l.add_value('comments', comments)
return l.load_item()
il.add_value('comments', comments)
return il.load_item()


class _DemiCommentItem(Item):
@@ -107,7 +128,7 @@ class _DemiCommentItem(Item):
output_processor=Identity()
)
content = Field(
input_processor=strip_join,
input_processor=paragraph_join,
output_processor=TakeFirst()
)
numbering = Field(