Refactoring, styling and data output enhancements (#51)
jmyrberg authored May 23, 2020
1 parent 45b534f commit 036ddc0
Showing 23 changed files with 458 additions and 400 deletions.
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.0.1dev22
0.1.0a
24 changes: 16 additions & 8 deletions docs/source/finscraper.rst
@@ -20,14 +20,6 @@ finscraper.extensions module
:undoc-members:
:show-inheritance:

finscraper.http module
----------------------

.. automodule:: finscraper.http
:members:
:undoc-members:
:show-inheritance:

finscraper.middlewares module
-----------------------------

@@ -44,6 +36,14 @@ finscraper.pipelines module
:undoc-members:
:show-inheritance:

finscraper.request module
-------------------------

.. automodule:: finscraper.request
:members:
:undoc-members:
:show-inheritance:

finscraper.settings module
--------------------------

@@ -60,6 +60,14 @@ finscraper.spiders module
:undoc-members:
:show-inheritance:

finscraper.text\_utils module
-----------------------------

.. automodule:: finscraper.text_utils
:members:
:undoc-members:
:show-inheritance:

finscraper.utils module
-----------------------

4 changes: 2 additions & 2 deletions finscraper/extensions.py
@@ -6,7 +6,7 @@
from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured, CloseSpider
from scrapy.exceptions import NotConfigured

from tqdm.auto import tqdm

@@ -15,7 +15,7 @@

class ProgressBar:
"""Scrapy extension thay displays progress bar.
Enabled via ``PROGRESS_BAR_ENABLED`` Scrapy setting.
"""
def __init__(self, crawler):
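
As a quick illustration of the ``PROGRESS_BAR_ENABLED`` setting documented above, here is a minimal sketch (not part of this commit) of how the extension might be wired up in Scrapy settings; the registration entry and priority value are illustrative assumptions:

# Sketch only: enable the progress bar extension via Scrapy settings.
# The priority value 500 is an arbitrary illustrative choice.
SETTINGS = {
    'EXTENSIONS': {'finscraper.extensions.ProgressBar': 500},
    'PROGRESS_BAR_ENABLED': True,
}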
25 changes: 0 additions & 25 deletions finscraper/http.py

This file was deleted.

24 changes: 15 additions & 9 deletions finscraper/middlewares.py
@@ -3,20 +3,21 @@

import logging

import time

from scrapy import signals, Request
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse

from selenium.webdriver.chrome.options import Options

from finscraper.http import SeleniumCallbackRequest
from finscraper.request import SeleniumCallbackRequest
from finscraper.utils import get_chromedriver


class SeleniumCallbackMiddleware:
"""Middleware that processes request with given callback."""
"""Middleware that processes request with given callback.
Headless mode can be disabled via ``DISABLE_HEADLESS`` Scrapy setting.
"""

def __init__(self, settings):
self.settings = settings
@@ -32,23 +33,28 @@ def from_crawler(cls, crawler):

def spider_opened(self, spider):
options = Options()
options.add_argument("--headless")
if not self.settings.get('DISABLE_HEADLESS', False):
options.add_argument("--headless")
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
if self.settings.get('PROGRESS_BAR_ENABLED', True):
options.add_argument('--disable-logging')
for name in ['selenium.webdriver.remote.remote_connection',
'requests', 'urllib3']:
logging.getLogger(name).propagate = False
self.driver = get_chromedriver(options)
try:
self.driver = get_chromedriver(options)
except Exception:
raise NotConfigured('Could not get chromedriver')

def spider_closed(self, spider):
self.driver.close()
if hasattr(self, 'driver'):
self.driver.close()

def process_request(self, request, spider):
if not isinstance(request, SeleniumCallbackRequest):
return None

selenium_callback = request.meta.get('selenium_callback')
if selenium_callback is None:
self.driver.get(request.url)
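
To show how the new ``DISABLE_HEADLESS`` flag fits alongside the middleware, a minimal settings sketch (not part of this commit); the downloader-middleware registration and its priority are assumptions for illustration:

# Sketch only: run Chrome with a visible window instead of headless mode.
# Registering the middleware here is an assumption; the priority 800 is illustrative.
SETTINGS = {
    'DOWNLOADER_MIDDLEWARES': {
        'finscraper.middlewares.SeleniumCallbackMiddleware': 800,
    },
    'DISABLE_HEADLESS': True,
}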
23 changes: 23 additions & 0 deletions finscraper/request.py
@@ -0,0 +1,23 @@
"""Module for custom Scrapy request components."""


from scrapy import Request


class SeleniumCallbackRequest(Request):
"""Process request with given callback using Selenium.
Args:
selenium_callback (func or None, optional): Function that will be
called with the chrome webdriver. The function should take in
parameters (request, spider, driver) and return request, response
or None. If None, driver will be used for fetching the page, and
return is response. Defaults to None.
"""

def __init__(self, *args, selenium_callback=None, **kwargs):
meta = kwargs.pop('meta', {}) or {}
if 'selenium_callback' not in meta:
meta['selenium_callback'] = selenium_callback
new_kwargs = dict(**kwargs, meta=meta)
super(SeleniumCallbackRequest, self).__init__(*args, **new_kwargs)
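
For readers new to this request type, a short usage sketch based on the docstring above: the (request, spider, driver) callback signature and the default fetch-with-driver behavior come from the class itself, while the spider, callback body and URLs are hypothetical:

# Hypothetical usage of SeleniumCallbackRequest with SeleniumCallbackMiddleware.
from scrapy import Spider
from scrapy.http import HtmlResponse

from finscraper.request import SeleniumCallbackRequest


def scroll_to_bottom(request, spider, driver):
    # Invoked by the middleware with the shared Chrome webdriver.
    driver.get(request.url)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    return HtmlResponse(
        driver.current_url,
        body=driver.page_source.encode('utf-8'),
        encoding='utf-8',
        request=request)


class _ExampleSpider(Spider):
    name = 'example'

    def start_requests(self):
        # With selenium_callback=None, the driver simply fetches the page.
        yield SeleniumCallbackRequest('https://example.com', callback=self.parse)
        # With a callback, the middleware hands control of the browser to it.
        yield SeleniumCallbackRequest(
            'https://example.com/feed',
            selenium_callback=scroll_to_bottom,
            callback=self.parse)

    def parse(self, response):
        yield {'url': response.url}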
91 changes: 56 additions & 35 deletions finscraper/scrapy_spiders/demipage.py
@@ -3,19 +3,18 @@

import time

from functools import partial

from scrapy import Item, Field, Selector
from scrapy.crawler import Spider
from scrapy.exceptions import DropItem
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import TakeFirst, Identity, MapCompose, Compose
from scrapy.http import HtmlResponse

from selenium.webdriver.support.wait import WebDriverWait

from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin
from finscraper.utils import strip_join, safe_cast_int, strip_elements, \
drop_empty_elements
from finscraper.text_utils import strip_join, safe_cast_int, strip_elements, \
drop_empty_elements, paragraph_join


class _DemiPageSpider(FollowAndParseItemMixin, Spider):
Expand All @@ -24,14 +23,14 @@ class _DemiPageSpider(FollowAndParseItemMixin, Spider):
follow_link_extractor = LinkExtractor(
allow_domains=('demi.fi'),
allow=(r'.*\/keskustelu[t]*\/.*'),
deny=('\?'),
deny=(r'\?'),
deny_domains=(),
canonicalize=True
)
item_link_extractor = LinkExtractor(
allow_domains=('demi.fi'),
allow=(rf'.*/keskustelu/[A-z0-9\-]+'),
deny=('\?'),
deny=(r'\?'),
deny_domains=(),
restrict_xpaths=['//div[contains(@class, "threadItem")]'],
canonicalize=True
@@ -44,44 +43,66 @@ class _DemiPageSpider(FollowAndParseItemMixin, Spider):

def __init__(self, *args, **kwargs):
"""Fetch comments from demi.fi.
Args:
"""
kwargs['items_selenium_callback'] = None # Enable JS for items
kwargs['items_selenium_callback'] = self._wait_item_page
super(_DemiPageSpider, self).__init__(*args, **kwargs)

@staticmethod
def _wait_item_page(request, spider, driver):
# Wait until number of comments corresponds to numbering
driver.get(request.url)
reply_xpath = '//div[contains(@class, "__reply__")]'
numbering_xpath = (
f'{reply_xpath}//div[contains(@class, "replyNumbering")]')
numbering = driver.find_element_by_xpath(numbering_xpath)
try:
n_comments = int(numbering.text.split('/')[-1])
except Exception:
n_comments = 0
(WebDriverWait(driver, 2, 0.1).until(
lambda d: len(d.find_elements_by_xpath(reply_xpath)) >= n_comments))
return HtmlResponse(
driver.current_url,
body=driver.page_source.encode('utf-8'),
encoding='utf-8',
request=request
)

def _parse_comment(self, comment):
l = ItemLoader(item=_DemiCommentItem(), selector=comment)
l.add_xpath('author',
il = ItemLoader(item=_DemiCommentItem(), selector=comment)
il.add_xpath(
'author',
'//span[contains(@class, "discussionItemAuthor")]//text()')
l.add_xpath('date',
'//span[contains(@class, "__time__")]//text()')
l.add_xpath('quotes', '//blockquote//text()')
l.add_xpath('content', '//p//text()')
l.add_xpath('numbering',
'//div[contains(@class, "replyNumbering")]//text()')
l.add_xpath('likes', '//span[contains(@class, "LikeCount")]//text()')
return l.load_item()

il.add_xpath('date', '//span[contains(@class, "__time__")]//text()')
il.add_xpath('quotes', '//blockquote//text()')
il.add_xpath('content', '//p//text()')
il.add_xpath(
'numbering', '//div[contains(@class, "replyNumbering")]//text()')
il.add_xpath('likes', '//span[contains(@class, "LikeCount")]//text()')
return il.load_item()

def _parse_item(self, resp):
l = ItemLoader(item=_DemiPageItem(), response=resp)
l.add_value('url', resp.url)
l.add_value('time', int(time.time()))
first_reply = l.nested_xpath(
il = ItemLoader(item=_DemiPageItem(), response=resp)
il.add_value('url', resp.url)
il.add_value('time', int(time.time()))
first_reply = il.nested_xpath(
'//div[contains(@class, "firstReplyContainer")]')
first_reply.add_xpath('title',
'//div[contains(@class, "__title__")]//text()')
first_reply.add_xpath('published',
'//span[contains(@class, "__time__")]//text()')
l.add_xpath('author',
first_reply.add_xpath(
'title', '//div[contains(@class, "__title__")]//text()')
first_reply.add_xpath(
'published', '//span[contains(@class, "__time__")]//text()')
il.add_xpath(
'author',
'//span[contains(@class, "discussionItemAuthor")]//text()')

comments = []
comment_xpath = '//div[contains(@class, "__reply__")]'
for comment in resp.xpath(comment_xpath):
comments.append(self._parse_comment(Selector(text=comment.get())))
l.add_value('comments', comments)
return l.load_item()
il.add_value('comments', comments)
return il.load_item()


class _DemiCommentItem(Item):
@@ -107,7 +128,7 @@ class _DemiCommentItem(Item):
output_processor=Identity()
)
content = Field(
input_processor=strip_join,
input_processor=paragraph_join,
output_processor=TakeFirst()
)
numbering = Field(