diff --git a/VERSION b/VERSION index 7367c91..2e17582 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.0.1dev22 \ No newline at end of file +0.1.0a \ No newline at end of file diff --git a/docs/source/finscraper.rst b/docs/source/finscraper.rst index d420162..65ab01d 100644 --- a/docs/source/finscraper.rst +++ b/docs/source/finscraper.rst @@ -20,14 +20,6 @@ finscraper.extensions module :undoc-members: :show-inheritance: -finscraper.http module ----------------------- - -.. automodule:: finscraper.http - :members: - :undoc-members: - :show-inheritance: - finscraper.middlewares module ----------------------------- @@ -44,6 +36,14 @@ finscraper.pipelines module :undoc-members: :show-inheritance: +finscraper.request module +------------------------- + +.. automodule:: finscraper.request + :members: + :undoc-members: + :show-inheritance: + finscraper.settings module -------------------------- @@ -60,6 +60,14 @@ finscraper.spiders module :undoc-members: :show-inheritance: +finscraper.text\_utils module +----------------------------- + +.. automodule:: finscraper.text_utils + :members: + :undoc-members: + :show-inheritance: + finscraper.utils module ----------------------- diff --git a/finscraper/extensions.py b/finscraper/extensions.py index bed5d5c..fee1337 100644 --- a/finscraper/extensions.py +++ b/finscraper/extensions.py @@ -6,7 +6,7 @@ from collections import defaultdict from scrapy import signals -from scrapy.exceptions import NotConfigured, CloseSpider +from scrapy.exceptions import NotConfigured from tqdm.auto import tqdm @@ -15,7 +15,7 @@ class ProgressBar: """Scrapy extension thay displays progress bar. - + Enabled via ``PROGRESS_BAR_ENABLED`` Scrapy setting. """ def __init__(self, crawler): diff --git a/finscraper/http.py b/finscraper/http.py deleted file mode 100644 index 90e7473..0000000 --- a/finscraper/http.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Module for custom Scrapy HTTP components.""" - - -from scrapy import Request - - -class SeleniumCallbackRequest(Request): - """Process request with given callback using Selenium. - - Args: - selenium_callback (func or None): Function that will be called with the - chrome webdriver. The function should take in parameters - (request, spider, driver) and return request, response or None. - If None, driver will be used for fetching the page, and return is - response. Defaults to None. - """ - - def __init__(self, *args, selenium_callback=None, **kwargs): - #if not callable(selenium_callback): - # raise ValueError('`callback` must be a function!') - meta = kwargs.pop('meta', {}) or {} - if 'selenium_callback' not in meta: - meta['selenium_callback'] = selenium_callback - new_kwargs = dict(**kwargs, meta=meta) - super(SeleniumCallbackRequest, self).__init__(*args, **new_kwargs) diff --git a/finscraper/middlewares.py b/finscraper/middlewares.py index f80a26b..5630c92 100644 --- a/finscraper/middlewares.py +++ b/finscraper/middlewares.py @@ -3,20 +3,21 @@ import logging -import time - -from scrapy import signals, Request +from scrapy import signals from scrapy.exceptions import NotConfigured from scrapy.http import HtmlResponse from selenium.webdriver.chrome.options import Options -from finscraper.http import SeleniumCallbackRequest +from finscraper.request import SeleniumCallbackRequest from finscraper.utils import get_chromedriver class SeleniumCallbackMiddleware: - """Middleware that processes request with given callback.""" + """Middleware that processes request with given callback. 
+ + Headless mode can be disabled via ``DISABLE_HEADLESS`` Scrapy setting. + """ def __init__(self, settings): self.settings = settings @@ -32,7 +33,8 @@ def from_crawler(cls, crawler): def spider_opened(self, spider): options = Options() - options.add_argument("--headless") + if not self.settings.get('DISABLE_HEADLESS', False): + options.add_argument("--headless") options.add_argument("--disable-extensions") options.add_argument("--disable-gpu") if self.settings.get('PROGRESS_BAR_ENABLED', True): @@ -40,15 +42,19 @@ def spider_opened(self, spider): for name in ['selenium.webdriver.remote.remote_connection', 'requests', 'urllib3']: logging.getLogger(name).propagate = False - self.driver = get_chromedriver(options) + try: + self.driver = get_chromedriver(options) + except Exception: + raise NotConfigured('Could not get chromedriver') def spider_closed(self, spider): - self.driver.close() + if hasattr(self, 'driver'): + self.driver.close() def process_request(self, request, spider): if not isinstance(request, SeleniumCallbackRequest): return None - + selenium_callback = request.meta.get('selenium_callback') if selenium_callback is None: self.driver.get(request.url) diff --git a/finscraper/request.py b/finscraper/request.py new file mode 100644 index 0000000..41528f1 --- /dev/null +++ b/finscraper/request.py @@ -0,0 +1,23 @@ +"""Module for custom Scrapy request components.""" + + +from scrapy import Request + + +class SeleniumCallbackRequest(Request): + """Process request with given callback using Selenium. + + Args: + selenium_callback (func or None, optional): Function that will be + called with the chrome webdriver. The function should take in + parameters (request, spider, driver) and return request, response + or None. If None, driver will be used for fetching the page, and + return is response. Defaults to None. 
+ """ + + def __init__(self, *args, selenium_callback=None, **kwargs): + meta = kwargs.pop('meta', {}) or {} + if 'selenium_callback' not in meta: + meta['selenium_callback'] = selenium_callback + new_kwargs = dict(**kwargs, meta=meta) + super(SeleniumCallbackRequest, self).__init__(*args, **new_kwargs) diff --git a/finscraper/scrapy_spiders/demipage.py b/finscraper/scrapy_spiders/demipage.py index ffad9ba..0ef9603 100644 --- a/finscraper/scrapy_spiders/demipage.py +++ b/finscraper/scrapy_spiders/demipage.py @@ -3,19 +3,18 @@ import time -from functools import partial - from scrapy import Item, Field, Selector from scrapy.crawler import Spider -from scrapy.exceptions import DropItem -from scrapy.http import HtmlResponse -from scrapy.linkextractors import LinkExtractor from scrapy.loader import ItemLoader +from scrapy.linkextractors import LinkExtractor from scrapy.loader.processors import TakeFirst, Identity, MapCompose, Compose +from scrapy.http import HtmlResponse + +from selenium.webdriver.support.wait import WebDriverWait from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin -from finscraper.utils import strip_join, safe_cast_int, strip_elements, \ - drop_empty_elements +from finscraper.text_utils import strip_join, safe_cast_int, strip_elements, \ + drop_empty_elements, paragraph_join class _DemiPageSpider(FollowAndParseItemMixin, Spider): @@ -24,14 +23,14 @@ class _DemiPageSpider(FollowAndParseItemMixin, Spider): follow_link_extractor = LinkExtractor( allow_domains=('demi.fi'), allow=(r'.*\/keskustelu[t]*\/.*'), - deny=('\?'), + deny=(r'\?'), deny_domains=(), canonicalize=True ) item_link_extractor = LinkExtractor( allow_domains=('demi.fi'), allow=(rf'.*/keskustelu/[A-z0-9\-]+'), - deny=('\?'), + deny=(r'\?'), deny_domains=(), restrict_xpaths=['//div[contains(@class, "threadItem")]'], canonicalize=True @@ -44,44 +43,66 @@ class _DemiPageSpider(FollowAndParseItemMixin, Spider): def __init__(self, *args, **kwargs): """Fetch comments from demi.fi. 
- + Args: """ - kwargs['items_selenium_callback'] = None # Enable JS for items + kwargs['items_selenium_callback'] = self._wait_item_page super(_DemiPageSpider, self).__init__(*args, **kwargs) + @staticmethod + def _wait_item_page(request, spider, driver): + # Wait until number of comments corresponds to numbering + driver.get(request.url) + reply_xpath = '//div[contains(@class, "__reply__")]' + numbering_xpath = ( + f'{reply_xpath}//div[contains(@class, "replyNumbering")]') + numbering = driver.find_element_by_xpath(numbering_xpath) + try: + n_comments = int(numbering.text.split('/')[-1]) + except Exception: + n_comments = 0 + (WebDriverWait(driver, 2, 0.1).until( + lambda d: len(d.find_elements_by_xpath(reply_xpath)) >= n_comments)) + return HtmlResponse( + driver.current_url, + body=driver.page_source.encode('utf-8'), + encoding='utf-8', + request=request + ) + def _parse_comment(self, comment): - l = ItemLoader(item=_DemiCommentItem(), selector=comment) - l.add_xpath('author', + il = ItemLoader(item=_DemiCommentItem(), selector=comment) + il.add_xpath( + 'author', '//span[contains(@class, "discussionItemAuthor")]//text()') - l.add_xpath('date', - '//span[contains(@class, "__time__")]//text()') - l.add_xpath('quotes', '//blockquote//text()') - l.add_xpath('content', '//p//text()') - l.add_xpath('numbering', - '//div[contains(@class, "replyNumbering")]//text()') - l.add_xpath('likes', '//span[contains(@class, "LikeCount")]//text()') - return l.load_item() - + il.add_xpath('date', '//span[contains(@class, "__time__")]//text()') + il.add_xpath('quotes', '//blockquote//text()') + il.add_xpath('content', '//p//text()') + il.add_xpath( + 'numbering', '//div[contains(@class, "replyNumbering")]//text()') + il.add_xpath('likes', '//span[contains(@class, "LikeCount")]//text()') + return il.load_item() + def _parse_item(self, resp): - l = ItemLoader(item=_DemiPageItem(), response=resp) - l.add_value('url', resp.url) - l.add_value('time', int(time.time())) - first_reply = l.nested_xpath( + il = ItemLoader(item=_DemiPageItem(), response=resp) + il.add_value('url', resp.url) + il.add_value('time', int(time.time())) + first_reply = il.nested_xpath( '//div[contains(@class, "firstReplyContainer")]') - first_reply.add_xpath('title', - '//div[contains(@class, "__title__")]//text()') - first_reply.add_xpath('published', - '//span[contains(@class, "__time__")]//text()') - l.add_xpath('author', + first_reply.add_xpath( + 'title', '//div[contains(@class, "__title__")]//text()') + first_reply.add_xpath( + 'published', '//span[contains(@class, "__time__")]//text()') + il.add_xpath( + 'author', '//span[contains(@class, "discussionItemAuthor")]//text()') - + comments = [] comment_xpath = '//div[contains(@class, "__reply__")]' for comment in resp.xpath(comment_xpath): comments.append(self._parse_comment(Selector(text=comment.get()))) - l.add_value('comments', comments) - return l.load_item() + il.add_value('comments', comments) + return il.load_item() class _DemiCommentItem(Item): @@ -107,7 +128,7 @@ class _DemiCommentItem(Item): output_processor=Identity() ) content = Field( - input_processor=strip_join, + input_processor=paragraph_join, output_processor=TakeFirst() ) numbering = Field( diff --git a/finscraper/scrapy_spiders/ilarticle.py b/finscraper/scrapy_spiders/ilarticle.py index b685e51..692dfa7 100644 --- a/finscraper/scrapy_spiders/ilarticle.py +++ b/finscraper/scrapy_spiders/ilarticle.py @@ -3,6 +3,8 @@ import time +from functools import partial + from scrapy import Item, Field, Selector from 
scrapy.crawler import Spider from scrapy.linkextractors import LinkExtractor @@ -10,7 +12,7 @@ from scrapy.loader.processors import TakeFirst, Identity, MapCompose from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin -from finscraper.utils import strip_join +from finscraper.text_utils import strip_join, paragraph_join class _ILArticleSpider(FollowAndParseItemMixin, Spider): @@ -34,7 +36,7 @@ class _ILArticleSpider(FollowAndParseItemMixin, Spider): def __init__(self, *args, **kwargs): """Fetch Iltalehti news articles. - + Args: """ super(_ILArticleSpider, self).__init__(*args, **kwargs) @@ -48,24 +50,31 @@ def _get_image_metadata(text): 'caption': sel.xpath( '//div[contains(@class, "media-caption")]//text()').getall() } - + def _parse_item(self, resp): - l = ItemLoader(item=_ILArticleItem(), response=resp) - l.add_value('url', resp.url) - l.add_value('time', int(time.time())) - l.add_xpath('title', + il = ItemLoader(item=_ILArticleItem(), response=resp) + il.add_value('url', resp.url) + il.add_value('time', int(time.time())) + il.add_xpath( + 'title', '//article//h1[contains(@class, "article-headline")]//text()') - l.add_xpath('ingress', + il.add_xpath( + 'ingress', '//article//div[contains(@class, "article-description")]//text()') - l.add_xpath('content', - '//article//div[contains(@class, "article-body")]//text()') - l.add_xpath('published', - '//time//text()') - l.add_xpath('author', + + pgraphs_xpath = '//article//div[contains(@class, "article-body")]//p' + content = [''.join(Selector(text=pgraph).xpath('//text()').getall()) + for pgraph in resp.xpath(pgraphs_xpath).getall()] + il.add_value('content', content) + + il.add_xpath('published', '//time//text()') + il.add_xpath( + 'author', '//article//div[contains(@class, "author-name")]//text()') - l.add_xpath('images', + il.add_xpath( + 'images', '//article//div[contains(@class, "article-image")]') - return l.load_item() + return il.load_item() class _ILArticleItem(Item): @@ -97,7 +106,7 @@ class _ILArticleItem(Item): output_processor=TakeFirst() ) content = Field( - input_processor=strip_join, + input_processor=paragraph_join, output_processor=TakeFirst() ) published = Field( diff --git a/finscraper/scrapy_spiders/isarticle.py b/finscraper/scrapy_spiders/isarticle.py index 7f6e815..ab3ae13 100644 --- a/finscraper/scrapy_spiders/isarticle.py +++ b/finscraper/scrapy_spiders/isarticle.py @@ -3,6 +3,8 @@ import time +from functools import partial + from scrapy import Item, Field, Selector from scrapy.crawler import Spider from scrapy.linkextractors import LinkExtractor @@ -10,7 +12,7 @@ from scrapy.loader.processors import TakeFirst, Identity, MapCompose from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin -from finscraper.utils import strip_join +from finscraper.text_utils import strip_join, paragraph_join class _ISArticleSpider(FollowAndParseItemMixin, Spider): @@ -32,9 +34,10 @@ class _ISArticleSpider(FollowAndParseItemMixin, Spider): deny_domains=('ravit.is.fi'), canonicalize=True ) + def __init__(self, *args, **kwargs): """Fetch IltaSanomat news articles. 
- + Args: """ super(_ISArticleSpider, self).__init__(*args, **kwargs) @@ -47,23 +50,28 @@ def _get_image_metadata(text): 'alt': sel.xpath('//img//@alt').get(), 'caption': strip_join(sel.xpath('//p//text()').getall()) } - + def _parse_item(self, resp): - l = ItemLoader(item=_ISArticleItem(), response=resp) - l.add_value('url', resp.url) - l.add_value('time', int(time.time())) - l.add_xpath('title', '//article//h1//text()') - l.add_xpath('ingress', + il = ItemLoader(item=_ISArticleItem(), response=resp) + il.add_value('url', resp.url) + il.add_value('time', int(time.time())) + il.add_xpath('title', '//article//h1//text()') + il.add_xpath( + 'ingress', '//section//article//p[contains(@class, "ingress")]//text()') - l.add_xpath('content', + il.add_xpath( + 'content', '//article//p[contains(@class, "body")]//text()') - l.add_xpath('published', + il.add_xpath( + 'published', '//article//div[contains(@class, "timestamp")]//text()') - l.add_xpath('author', + il.add_xpath( + 'author', '//article//div[contains(@itemprop, "author")]//text()') - l.add_xpath('images', + il.add_xpath( + 'images', '//section//article//div[contains(@class, "clearing-container")]') - return l.load_item() + return il.load_item() class _ISArticleItem(Item): @@ -95,7 +103,7 @@ class _ISArticleItem(Item): output_processor=TakeFirst() ) content = Field( - input_processor=strip_join, + input_processor=paragraph_join, output_processor=TakeFirst() ) published = Field( diff --git a/finscraper/scrapy_spiders/mixins.py b/finscraper/scrapy_spiders/mixins.py index 12996e9..df6a670 100644 --- a/finscraper/scrapy_spiders/mixins.py +++ b/finscraper/scrapy_spiders/mixins.py @@ -4,12 +4,12 @@ from scrapy import Request from scrapy.exceptions import CloseSpider -from finscraper.http import SeleniumCallbackRequest +from finscraper.request import SeleniumCallbackRequest class FollowAndParseItemMixin: """Parse items and follow links based on defined link extractors. - + The following needs to be defined when inheriting: 1) ``item_link_extractor`` -attribute: LinkExtractor that defines \ the links to parse items from. @@ -37,6 +37,7 @@ class FollowAndParseItemMixin: AttributeError, if required attributes not defined when inheriting. 
""" itemcount = 0 + def __init__(self, follow_meta=None, items_meta=None, follow_selenium_callback=False, items_selenium_callback=False): @@ -45,8 +46,8 @@ def __init__(self, follow_meta=None, items_meta=None, self.follow_selenium_callback = follow_selenium_callback self.items_selenium_callback = items_selenium_callback - self._follow_selenium = not (self.follow_selenium_callback == False) - self._items_selenium = not (self.items_selenium_callback == False) + self._follow_selenium = not (self.follow_selenium_callback is False) + self._items_selenium = not (self.items_selenium_callback is False) for attr in ['follow_link_extractor', 'item_link_extractor']: if not hasattr(self, attr): @@ -61,9 +62,9 @@ def start_requests(self): def parse(self, resp, to_parse=False): """Parse items and follow links based on defined link extractors.""" - if (self.itemcount and - self.itemcount == self.settings.get('CLOSESPIDER_ITEMCOUNT', 0)): - raise CloseSpider + max_itemcount = self.settings.get('CLOSESPIDER_ITEMCOUNT', 0) + if self.itemcount and self.itemcount == max_itemcount: + raise CloseSpider if to_parse: yield self._parse_item(resp) diff --git a/finscraper/scrapy_spiders/oikotieapartment.py b/finscraper/scrapy_spiders/oikotieapartment.py index a606819..f133cd9 100644 --- a/finscraper/scrapy_spiders/oikotieapartment.py +++ b/finscraper/scrapy_spiders/oikotieapartment.py @@ -3,21 +3,17 @@ import time -from functools import partial - -from scrapy import Item, Field, Selector, Request +from scrapy import Item, Field, Request from scrapy.crawler import Spider -from scrapy.exceptions import DropItem, CloseSpider +from scrapy.exceptions import CloseSpider from scrapy.http import HtmlResponse from scrapy.linkextractors import LinkExtractor from scrapy.loader import ItemLoader -from scrapy.loader.processors import TakeFirst, Identity, MapCompose, \ - Compose, Join +from scrapy.loader.processors import TakeFirst, Identity, Compose -from finscraper.http import SeleniumCallbackRequest -from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin -from finscraper.utils import strip_join, safe_cast_int, strip_elements, \ - drop_empty_elements +from finscraper.request import SeleniumCallbackRequest +from finscraper.text_utils import strip_join, drop_empty_elements, \ + paragraph_join class _OikotieApartmentSpider(Spider): @@ -26,14 +22,14 @@ class _OikotieApartmentSpider(Spider): follow_link_extractor = LinkExtractor( allow_domains=('asunnot.oikotie.fi'), allow=(r'.*\/myytavat-asunnot\/.*'), - deny=('.*?origin\=.*'), + deny=(r'.*?origin\=.*'), deny_domains=(), canonicalize=True ) item_link_extractor = LinkExtractor( allow_domains=('asunnot.oikotie.fi'), allow=(rf'.*/myytavat-asunnot/.*/[0-9]+'), - deny=('.*?origin\=.*'), + deny=(r'.*?origin\=.*'), deny_domains=(), canonicalize=True ) @@ -41,19 +37,6 @@ class _OikotieApartmentSpider(Spider): 'ROBOTSTXT_OBEY': False, # No robots.txt, will fail with yes 'DOWNLOADER_MIDDLEWARES': { 'finscraper.middlewares.SeleniumCallbackMiddleware': 800 - }, - 'DEFAULT_REQUEST_HEADERS': { - 'Connection': 'keep-alive', - 'Cache-Control': 'max-age=0', - 'DNT': '1', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36', - 'Sec-Fetch-User': '?1', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', - 'Sec-Fetch-Site': 'same-origin', - 'Sec-Fetch-Mode': 'navigate', - 'Accept-Encoding': 
'gzip, deflate, br', - 'Accept-Language': 'en-US,en;q=0.9', } } itemcount = 0 @@ -97,7 +80,7 @@ class _OikotieApartmentSpider(Spider): 'Neliöhinta': 'price_per_sq', 'Velkaosuus': 'share_of_liabilities', 'Kiinnitykset': 'mortgages', - + 'Rahoitusvastike': 'financial_charge', 'Hoitovastike': 'condominium_payment', 'Yhtiövastike': 'maintenance_charge', @@ -137,7 +120,7 @@ class _OikotieApartmentSpider(Spider): def __init__(self, *args, **kwargs): """Fetch oikotie.fi apartments. - + Args: """ kwargs['follow_request_type'] = SeleniumCallbackRequest @@ -151,8 +134,10 @@ def start_requests(self): @staticmethod def _handle_start(request, spider, driver): driver.get(request.url) - driver.find_element_by_xpath( - '//div[contains(@class, "sccm-button-green")]').click() + policy_modal = driver.find_element_by_xpath( + '//div[contains(@class, "sccm-button-green")]') + if policy_modal: + policy_modal.click() return HtmlResponse( driver.current_url, body=driver.page_source.encode('utf-8'), @@ -162,9 +147,9 @@ def _handle_start(request, spider, driver): def parse(self, resp, to_parse=False): """Parse items and follow links based on defined link extractors.""" - if (self.itemcount and - self.itemcount == self.settings.get('CLOSESPIDER_ITEMCOUNT', 0)): - raise CloseSpider + max_itemcount = self.settings.get('CLOSESPIDER_ITEMCOUNT', 0) + if self.itemcount and self.itemcount == max_itemcount: + raise CloseSpider if to_parse: yield self._parse_item(resp) @@ -181,44 +166,45 @@ def parse(self, resp, to_parse=False): for link in follow_links: yield SeleniumCallbackRequest( link.url, callback=self.parse, priority=10) - + def _parse_item(self, resp): - l = ItemLoader(item=_OikotieApartmentItem(), response=resp) - l.add_value('url', resp.url) - l.add_value('time', int(time.time())) - + il = ItemLoader(item=_OikotieApartmentItem(), response=resp) + il.add_value('url', resp.url) + il.add_value('time', int(time.time())) + # Apartment info - l.add_xpath('title', '//title//text()') - l.add_xpath('overview', + il.add_xpath('title', '//title//text()') + il.add_xpath( + 'overview', '//div[contains(@class, "listing-overview")]//text()') # From tables table_xpath = '//dt[text()="{title}"]/following-sibling::dd[1]//text()' for title, field in self.title2field.items(): - l.add_xpath(field, table_xpath.format(title=title)) + il.add_xpath(field, table_xpath.format(title=title)) # Contact information - l.add_xpath( + il.add_xpath( 'contact_person_name', '//div[contains(@class, "listing-person__details-item--big")]' '//text()' ) - l.add_xpath( + il.add_xpath( 'contact_person_job_title', '//div[contains(@class, "listing-person__details-item--waisted")]' '//text()' ) - l.add_xpath( + il.add_xpath( 'contact_person_phone_number', '(//div[contains(@class, "listing-person__details-item' '--sm-top-margin")]/span)[2]//text()' ) - l.add_xpath( + il.add_xpath( 'contact_person_company', '//div[@class="listing-company__name"]/a/span//text()' ) - l.add_xpath('contact_person_email', '(//p)[1]//text()') - return l.load_item() + il.add_xpath('contact_person_email', '(//p)[1]//text()') + return il.load_item() class _OikotieApartmentItem(Item): @@ -233,9 +219,11 @@ class _OikotieApartmentItem(Item): * contact_person_phone_number (str): Phone number of the contact \ person. * contact_person_company (str): Company of the contact person. 
- """.strip() + '\n' + ( + """.strip() + ( + '\n' + '\n'.join(f'{" " * 8}* {field} (str): {desc}' - for desc, field in _OikotieApartmentSpider.title2field.items())) + for desc, field + in _OikotieApartmentSpider.title2field.items())) url = Field( input_processor=Identity(), output_processor=TakeFirst() @@ -250,7 +238,7 @@ class _OikotieApartmentItem(Item): output_processor=TakeFirst() ) overview = Field( - input_processor=strip_join, + input_processor=Compose(drop_empty_elements, paragraph_join), output_processor=TakeFirst() ) # Basic information diff --git a/finscraper/scrapy_spiders/suomi24page.py b/finscraper/scrapy_spiders/suomi24page.py index 111ea51..778e6c2 100644 --- a/finscraper/scrapy_spiders/suomi24page.py +++ b/finscraper/scrapy_spiders/suomi24page.py @@ -3,18 +3,16 @@ import time -from functools import partial from scrapy import Item, Field, Selector from scrapy.crawler import Spider -from scrapy.exceptions import DropItem from scrapy.linkextractors import LinkExtractor from scrapy.loader import ItemLoader from scrapy.loader.processors import TakeFirst, Identity, MapCompose, Compose from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin -from finscraper.utils import strip_join, safe_cast_int, strip_elements, \ - drop_empty_elements +from finscraper.text_utils import strip_join, safe_cast_int, strip_elements, \ + drop_empty_elements, paragraph_join class _Suomi24PageSpider(FollowAndParseItemMixin, Spider): @@ -23,14 +21,14 @@ class _Suomi24PageSpider(FollowAndParseItemMixin, Spider): follow_link_extractor = LinkExtractor( allow_domains=('keskustelu.suomi24.fi'), allow=(), - deny=('\?'), + deny=(r'\?'), deny_domains=(), canonicalize=True ) item_link_extractor = LinkExtractor( allow_domains=('keskustelu.suomi24.fi'), allow=(rf'/t/[0-9]+/[A-z0-9\-]+'), - deny=('\?'), + deny=(r'\?'), deny_domains=(), canonicalize=True ) @@ -38,62 +36,75 @@ class _Suomi24PageSpider(FollowAndParseItemMixin, Spider): def __init__(self, *args, **kwargs): """Fetch comments from suomi24.fi. 
- + Args: """ super(_Suomi24PageSpider, self).__init__(*args, **kwargs) def _parse_comment_response(self, response): - l = ItemLoader(item=_Suomi24CommentResponseItem(), selector=response) - l.add_xpath('author', + il = ItemLoader(item=_Suomi24CommentResponseItem(), selector=response) + il.add_xpath( + 'author', '//*[contains(@class, "Username")]//text()') - l.add_xpath('date', + il.add_xpath( + 'date', '//*[contains(@class, "Timestamp")]//text()') - l.add_xpath('quotes', '//blockquote//text()', strip_join) - l.add_xpath('content', '//p[contains(@class, "Text")]//text()') - return l.load_item() + il.add_xpath('quotes', '//blockquote//text()', strip_join) + il.add_xpath( + 'content', + '//p[contains(@class, "ListItem__Text")]//text()') + return il.load_item() def _parse_comment(self, comment): - l = ItemLoader(item=_Suomi24CommentItem(), selector=comment) - l.add_xpath('author', + il = ItemLoader(item=_Suomi24CommentItem(), selector=comment) + il.add_xpath( + 'author', '(//*[contains(@class, "Username")])[1]//text()') - l.add_xpath('date', + il.add_xpath( + 'date', '(//*[contains(@class, "Timestamp")])[1]//text()') - l.add_xpath('quotes', + il.add_xpath( + 'quotes', '(//article)[1]//blockquote//text()') - l.add_xpath('content', - '(//article)[1]//p[contains(@class, "Text")]//text()') + il.add_xpath( + 'content', + '(//article)[1]//p[contains(@class, "ListItem__Text")]//text()') responses = [] responses_xpath = '//li[contains(@class, "CommentResponsesItem")]' for response in comment.xpath(responses_xpath): responses.append( self._parse_comment_response(Selector(text=response.get()))) - l.add_value('responses', responses) - return l.load_item() - + il.add_value('responses', responses) + return il.load_item() + def _parse_item(self, resp): - l = ItemLoader(item=_Suomi24PageItem(), response=resp) - l.add_value('url', resp.url) - l.add_value('time', int(time.time())) - l.add_xpath('title', '//*[contains(@*, "thread-title")]//text()') - l.add_xpath('published', + il = ItemLoader(item=_Suomi24PageItem(), response=resp) + il.add_value('url', resp.url) + il.add_value('time', int(time.time())) + il.add_xpath('title', '//*[contains(@*, "thread-title")]//text()') + il.add_xpath( + 'published', '(//*[contains(@class, "Timestamp")])[1]//text()') - l.add_xpath('author', + il.add_xpath( + 'author', '(//*[contains(@class, "Username")])[1]//text()') - l.add_xpath('content', + il.add_xpath( + 'content', '(//*[contains(@*, "thread-body-text")])[1]//text()') - l.add_xpath('n_comments', + il.add_xpath( + 'n_comments', '(//*[contains(@*, "stats-comments")])[1]//text()') - l.add_xpath('views', + il.add_xpath( + 'views', '(//*[contains(@*, "stats-views")])[1]//text()') - + comments = [] comment_xpath = '//li[contains(@class, "CommentItem")]' for comment in resp.xpath(comment_xpath): comments.append(self._parse_comment(Selector(text=comment.get()))) - l.add_value('comments', comments) - return l.load_item() + il.add_value('comments', comments) + return il.load_item() class _Suomi24CommentResponseItem(Item): @@ -117,7 +128,7 @@ class _Suomi24CommentResponseItem(Item): output_processor=Identity() ) content = Field( - input_processor=strip_join, + input_processor=paragraph_join, output_processor=TakeFirst() ) @@ -179,7 +190,7 @@ class _Suomi24PageItem(Item): output_processor=TakeFirst() ) content = Field( - input_processor=strip_join, + input_processor=paragraph_join, output_processor=TakeFirst() ) comments = Field( diff --git a/finscraper/scrapy_spiders/torideal.py b/finscraper/scrapy_spiders/torideal.py index 
667dc34..3e80a1b 100644 --- a/finscraper/scrapy_spiders/torideal.py +++ b/finscraper/scrapy_spiders/torideal.py @@ -3,19 +3,16 @@ import time -from functools import partial - from scrapy import Item, Field, Selector from scrapy.crawler import Spider -from scrapy.exceptions import DropItem from scrapy.linkextractors import LinkExtractor from scrapy.loader import ItemLoader from scrapy.loader.processors import TakeFirst, Identity, MapCompose, \ - Compose, Join + Compose from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin -from finscraper.utils import strip_join, safe_cast_int, strip_elements, \ - drop_empty_elements, replace +from finscraper.text_utils import strip_join, drop_empty_elements, \ + paragraph_join class _ToriDealSpider(FollowAndParseItemMixin, Spider): @@ -43,7 +40,7 @@ class _ToriDealSpider(FollowAndParseItemMixin, Spider): def __init__(self, *args, **kwargs): """Fetch deals from tori.fi. - + Args: """ super(_ToriDealSpider, self).__init__(*args, **kwargs) @@ -56,27 +53,33 @@ def _get_image_metadata(text): 'alt': sel.xpath('//@alt').get(), 'title': sel.xpath('//@title').get() } - + def _parse_item(self, resp): - l = ItemLoader(item=_ToriDealItem(), response=resp) - l.add_value('url', resp.url) - l.add_value('time', int(time.time())) - l.add_xpath('seller', + il = ItemLoader(item=_ToriDealItem(), response=resp) + il.add_value('url', resp.url) + il.add_value('time', int(time.time())) + il.add_xpath( + 'seller', '//div[contains(@id, "seller_info")]//text()') - l.add_xpath('name', + il.add_xpath( + 'name', '//div[@class="topic"]//*[contains(@itemprop, "name")]//text()') - l.add_xpath('description', + il.add_xpath( + 'description', '//*[contains(@itemprop, "description")]//text()') - l.add_xpath('price', + il.add_xpath( + 'price', '//*[contains(@itemprop, "price")]//text()') - l.add_xpath('type', + il.add_xpath( + 'type', '//td[contains(text(), "Ilmoitustyyppi")]' '/following-sibling::td[1]//text()') - l.add_xpath('published', + il.add_xpath( + 'published', '//td[contains(text(), "Ilmoitus jätetty")]' '/following-sibling::td[1]//text()') - l.add_xpath('images', '//div[@class="media_container"]//img') - return l.load_item() + il.add_xpath('images', '//div[@class="media_container"]//img') + return il.load_item() class _ToriDealItem(Item): @@ -101,11 +104,7 @@ class _ToriDealItem(Item): output_processor=TakeFirst() ) seller = Field( - input_processor=Compose( - strip_elements, - drop_empty_elements, - MapCompose(partial(replace, source='\n', target=' ')), - partial(strip_join, join_with='\n')), + input_processor=Compose(drop_empty_elements, paragraph_join), output_processor=TakeFirst() ) name = Field( @@ -113,10 +112,7 @@ class _ToriDealItem(Item): output_processor=TakeFirst() ) description = Field( - input_processor=Compose( - strip_elements, - MapCompose(partial(replace, source='\n', target=' ')), - partial(strip_join, join_with='\n')), + input_processor=Compose(drop_empty_elements, paragraph_join), output_processor=TakeFirst() ) price = Field( diff --git a/finscraper/scrapy_spiders/vauvapage.py b/finscraper/scrapy_spiders/vauvapage.py index bb618cc..749cd3c 100644 --- a/finscraper/scrapy_spiders/vauvapage.py +++ b/finscraper/scrapy_spiders/vauvapage.py @@ -3,17 +3,15 @@ import time -from functools import partial from scrapy import Item, Field, Selector from scrapy.crawler import Spider -from scrapy.exceptions import DropItem from scrapy.linkextractors import LinkExtractor from scrapy.loader import ItemLoader from scrapy.loader.processors import TakeFirst, Identity, 
MapCompose, Compose from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin -from finscraper.utils import strip_join, safe_cast_int, strip_elements, \ +from finscraper.text_utils import strip_join, safe_cast_int, strip_elements, \ drop_empty_elements @@ -38,48 +36,53 @@ class _VauvaPageSpider(FollowAndParseItemMixin, Spider): def __init__(self, *args, **kwargs): """Fetch comments from vauva.fi. - + Args: """ super(_VauvaPageSpider, self).__init__(*args, **kwargs) def _parse_comment(self, comment): - l = ItemLoader(item=_VauvaCommentItem(), selector=comment) - l.add_xpath('author', '//*[contains(@property, "name")]//text()') - l.add_xpath('date', '//div[contains(@class, "post-date")]//text()') - l.add_xpath('quotes', '//blockquote//text()', strip_join) - l.add_xpath('content', '//p//text()') - votes = l.nested_xpath('//span[contains(@class, "voting-count")]') + il = ItemLoader(item=_VauvaCommentItem(), selector=comment) + il.add_xpath('author', '//*[contains(@property, "name")]//text()') + il.add_xpath('date', '//div[contains(@class, "post-date")]//text()') + il.add_xpath('quotes', '//blockquote//text()') + il.add_xpath('content', '//p//text()') + votes = il.nested_xpath('//span[contains(@class, "voting-count")]') votes.add_xpath('upvotes', '//li[@class="first"]//text()') votes.add_xpath('downvotes', '//li[@class="last"]//text()') - return l.load_item() - + return il.load_item() + def _parse_item(self, resp): - l = ItemLoader(item=_VauvaPageItem(), response=resp) - l.add_value('url', resp.url) - l.add_value('time', int(time.time())) - l.add_xpath('title', + il = ItemLoader(item=_VauvaPageItem(), response=resp) + il.add_value('url', resp.url) + il.add_value('time', int(time.time())) + il.add_xpath( + 'title', '//article//div[contains(@property, "title")]//text()') - l.add_xpath('page', + il.add_xpath( + 'page', '//article//li[contains(@class, "pager-current")]//text()') - l.add_value('page', ['1']) - l.add_xpath('pages', + il.add_value('page', ['1']) + il.add_xpath( + 'pages', '//article//li[contains(@class, "pager-last")]//text()') - l.add_xpath('pages', + il.add_xpath( + 'pages', '//article//li[contains(@class, "pager-current")]//text()') - l.add_value('pages', ['1']) - l.add_xpath('published', + il.add_value('pages', ['1']) + il.add_xpath( + 'published', '(//article//div[contains(@class, "post-date")])[1]//text()') - l.add_xpath('author', + il.add_xpath( + 'author', '(//article//*[contains(@property, "name")])[1]//text()') - + comments = [] comment_xpath = '//article//article[contains(@class, "comment")]' for comment in resp.xpath(comment_xpath): comments.append(self._parse_comment(Selector(text=comment.get()))) - l.add_value('comments', comments) - loaded_item = l.load_item() - return loaded_item + il.add_value('comments', comments) + return il.load_item() class _VauvaCommentItem(Item): diff --git a/finscraper/scrapy_spiders/ylearticle.py b/finscraper/scrapy_spiders/ylearticle.py index 54ed625..7ff4693 100644 --- a/finscraper/scrapy_spiders/ylearticle.py +++ b/finscraper/scrapy_spiders/ylearticle.py @@ -7,13 +7,12 @@ from scrapy import Item, Field, Selector from scrapy.crawler import Spider -from scrapy.exceptions import DropItem from scrapy.linkextractors import LinkExtractor from scrapy.loader import ItemLoader from scrapy.loader.processors import TakeFirst, Identity, MapCompose from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin -from finscraper.utils import strip_join +from finscraper.text_utils import strip_join, paragraph_join class 
_YLEArticleSpider(FollowAndParseItemMixin, Spider): @@ -37,7 +36,7 @@ class _YLEArticleSpider(FollowAndParseItemMixin, Spider): def __init__(self, *args, **kwargs): """Fetch YLE news articles. - + Args: """ super(_YLEArticleSpider, self).__init__(*args, **kwargs) @@ -51,43 +50,51 @@ def _get_image_metadata(text): 'caption': sel.xpath( '//figcaption//text()').getall() } - + def _parse_item(self, resp): - l = ItemLoader(item=_YLEArticleItem(), response=resp) - l.add_value('url', resp.url) - l.add_value('time', int(time.time())) - # Oldskool - l.add_xpath('title', + il = ItemLoader(item=_YLEArticleItem(), response=resp) + il.add_value('url', resp.url) + il.add_value('time', int(time.time())) + + # Tradition style + il.add_xpath( + 'title', '//article' '//div[contains(@class, "article__header")]' '//h1[contains(@class, "article__heading")]//text()') - l.add_xpath('ingress', + il.add_xpath( + 'ingress', '//article' '//div[contains(@class, "article__header")]' '//p[contains(@class, "article__paragraph")]//text()') - l.add_xpath('content', - '//article'+ - '//div[contains(@class, "article__content")]'+ - '//p[contains(@class, "article__paragraph")]//text()') - l.add_xpath('published', - '//article'+ - '//div[contains(@class, "article__header")]'+ + + pgraphs_xpath = ('//article//div[contains(@class, "article__content")]' + '//p[contains(@class, "article__paragraph")]') + content = [''.join(Selector(text=pgraph).xpath('//text()').getall()) + for pgraph in resp.xpath(pgraphs_xpath).getall()] + il.add_value('content', content) + + il.add_xpath( + 'published', + '//article//div[contains(@class, "article__header")]' '//span[contains(@class, "article__date")]//text()') - l.add_xpath('author', - '//article'+ - '//span[contains(@class, "author__name")]//text()') - l.add_xpath('images', - '//article'+ - '//figure[contains(@class, "article__figure")]') - - # "Modern" - l.add_xpath('title', - '//article'+ + il.add_xpath( + 'author', + '//article//span[contains(@class, "author__name")]//text()') + il.add_xpath( + 'images', + '//article//figure[contains(@class, "article__figure")]') + + # "Modern" style news + il.add_xpath( + 'title', + '//article' '//h1[contains(@class, "article__feature__heading")]//text()') - l.add_xpath('content', - '//article'+ + il.add_xpath( + 'content', + '//article' '//p[contains(@class, "article__feature__paragraph")]//text()') - return l.load_item() + return il.load_item() class _YLEArticleItem(Item): @@ -119,7 +126,7 @@ class _YLEArticleItem(Item): output_processor=TakeFirst() ) content = Field( - input_processor=strip_join, + input_processor=paragraph_join, output_processor=TakeFirst() ) published = Field( diff --git a/finscraper/settings.py b/finscraper/settings.py index ee2966d..d535741 100644 --- a/finscraper/settings.py +++ b/finscraper/settings.py @@ -11,50 +11,50 @@ BOT_NAME = 'finscraper' -#SPIDER_MODULES = ['finscraper.scrapy_spiders'] -#NEWSPIDER_MODULE = 'finscraper.scrapy_spiders' +# SPIDER_MODULES = ['finscraper.scrapy_spiders'] +# NEWSPIDER_MODULE = 'finscraper.scrapy_spiders' -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'finscraper (+http://www.yourdomain.com)' +# Crawl responsibly by identifying yourself on the user-agent +# USER_AGENT = 'finscraper (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 +# CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 
0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 +# DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) -#COOKIES_ENABLED = False +# COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False +# TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'finscraper.middlewares.${ProjectName}SpiderMiddleware': 543, -#} +# SPIDER_MIDDLEWARES = { +# 'finscraper.middlewares.${ProjectName}SpiderMiddleware': 543, +# } # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'finscraper.middlewares.${ProjectName}DownloaderMiddleware': 543, -#} +# DOWNLOADER_MIDDLEWARES = { +# 'finscraper.middlewares.${ProjectName}DownloaderMiddleware': 543, +# } # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html @@ -71,21 +71,20 @@ # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True +# AUTOTHROTTLE_ENABLED = True # The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 +# AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 +# AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False +# AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' \ No newline at end of file +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/finscraper/spiders.py b/finscraper/spiders.py index 4c759cd..237b3a7 100644 --- a/finscraper/spiders.py +++ b/finscraper/spiders.py @@ -13,7 +13,7 @@ from finscraper.scrapy_spiders.suomi24page import _Suomi24PageSpider, \ _Suomi24PageItem from finscraper.scrapy_spiders.vauvapage import _VauvaPageSpider, \ - _VauvaPageItem, _VauvaPageSpider + _VauvaPageItem from finscraper.scrapy_spiders.ylearticle import _YLEArticleSpider, \ _YLEArticleItem from finscraper.scrapy_spiders.oikotieapartment import \ @@ -31,7 +31,7 @@ log_level (str or None, optional): Logging level to display. 
Should be in ['debug', 'info', 'warn', 'error', 'critical'] or None (disabled). Defaults to None. - + .. note:: This parameter can be overridden through Scrapy ``settings`` (LOG_LEVEL, LOG_ENABLED) within the ``scrape`` -method. @@ -46,6 +46,7 @@ def _get_docstring(spider_cls, item_cls): class ISArticle(_SpiderWrapper): __doc__ = _get_docstring(_ISArticleSpider, _ISArticleItem) + def __init__(self, jobdir=None, progress_bar=True, log_level=None): super(ISArticle, self).__init__( spider_cls=_ISArticleSpider, @@ -57,6 +58,7 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None): class ILArticle(_SpiderWrapper): __doc__ = _get_docstring(_ILArticleSpider, _ILArticleItem) + def __init__(self, jobdir=None, progress_bar=True, log_level=None): super(ILArticle, self).__init__( spider_cls=_ILArticleSpider, @@ -68,6 +70,7 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None): class YLEArticle(_SpiderWrapper): __doc__ = _get_docstring(_YLEArticleSpider, _YLEArticleItem) + def __init__(self, jobdir=None, progress_bar=True, log_level=None): super(YLEArticle, self).__init__( spider_cls=_YLEArticleSpider, @@ -79,6 +82,7 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None): class DemiPage(_SpiderWrapper): __doc__ = _get_docstring(_DemiPageSpider, _DemiPageItem) + def __init__(self, jobdir=None, progress_bar=True, log_level=None): super(DemiPage, self).__init__( spider_cls=_DemiPageSpider, @@ -90,6 +94,7 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None): class Suomi24Page(_SpiderWrapper): __doc__ = _get_docstring(_Suomi24PageSpider, _Suomi24PageItem) + def __init__(self, jobdir=None, progress_bar=True, log_level=None): super(Suomi24Page, self).__init__( spider_cls=_Suomi24PageSpider, @@ -101,6 +106,7 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None): class VauvaPage(_SpiderWrapper): __doc__ = _get_docstring(_VauvaPageSpider, _VauvaPageItem) + def __init__(self, jobdir=None, progress_bar=True, log_level=None): super(VauvaPage, self).__init__( spider_cls=_VauvaPageSpider, @@ -112,6 +118,7 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None): class OikotieApartment(_SpiderWrapper): __doc__ = _get_docstring(_OikotieApartmentSpider, _OikotieApartmentItem) + def __init__(self, jobdir=None, progress_bar=True, log_level=None): super(OikotieApartment, self).__init__( spider_cls=_OikotieApartmentSpider, @@ -123,6 +130,7 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None): class ToriDeal(_SpiderWrapper): __doc__ = _get_docstring(_ToriDealSpider, _ToriDealItem) + def __init__(self, jobdir=None, progress_bar=True, log_level=None): super(ToriDeal, self).__init__( spider_cls=_ToriDealSpider, diff --git a/finscraper/text_utils.py b/finscraper/text_utils.py new file mode 100644 index 0000000..01cb7c3 --- /dev/null +++ b/finscraper/text_utils.py @@ -0,0 +1,33 @@ +"""Module for text processing utility functions and classes.""" + + +def strip_join(text_list, join_with=' '): + joined_text = join_with.join(text.strip() for text in text_list + if text is not None) + return joined_text + + +def paragraph_join(text_list): + return '\n\n'.join([replace(text, '\n', ' ') + for text in strip_elements(text_list)]) + + +def replace(text, source, target): + return text.replace(source, target) if text is not None else None + + +def strip_elements(text_list): + return [text.strip() for text in text_list if text is not None] + + +def drop_empty_elements(text_list): + return [text for text in text_list + if text is not 
None + and (type(text) == str and text.strip() != '')] + + +def safe_cast_int(text): + try: + return int(text) + except Exception: + return None diff --git a/finscraper/utils.py b/finscraper/utils.py index 58d9d35..bcb30cb 100644 --- a/finscraper/utils.py +++ b/finscraper/utils.py @@ -4,23 +4,21 @@ import io import logging import pickle -import re # Monkey patch, see https://github.com/pypa/pipenv/issues/2609 import webdriver_manager.utils -def console(text, bold=False): +def console(text, bold=False): # NOQA pass -webdriver_manager.utils.console = console +webdriver_manager.utils.console = console # NOQA from selenium import webdriver from selenium.webdriver.chrome.options import Options from webdriver_manager.chrome import ChromeDriverManager -from tqdm.auto import tqdm - class TqdmLogger(io.StringIO): """File-like object that redirects buffer to stdout.""" + def __init__(self, logger): self.logger = logger self.buf = '' @@ -39,7 +37,8 @@ class QueueHandler(logging.Handler): This handler checks for picklability before saving items into queue. Modified from: https://gist.github.com/vsajip/591589 """ - def __init__(self, queue): + + def __init__(self, queue): logging.Handler.__init__(self) self.queue = queue @@ -53,9 +52,9 @@ def _get_picklable_attrs(self, record): attrdict[attr] = value except AttributeError: pass - except: + except Exception: pass - + if type(record.args) == tuple: attrdict['args'] = record.args else: @@ -66,8 +65,7 @@ def _get_picklable_attrs(self, record): args[attr] = value except AttributeError: args[attr] = str(value) - pass - except: + except Exception: pass attrdict['args'] = args new_record = logging.makeLogRecord(attrdict) @@ -89,7 +87,7 @@ def emit(self, record): self.enqueue(self.prepare(record)) except (KeyboardInterrupt, SystemExit): raise - except: + except Exception: self.handleError(record) @@ -101,29 +99,3 @@ def get_chromedriver(options=None): options.add_argument("--disable-gpu") driver = webdriver.Chrome(ChromeDriverManager().install(), options=options) return driver - - -def strip_join(text_list, join_with=' '): - joined_text = join_with.join(text.strip() for text in text_list - if text is not None) - return joined_text - - -def replace(text, source, target): - return text.replace(source, target) if text is not None else None - - -def strip_elements(text_list): - return [text.strip() for text in text_list if text is not None] - - -def drop_empty_elements(text_list): - return [text for text in text_list - if text is not None or (type(text) == str and text.strip() != '')] - - -def safe_cast_int(text): - try: - return int(text) - except: - return None diff --git a/finscraper/wrappers.py b/finscraper/wrappers.py index 7366f5a..f3badab 100644 --- a/finscraper/wrappers.py +++ b/finscraper/wrappers.py @@ -7,7 +7,6 @@ import pickle import platform import shutil -import sys import tempfile import uuid import weakref @@ -17,11 +16,8 @@ import pandas as pd -from scrapy import Request -from scrapy.crawler import CrawlerProcess, CrawlerRunner -from scrapy.exceptions import CloseSpider +from scrapy.crawler import CrawlerRunner from scrapy.settings import Settings -from scrapy.spiders import Spider from scrapy.utils.log import configure_logging from twisted.internet import reactor @@ -39,7 +35,6 @@ def _run_as_process(func, spider_cls, spider_params, settings): # (queuehandler --> listener --> root logger --> streamhandler) progress_bar_enabled = settings['PROGRESS_BAR_ENABLED'] log_enabled = settings['LOG_ENABLED'] - log_stdout = settings['LOG_STDOUT'] q_log = 
None ql = None if log_enabled or progress_bar_enabled: @@ -57,7 +52,7 @@ def _run_as_process(func, spider_cls, spider_params, settings): logger = logging.getLogger() logger.setLevel(settings.get('LOG_LEVEL')) logger.addHandler(handler) - + # Start function as a separate process q = mp.Queue() p = mp.Process(target=func, @@ -122,13 +117,13 @@ def __init__(self, spider_cls, spider_params, jobdir=None, self.log_level = log_level self.progress_bar = progress_bar and self.log_level is None - + self._items_save_path = self._jobdir / 'items.jl' self._spider_save_path = self._jobdir / 'spider.pkl' self._finalizer = weakref.finalize( self, shutil.rmtree, self._jobdir, ignore_errors=True) - + # Note: Parameters cannot be changed outside by setting them for param in self.spider_params: setattr(self, param, self.spider_params[param]) @@ -136,7 +131,7 @@ def __init__(self, spider_cls, spider_params, jobdir=None, @property def jobdir(self): """Working directory of the spider. - + Can be changed after initialization of a spider. """ return str(self._jobdir) @@ -144,7 +139,7 @@ def jobdir(self): @property def log_level(self): """Logging level of the spider. - + This attribute can be changed after initialization of a spider. """ return self._log_level @@ -153,7 +148,7 @@ def log_level(self): def log_level(self, log_level): if log_level is None: self._log_level = log_level - elif (type(log_level) == str + elif (type(log_level) == str and log_level.strip().lower() in self._log_levels): self._log_level = self._log_levels[log_level.strip().lower()] else: @@ -163,7 +158,7 @@ def log_level(self, log_level): @property def progress_bar(self): """Whether progress bar is enabled or not. - + Can be changed after initialization of a spider. """ return self._progress_bar @@ -178,7 +173,7 @@ def progress_bar(self, progress_bar): @property def items_save_path(self): """Save of path of the scraped items. - + Cannot be changed after initialization of a spider. """ return str(self._items_save_path) @@ -186,7 +181,7 @@ def items_save_path(self): @property def spider_save_path(self): """Save path of the spider. - + Cannot be changed after initialization of a spider. """ return str(self._spider_save_path) @@ -195,25 +190,25 @@ def _run_spider(self, itemcount=10, timeout=60, pagecount=0, errorcount=0, settings=None): _settings = Settings() _settings.setmodule('finscraper.settings', priority='project') - + _settings['JOBDIR'] = self.jobdir _settings['FEEDS'] = {self.items_save_path: {'format': 'jsonlines'}} - + _settings['CLOSESPIDER_ITEMCOUNT'] = itemcount _settings['CLOSESPIDER_TIMEOUT'] = timeout _settings['CLOSESPIDER_PAGECOUNT'] = pagecount _settings['CLOSESPIDER_ERRORCOUNT'] = errorcount - + _settings['LOG_STDOUT'] = True _settings['LOG_LEVEL'] = self.log_level or logging.NOTSET _settings['LOG_ENABLED'] = self.log_level is not None # Logging dominates progress bar _settings['PROGRESS_BAR_ENABLED'] = self.progress_bar - + # Will always be prioritized --> conflicts are possible if settings is not None: _settings.update(settings) - + try: _run_as_process( func=_run_spider_func, @@ -223,10 +218,10 @@ def _run_spider(self, itemcount=10, timeout=60, pagecount=0, errorcount=0, ) except KeyboardInterrupt: pass - + def scrape(self, n=10, timeout=60, settings=None): """Scrape given number of items. - + Args: n (int, optional): Number of items to attempt to scrape. Zero corresponds to no limit. Defaults to 10. 
@@ -236,7 +231,7 @@ def scrape(self, n=10, timeout=60, settings=None): Defaults to None, which correspond to default settings. See list of available settings at: https://docs.scrapy.org/en/latest/topics/settings.html. - + Returns: self """ @@ -245,7 +240,7 @@ def scrape(self, n=10, timeout=60, settings=None): def get(self, fmt='df'): """Return scraped data as DataFrame or list. - + Args: fmt (str, optional): Format to return parsed items as. Should be in ['df', 'list']. Defaults to 'df'. @@ -268,10 +263,10 @@ def get(self, fmt='df'): return jsonlines elif fmt == 'df': return pd.DataFrame(jsonlines) - + def save(self): """Save spider in ``jobdir`` for later use. - + Returns: str: Path to job directory. """ @@ -284,10 +279,10 @@ def save(self): @classmethod def load(cls, jobdir): """Load existing spider from ``jobdir``. - + Args: jobdir (str): Path to job directory. - + Returns: Spider loaded from job directory. """ diff --git a/scripts/release.sh b/scripts/release.sh deleted file mode 100644 index 28103d9..0000000 --- a/scripts/release.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -conda activate finscraper && \ -rm -rf dist/ &&\ -python setup.py sdist && \ -twine upload dist/* \ No newline at end of file diff --git a/tests/test_spiders.py b/tests/test_spiders.py index d9cfe74..192fa59 100644 --- a/tests/test_spiders.py +++ b/tests/test_spiders.py @@ -4,7 +4,6 @@ import time import pytest -pytestmark = [pytest.mark.spider] from finscraper.spiders import ILArticle, ISArticle, YLEArticle, VauvaPage, \ OikotieApartment, DemiPage, Suomi24Page, ToriDeal @@ -12,6 +11,8 @@ from tests.utils import calc_field_emptiness +pytestmark = [pytest.mark.spider] + # Spiders can be added here, and basic tests will be set up automatically spiders = [ { @@ -64,12 +65,11 @@ } ] - scrape_cases = [pytest.param(s['class'], p, s['n_fields'], marks=s['mark']) for s in spiders for p in s['params']] other_cases = [pytest.param(s['class'], p, marks=s['mark']) - for s in spiders for p in s['params']] + for s in spiders for p in s['params']] @pytest.fixture(scope='function') diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index 4c3728c..8675049 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -11,7 +11,7 @@ def test_spider_save_load_with_jobdir(): jobdir = '../jobdir' spider = ISArticle(jobdir=jobdir) - + save_jobdir = spider.save() loaded_spider = ISArticle.load(save_jobdir) @@ -22,7 +22,7 @@ def test_spider_save_load_with_jobdir(): def test_spider_save_load_without_jobdir(): spider = ISArticle() - + save_jobdir = spider.save() loaded_spider = ISArticle.load(save_jobdir) @@ -77,9 +77,10 @@ def test_spider_logging(): # Attribute set try: spider.log_level = 'test' + assert False except ValueError: assert True - except: + except Exception: assert False spider.log_level = 'info' assert spider.log_level == logging.INFO @@ -91,13 +92,13 @@ def test_spider_progress_bar(): # Progress bar true by default spider = ILArticle() spider.scrape(1) - assert spider.progress_bar == True + assert spider.progress_bar is True assert len(spider.get()) > 0 # Progress bar disabled, when log level given spider = ILArticle(log_level='info') spider.scrape(1) - assert spider.progress_bar == False + assert spider.progress_bar is False assert len(spider.get()) > 0 # TODO: Test the output
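
Reviewer note (not part of the patch): the helpers relocated into the new finscraper/text_utils.py module are pure functions, so their behaviour can be sanity-checked in isolation. Below is a minimal sketch of the expected behaviour, based only on the implementations added in this diff; the sample input strings are made up for illustration.

from finscraper.text_utils import (
    strip_join, paragraph_join, strip_elements, drop_empty_elements,
    safe_cast_int)

# strip_join strips each element and joins the non-None ones with a space
assert strip_join(['  Hello ', None, 'world  ']) == 'Hello world'

# strip_elements strips each non-None element, keeping list structure
assert strip_elements([' a ', None, 'b ']) == ['a', 'b']

# paragraph_join collapses in-paragraph newlines to spaces and separates the
# stripped elements with blank lines, matching its use as the input_processor
# for the `content`/`overview`/`description` fields in this patch
assert paragraph_join(['First\nparagraph ', ' Second paragraph']) == (
    'First paragraph\n\nSecond paragraph')

# drop_empty_elements removes None and whitespace-only strings
assert drop_empty_elements(['a', '', '   ', None, 'b']) == ['a', 'b']

# safe_cast_int returns None instead of raising on non-numeric input
assert safe_cast_int('42') == 42
assert safe_cast_int('n/a') is None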