From e929adde75d387d20c685972658df37ac88f0798 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Wed, 24 Jul 2024 21:30:44 -0400
Subject: [PATCH] fix: items_dir ending in slash no longer causes FEEDS to
 contain double slashes. items_dir starting with drive letter is no longer
 treated as URL.

---
 docs/news.rst          |   2 +
 scrapyd/environ.py     |  29 ++++++------
 scrapyd/utils.py       |   6 +++
 scrapyd/website.py     |   6 +--
 tests/conftest.py      |   6 +++
 tests/test_environ.py  | 100 +++++++++++++++++++++++++++++++++--------
 tests/test_launcher.py |  12 ++---
 7 files changed, 115 insertions(+), 46 deletions(-)

diff --git a/docs/news.rst b/docs/news.rst
index 2af28e0b..1cc04f8e 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -88,6 +88,7 @@ Fixed
 ~~~~~
 
 - Restore support for :ref:`eggstorage` implementations whose ``get()`` methods return file-like objects without ``name`` attributes (1.4.3 regression).
+- If the :ref:`items_dir` setting is a URL and the path component ends with ``/``, the ``FEEDS`` setting no longer contains double slashes.
 - The ``MemoryJobStorage`` class returns finished jobs in reverse chronological order, like the ``SqliteJobStorage`` class.
 - The ``list_projects`` method of the ``SpiderScheduler`` class returns a ``list``, instead of ``dict_keys``.
 - Log errors to Scrapyd's log, even when :ref:`debug` mode is enabled.
@@ -115,6 +116,7 @@ Scrapyd is now tested on macOS and Windows, in addition to Linux.
 
 - The :ref:`cancel.json` webservice now works on Windows, by using SIGBREAK instead of SIGINT or SIGTERM.
 - The :ref:`dbs_dir` setting no longer causes an error if it contains a drive letter on Windows.
+- The :ref:`items_dir` setting is considered a local path if it contains a drive letter on Windows.
 - The :ref:`jobs_to_keep` setting no longer causes an error if a file to delete can't be deleted (for example, if the file is open on Windows).
 
 Removed
diff --git a/scrapyd/environ.py b/scrapyd/environ.py
index 1386b9c7..8bf6df28 100644
--- a/scrapyd/environ.py
+++ b/scrapyd/environ.py
@@ -1,13 +1,15 @@
 import json
 import os
 from contextlib import suppress
-from urllib.parse import urlparse, urlunparse
+from posixpath import join as urljoin
+from urllib.parse import urlsplit
 
 from w3lib.url import path_to_file_uri
 from zope.interface import implementer
 
 from scrapyd.exceptions import DirectoryTraversalError
 from scrapyd.interfaces import IEnvironment
+from scrapyd.utils import local_items
 
 
 @implementer(IEnvironment)
@@ -25,7 +27,7 @@ def get_settings(self, message):
         if self.logs_dir:
             settings["LOG_FILE"] = self._get_file(message, self.logs_dir, "log")
         if self.items_dir:
-            settings["FEEDS"] = json.dumps({self._get_feed_uri(message, "jl"): {"format": "jsonlines"}})
+            settings["FEEDS"] = json.dumps({self._get_feeds(message, "jl"): {"format": "jsonlines"}})
         return settings
 
     def get_environment(self, message, slot):
@@ -44,20 +46,15 @@ def get_environment(self, message, slot):
 
         return env
 
-    def _get_feed_uri(self, message, extension):
-        url = urlparse(self.items_dir)
-        if url.scheme.lower() in ["", "file"]:
-            return path_to_file_uri(self._get_file(message, url.path, extension))
-        return urlunparse(
-            (
-                url.scheme,
-                url.netloc,
-                "/".join([url.path, message["_project"], message["_spider"], f"{message['_job']}.{extension}"]),
-                url.params,
-                url.query,
-                url.fragment,
-            )
-        )
+    def _get_feeds(self, message, extension):
+        parsed = urlsplit(self.items_dir)
+
+        if local_items(self.items_dir, parsed):
+            # File URLs do not have query or fragment components. https://www.rfc-editor.org/rfc/rfc8089#section-2
+            return path_to_file_uri(self._get_file(message, parsed.path, extension))
+
+        path = urljoin(parsed.path, message["_project"], message["_spider"], f"{message['_job']}.{extension}")
+        return parsed._replace(path=path).geturl()
 
     def _get_file(self, message, directory, extension):
         resolvedir = os.path.realpath(directory)
diff --git a/scrapyd/utils.py b/scrapyd/utils.py
index c658cf1b..4605fae2 100644
--- a/scrapyd/utils.py
+++ b/scrapyd/utils.py
@@ -1,3 +1,5 @@
+import os.path
+
 from scrapy.utils.misc import load_object
 
 
@@ -7,6 +9,10 @@ def initialize_component(config, setting, default, *args):
     return cls(config, *args)
 
 
+def local_items(items_dir, parsed):
+    return parsed.scheme.lower() in ("", "file", os.path.splitdrive(items_dir)[0].rstrip(":").lower())
+
+
 def job_log_url(job):
     return f"/logs/{job.project}/{job.spider}/{job.job}.log"
 
diff --git a/scrapyd/website.py b/scrapyd/website.py
index 86ee161a..84bdd630 100644
--- a/scrapyd/website.py
+++ b/scrapyd/website.py
@@ -1,7 +1,7 @@
 import socket
 from datetime import datetime, timedelta
 from html import escape
-from urllib.parse import quote, urlparse
+from urllib.parse import quote, urlsplit
 
 from scrapy.utils.misc import load_object
 from twisted.application.service import IServiceCollection
@@ -9,7 +9,7 @@
 from twisted.web import resource, static
 
 from scrapyd.interfaces import IEggStorage, IPoller, ISpiderScheduler
-from scrapyd.utils import job_items_url, job_log_url
+from scrapyd.utils import job_items_url, job_log_url, local_items
 
 
 class PrefixHeaderMixin:
@@ -136,7 +136,7 @@ def __init__(self, config, app):
         self.debug = config.getboolean("debug", False)
         self.runner = config.get("runner", "scrapyd.runner")
         self.prefix_header = config.get("prefix_header")
-        self.local_items = items_dir and (urlparse(items_dir).scheme.lower() in ["", "file"])
+        self.local_items = items_dir and local_items(items_dir, urlsplit(items_dir))
         self.nodename = config.get("node_name", socket.gethostname())
 
         self.putChild(b"", Home(self, self.local_items))
diff --git a/tests/conftest.py b/tests/conftest.py
index 2441da36..dc519a1f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -8,6 +8,7 @@
 
 from scrapyd import Config
 from scrapyd.app import application
+from scrapyd.interfaces import IEnvironment
 from scrapyd.webservice import spider_list
 from scrapyd.website import Root
 from tests import root_add_version
@@ -56,6 +57,11 @@ def app(config):
     return application(config)
 
 
+@pytest.fixture()
+def environ(app):
+    return app.getComponent(IEnvironment)
+
+
 @pytest.fixture()
 def root(config, app):
     return Root(config, app)
diff --git a/tests/test_environ.py b/tests/test_environ.py
index fb241a23..c942fd4a 100644
--- a/tests/test_environ.py
+++ b/tests/test_environ.py
@@ -1,4 +1,7 @@
+import json
 import os
+import re
+from unittest.mock import patch
 
 import pytest
 from zope.interface.verify import verifyObject
@@ -7,40 +10,101 @@
 from scrapyd.environ import Environment
 from scrapyd.exceptions import DirectoryTraversalError
 from scrapyd.interfaces import IEnvironment
+from tests import has_settings
 
-msg = {"_project": "mybot", "_spider": "myspider", "_job": "ID"}
 
+def test_interface(environ):
+    verifyObject(IEnvironment, environ)
 
-@pytest.fixture()
-def environ(tmpdir):
-    config = Config(values={"eggs_dir": tmpdir, "logs_dir": tmpdir})
-    config.cp.add_section("settings")
-    config.cp.set("settings", "newbot", "newbot.settings")
-    return Environment(config, initenv={})
 
+def test_get_settings(environ):
+    settings = environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"})
environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) + + assert re.search(r"^\S+j1\.log$", settings["LOG_FILE"]) + + if environ.items_dir: + feeds = json.loads(settings["FEEDS"]) + path, value = feeds.popitem() + + assert feeds == {} + assert re.search(r"^file:///\S+j1\.jl$", path) + assert value == {"format": "jsonlines"} -def test_interface(environ): - verifyObject(IEnvironment, environ) +@pytest.mark.parametrize( + ("items_dir", "expected"), + [ + ( + "https://host.example/path?query=value#fragment", + "https://host.example/path/p1/s1/j1.jl?query=value#fragment", + ), + ( + "https://host.example/path/", + "https://host.example/path/p1/s1/j1.jl", # no double slashes + ), + ( + "file:/host.example/path?ignored#ignored", + "file:///host.example/path/p1/s1/j1.jl", + ), + ( + "file://hostname/host.example/path?ignored#ignored", + "file:///host.example/path/p1/s1/j1.jl", + ), + ( + "file:///host.example/path?ignored#ignored", + "file:///host.example/path/p1/s1/j1.jl", + ), + ], +) +@patch("twisted.python.filepath.listdir") +@patch("os.makedirs") +def test_get_settings_url(listdir, makedirs, items_dir, expected): + config = Config(values={"logs_dir": "", "items_dir": items_dir}) + environ = Environment(config, initenv={}) -def test_get_environment_with_eggfile(environ): - env = environ.get_environment(msg, 3) + settings = environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"}) - assert env["SCRAPY_PROJECT"] == "mybot" - assert "SCRAPY_SETTINGS_MODULE" not in env + assert settings == {"FEEDS": f'{{"{expected}": {{"format": "jsonlines"}}}}'} @pytest.mark.parametrize("values", [{"items_dir": "../items"}, {"logs_dir": "../logs"}]) @pytest.mark.parametrize(("key", "value"), [("_project", "../p"), ("_spider", "../s"), ("_job", "../j")]) -def test_get_environment_secure(values, key, value): +def test_get_settings_secure(values, key, value): config = Config(values=values) environ = Environment(config, initenv={}) with pytest.raises(DirectoryTraversalError) as exc: - environ.get_settings({**msg, key: value}) + environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1", key: value}) assert str(exc.value) == ( - f"{value if key == '_project' else 'mybot'}{os.sep}" - f"{value if key == '_spider' else 'myspider'}{os.sep}" - f"{value if key == '_job' else 'ID'}.log" + f"{value if key == '_project' else 'p1'}{os.sep}" + f"{value if key == '_spider' else 's1'}{os.sep}" + f"{value if key == '_job' else 'j1'}.log" ) + + +@pytest.mark.parametrize( + ("message", "run_only_if_has_settings"), + [ + ({"_project": "mybot"}, False), + ({"_project": "mybot", "_version": "v1"}, False), + ({"_project": "localproject"}, True), + ], +) +def test_get_environment(environ, message, run_only_if_has_settings): + if run_only_if_has_settings and not has_settings(): + pytest.skip("[settings] section is not set") + + env = environ.get_environment(message, 3) + + assert env["SCRAPY_PROJECT"] == message["_project"] + + if "_version" in message: + assert env["SCRAPYD_EGG_VERSION"] == "v1" + else: + assert "SCRAPYD_EGG_VERSION" not in env + + if run_only_if_has_settings: + assert env["SCRAPY_SETTINGS_MODULE"] == "localproject.settings" + else: + assert "SCRAPY_SETTINGS_MODULE" not in env diff --git a/tests/test_launcher.py b/tests/test_launcher.py index 21758b8f..4765d6ae 100644 --- a/tests/test_launcher.py +++ b/tests/test_launcher.py @@ -8,7 +8,6 @@ from scrapyd import __version__ from scrapyd.config import Config -from scrapyd.interfaces import IEnvironment from scrapyd.launcher import 
 from tests import has_settings
 
@@ -17,11 +16,6 @@ def message(captured):
     return eventAsText(captured[0]).split(" ", 1)[1]
 
 
-@pytest.fixture()
-def environ(app):
-    return app.getComponent(IEnvironment)
-
-
 @pytest.fixture()
 def launcher(app):
     return Launcher(Config(), app)
@@ -137,7 +131,7 @@ def test_connection_made(environ, process):
         assert re.match(
             f"\\[scrapyd\\.launcher#info\\] Process started: project='p1' spider='s1' job='j1' pid={pid} "
             "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', "
-            """'FEEDS={"file://\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
+            """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
             message(captured),
         )
     else:
@@ -159,7 +153,7 @@ def test_process_ended_done(environ, process):
         assert re.match(
             f"\\[scrapyd\\.launcher#info\\] Process finished: project='p1' spider='s1' job='j1' pid={pid} "
             "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', "
-            """'FEEDS={"file://\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
+            """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
             message(captured),
        )
     else:
@@ -181,7 +175,7 @@ def test_process_ended_terminated(environ, process):
         assert re.match(
             f"\\[scrapyd\\.launcher#error\\] Process died: exitstatus=1 project='p1' spider='s1' job='j1' pid={pid} "
             "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', "
-            """'FEEDS={"file://\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
+            """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
             message(captured),
         )
     else:
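
Editor's note (not part of the patch): a standalone sketch of why the new
_get_feeds() cannot produce double slashes. posixpath.join() collapses the
trailing slash that the old "/".join() preserved, and urlsplit()._replace()
carries any query and fragment through unchanged, matching the parametrized
expectations above. The URL is made up for illustration.

    from posixpath import join as urljoin
    from urllib.parse import urlsplit

    # An items_dir whose path component ends with "/" (the buggy case).
    parsed = urlsplit("https://host.example/path/?query=value")
    path = urljoin(parsed.path, "p1", "s1", "j1.jl")  # join(), not "/".join()
    print(parsed._replace(path=path).geturl())
    # https://host.example/path/p1/s1/j1.jl?query=value
    # The old code produced https://host.example/path//p1/s1/j1.jl?query=value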
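Editor's note (not part of the patch): a sketch of how the new local_items()
helper treats a Windows drive letter as a local path. urlsplit() parses
"c:\items" as a URL with scheme "c", so the scheme is also compared against
whatever os.path.splitdrive() extracts; splitdrive() only reports a drive on
Windows, so that extra tuple member is "" on other platforms. The paths are
made up for illustration.

    import os.path
    from urllib.parse import urlsplit

    def is_local(items_dir):  # simplified variant; the patch calls it local_items()
        parsed = urlsplit(items_dir)
        drive = os.path.splitdrive(items_dir)[0].rstrip(":").lower()  # "c" on Windows, "" elsewhere
        return parsed.scheme.lower() in ("", "file", drive)

    print(is_local(r"c:\items"))                   # True on Windows (scheme "c" == drive "c")
    print(is_local("file:///var/lib/items"))       # True everywhere
    print(is_local("https://host.example/items"))  # False: treated as a remote URL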