fix: items_dir ending in slash no longer causes FEEDS to contain double slashes. items_dir starting with drive letter is no longer treated as URL.
jpmckinney committed Jul 25, 2024
1 parent bcdabe2 commit e929add
Showing 7 changed files with 115 additions and 46 deletions.
2 changes: 2 additions & 0 deletions docs/news.rst
@@ -88,6 +88,7 @@ Fixed
~~~~~

- Restore support for :ref:`eggstorage` implementations whose ``get()`` methods return file-like objects without ``name`` attributes (1.4.3 regression).
+- If the :ref:`items_dir` setting is a URL and the path component ends with ``/``, the ``FEEDS`` setting no longer contains double slashes.
- The ``MemoryJobStorage`` class returns finished jobs in reverse chronological order, like the ``SqliteJobStorage`` class.
- The ``list_projects`` method of the ``SpiderScheduler`` class returns a ``list``, instead of ``dict_keys``.
- Log errors to Scrapyd's log, even when :ref:`debug` mode is enabled.
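
For illustration: with a hypothetical ``items_dir`` of ``https://host.example/path/`` (the project/spider/job values below are taken from the tests in this commit), the key in the ``FEEDS`` setting changes as follows:

    # before: segments were joined with "/".join(), keeping the trailing slash
    {"https://host.example/path//p1/s1/j1.jl": {"format": "jsonlines"}}
    # after: segments are joined with posixpath.join, which adds separators only where needed
    {"https://host.example/path/p1/s1/j1.jl": {"format": "jsonlines"}}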
@@ -115,6 +116,7 @@ Scrapyd is now tested on macOS and Windows, in addition to Linux.

- The :ref:`cancel.json` webservice now works on Windows, by using SIGBREAK instead of SIGINT or SIGTERM.
- The :ref:`dbs_dir` setting no longer causes an error if it contains a drive letter on Windows.
+- The :ref:`items_dir` setting is considered a local path if it contains a drive letter on Windows.
- The :ref:`jobs_to_keep` setting no longer causes an error if a file to delete can't be deleted (for example, if the file is open on Windows).

Removed
29 changes: 13 additions & 16 deletions scrapyd/environ.py
@@ -1,13 +1,15 @@
import json
import os
from contextlib import suppress
-from urllib.parse import urlparse, urlunparse
+from posixpath import join as urljoin
+from urllib.parse import urlsplit

from w3lib.url import path_to_file_uri
from zope.interface import implementer

from scrapyd.exceptions import DirectoryTraversalError
from scrapyd.interfaces import IEnvironment
+from scrapyd.utils import local_items


@implementer(IEnvironment)
@@ -25,7 +27,7 @@ def get_settings(self, message):
        if self.logs_dir:
            settings["LOG_FILE"] = self._get_file(message, self.logs_dir, "log")
        if self.items_dir:
-            settings["FEEDS"] = json.dumps({self._get_feed_uri(message, "jl"): {"format": "jsonlines"}})
+            settings["FEEDS"] = json.dumps({self._get_feeds(message, "jl"): {"format": "jsonlines"}})
        return settings

    def get_environment(self, message, slot):
@@ -44,20 +46,15 @@ def get_environment(self, message, slot):

        return env

-    def _get_feed_uri(self, message, extension):
-        url = urlparse(self.items_dir)
-        if url.scheme.lower() in ["", "file"]:
-            return path_to_file_uri(self._get_file(message, url.path, extension))
-        return urlunparse(
-            (
-                url.scheme,
-                url.netloc,
-                "/".join([url.path, message["_project"], message["_spider"], f"{message['_job']}.{extension}"]),
-                url.params,
-                url.query,
-                url.fragment,
-            )
-        )
+    def _get_feeds(self, message, extension):
+        parsed = urlsplit(self.items_dir)
+
+        if local_items(self.items_dir, parsed):
+            # File URLs do not have query or fragment components. https://www.rfc-editor.org/rfc/rfc8089#section-2
+            return path_to_file_uri(self._get_file(message, parsed.path, extension))
+
+        path = urljoin(parsed.path, message["_project"], message["_spider"], f"{message['_job']}.{extension}")
+        return parsed._replace(path=path).geturl()

    def _get_file(self, message, directory, extension):
        resolvedir = os.path.realpath(directory)
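A minimal sketch of what the new ``_get_feeds`` does for a remote ``items_dir`` (the URL is hypothetical; the values mirror the tests below): ``posixpath.join`` adds separators only where needed, and the ``SplitResult._replace(...).geturl()`` round-trip preserves the query and fragment:

    from posixpath import join as urljoin
    from urllib.parse import urlsplit

    parsed = urlsplit("https://host.example/path/?query=value#fragment")
    path = urljoin(parsed.path, "p1", "s1", "j1.jl")  # '/path/p1/s1/j1.jl' (no double slash)
    parsed._replace(path=path).geturl()
    # 'https://host.example/path/p1/s1/j1.jl?query=value#fragment'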
6 changes: 6 additions & 0 deletions scrapyd/utils.py
@@ -1,3 +1,5 @@
+import os.path
+
from scrapy.utils.misc import load_object


@@ -7,6 +9,10 @@ def initialize_component(config, setting, default, *args):
    return cls(config, *args)


+def local_items(items_dir, parsed):
+    return parsed.scheme.lower() in ("", "file", os.path.splitdrive(items_dir)[0].rstrip(":").lower())


def job_log_url(job):
    return f"/logs/{job.project}/{job.spider}/{job.job}.log"

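The drive-letter term in ``local_items`` exists because ``urlsplit`` parses a Windows drive letter as if it were a URL scheme. A short demonstration (hypothetical paths; note that ``os.path.splitdrive`` only recognizes drive letters when running on Windows):

    from urllib.parse import urlsplit

    urlsplit("C:\\scrapyd\\items").scheme   # 'c' (the drive letter, mistaken for a scheme)
    urlsplit("/data/items").scheme          # ''
    urlsplit("file:///data/items").scheme   # 'file'

On Windows, ``os.path.splitdrive("C:\\scrapyd\\items")[0]`` is ``"C:"``, so the lowercased, colon-stripped drive letter ``"c"`` joins ``""`` and ``"file"`` in the tuple of accepted schemes, and the setting is treated as a local path.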
6 changes: 3 additions & 3 deletions scrapyd/website.py
@@ -1,15 +1,15 @@
import socket
from datetime import datetime, timedelta
from html import escape
-from urllib.parse import quote, urlparse
+from urllib.parse import quote, urlsplit

from scrapy.utils.misc import load_object
from twisted.application.service import IServiceCollection
from twisted.python import filepath
from twisted.web import resource, static

from scrapyd.interfaces import IEggStorage, IPoller, ISpiderScheduler
-from scrapyd.utils import job_items_url, job_log_url
+from scrapyd.utils import job_items_url, job_log_url, local_items


class PrefixHeaderMixin:
@@ -136,7 +136,7 @@ def __init__(self, config, app):
        self.debug = config.getboolean("debug", False)
        self.runner = config.get("runner", "scrapyd.runner")
        self.prefix_header = config.get("prefix_header")
-        self.local_items = items_dir and (urlparse(items_dir).scheme.lower() in ["", "file"])
+        self.local_items = items_dir and local_items(items_dir, urlsplit(items_dir))
        self.nodename = config.get("node_name", socket.gethostname())

        self.putChild(b"", Home(self, self.local_items))
6 changes: 6 additions & 0 deletions tests/conftest.py
@@ -8,6 +8,7 @@

from scrapyd import Config
from scrapyd.app import application
+from scrapyd.interfaces import IEnvironment
from scrapyd.webservice import spider_list
from scrapyd.website import Root
from tests import root_add_version
@@ -56,6 +57,11 @@ def app(config):
    return application(config)


+@pytest.fixture()
+def environ(app):
+    return app.getComponent(IEnvironment)


@pytest.fixture()
def root(config, app):
    return Root(config, app)
100 changes: 82 additions & 18 deletions tests/test_environ.py
@@ -1,4 +1,7 @@
+import json
import os
+import re
+from unittest.mock import patch

import pytest
from zope.interface.verify import verifyObject
@@ -7,40 +10,101 @@
from scrapyd.environ import Environment
from scrapyd.exceptions import DirectoryTraversalError
from scrapyd.interfaces import IEnvironment
+from tests import has_settings

-msg = {"_project": "mybot", "_spider": "myspider", "_job": "ID"}

+def test_interface(environ):
+    verifyObject(IEnvironment, environ)

-@pytest.fixture()
-def environ(tmpdir):
-    config = Config(values={"eggs_dir": tmpdir, "logs_dir": tmpdir})
-    config.cp.add_section("settings")
-    config.cp.set("settings", "newbot", "newbot.settings")
-    return Environment(config, initenv={})

+def test_get_settings(environ):
+    settings = environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"})
+
+    assert re.search(r"^\S+j1\.log$", settings["LOG_FILE"])
+
+    if environ.items_dir:
+        feeds = json.loads(settings["FEEDS"])
+        path, value = feeds.popitem()
+
+        assert feeds == {}
+        assert re.search(r"^file:///\S+j1\.jl$", path)
+        assert value == {"format": "jsonlines"}

-def test_interface(environ):
-    verifyObject(IEnvironment, environ)

+@pytest.mark.parametrize(
+    ("items_dir", "expected"),
+    [
+        (
+            "https://host.example/path?query=value#fragment",
+            "https://host.example/path/p1/s1/j1.jl?query=value#fragment",
+        ),
+        (
+            "https://host.example/path/",
+            "https://host.example/path/p1/s1/j1.jl",  # no double slashes
+        ),
+        (
+            "file:/host.example/path?ignored#ignored",
+            "file:///host.example/path/p1/s1/j1.jl",
+        ),
+        (
+            "file://hostname/host.example/path?ignored#ignored",
+            "file:///host.example/path/p1/s1/j1.jl",
+        ),
+        (
+            "file:///host.example/path?ignored#ignored",
+            "file:///host.example/path/p1/s1/j1.jl",
+        ),
+    ],
+)
+@patch("twisted.python.filepath.listdir")
+@patch("os.makedirs")
+def test_get_settings_url(listdir, makedirs, items_dir, expected):
+    config = Config(values={"logs_dir": "", "items_dir": items_dir})
+    environ = Environment(config, initenv={})

-def test_get_environment_with_eggfile(environ):
-    env = environ.get_environment(msg, 3)
+    settings = environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1"})

-    assert env["SCRAPY_PROJECT"] == "mybot"
-    assert "SCRAPY_SETTINGS_MODULE" not in env
+    assert settings == {"FEEDS": f'{{"{expected}": {{"format": "jsonlines"}}}}'}

@pytest.mark.parametrize("values", [{"items_dir": "../items"}, {"logs_dir": "../logs"}])
@pytest.mark.parametrize(("key", "value"), [("_project", "../p"), ("_spider", "../s"), ("_job", "../j")])
-def test_get_environment_secure(values, key, value):
+def test_get_settings_secure(values, key, value):
    config = Config(values=values)
    environ = Environment(config, initenv={})

    with pytest.raises(DirectoryTraversalError) as exc:
-        environ.get_settings({**msg, key: value})
+        environ.get_settings({"_project": "p1", "_spider": "s1", "_job": "j1", key: value})

    assert str(exc.value) == (
-        f"{value if key == '_project' else 'mybot'}{os.sep}"
-        f"{value if key == '_spider' else 'myspider'}{os.sep}"
-        f"{value if key == '_job' else 'ID'}.log"
+        f"{value if key == '_project' else 'p1'}{os.sep}"
+        f"{value if key == '_spider' else 's1'}{os.sep}"
+        f"{value if key == '_job' else 'j1'}.log"
    )


+@pytest.mark.parametrize(
+    ("message", "run_only_if_has_settings"),
+    [
+        ({"_project": "mybot"}, False),
+        ({"_project": "mybot", "_version": "v1"}, False),
+        ({"_project": "localproject"}, True),
+    ],
+)
+def test_get_environment(environ, message, run_only_if_has_settings):
+    if run_only_if_has_settings and not has_settings():
+        pytest.skip("[settings] section is not set")
+
+    env = environ.get_environment(message, 3)
+
+    assert env["SCRAPY_PROJECT"] == message["_project"]
+
+    if "_version" in message:
+        assert env["SCRAPYD_EGG_VERSION"] == "v1"
+    else:
+        assert "SCRAPYD_EGG_VERSION" not in env
+
+    if run_only_if_has_settings:
+        assert env["SCRAPY_SETTINGS_MODULE"] == "localproject.settings"
+    else:
+        assert "SCRAPY_SETTINGS_MODULE" not in env
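
The three ``file:`` cases in ``test_get_settings_url`` are equivalent forms permitted by RFC 8089; ``urlsplit`` splits them differently, but all three produce the same ``file:///`` feed URI because only the path component is used for local items:

    from urllib.parse import urlsplit

    urlsplit("file:/host.example/path")            # netloc '', path '/host.example/path'
    urlsplit("file://hostname/host.example/path")  # netloc 'hostname', path '/host.example/path'
    urlsplit("file:///host.example/path")          # netloc '', path '/host.example/path'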
12 changes: 3 additions & 9 deletions tests/test_launcher.py
@@ -8,7 +8,6 @@

from scrapyd import __version__
from scrapyd.config import Config
-from scrapyd.interfaces import IEnvironment
from scrapyd.launcher import Launcher, get_crawl_args
from tests import has_settings

@@ -17,11 +16,6 @@ def message(captured):
    return eventAsText(captured[0]).split(" ", 1)[1]


-@pytest.fixture()
-def environ(app):
-    return app.getComponent(IEnvironment)


@pytest.fixture()
def launcher(app):
    return Launcher(Config(), app)
@@ -137,7 +131,7 @@ def test_connection_made(environ, process):
        assert re.match(
            f"\\[scrapyd\\.launcher#info\\] Process started: project='p1' spider='s1' job='j1' pid={pid} "
            "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', "
-            """'FEEDS={"file://\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
+            """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
            message(captured),
        )
    else:
@@ -159,7 +153,7 @@ def test_process_ended_done(environ, process):
        assert re.match(
            f"\\[scrapyd\\.launcher#info\\] Process finished: project='p1' spider='s1' job='j1' pid={pid} "
            "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', "
-            """'FEEDS={"file://\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
+            """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
            message(captured),
        )
    else:
@@ -181,7 +175,7 @@ def test_process_ended_terminated(environ, process):
        assert re.match(
            f"\\[scrapyd\\.launcher#error\\] Process died: exitstatus=1 project='p1' spider='s1' job='j1' pid={pid} "
            "args=\\['\\S+', '-m', 'scrapyd\\.runner', 'crawl', 's1', '-s', 'LOG_FILE=\\S+j1\\.log', '-s', "
-            """'FEEDS={"file://\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
+            """'FEEDS={"file:///\\S+j1\\.jl": {"format": "jsonlines"}}', '-a', '_job=j1'\\]""",
            message(captured),
        )
    else:
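The only change to these regexes is ``file://`` becoming ``file:///``: local feed URIs are built with w3lib's ``path_to_file_uri``, which emits the empty-authority, three-slash form, so the tightened patterns assert it explicitly. A quick check with a hypothetical POSIX path:

    from w3lib.url import path_to_file_uri

    path_to_file_uri("/tmp/p1/s1/j1.jl")  # 'file:///tmp/p1/s1/j1.jl'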
