From 7bbe5c4101a598305e9f5e79628cb12f967b1899 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 9 Jul 2024 14:52:35 +0200
Subject: [PATCH 01/24] Future self-type annotations.

---
 src/databricks/labs/blueprint/paths.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index 8903e66..53d2327 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import abc
 import fnmatch
 import locale
@@ -244,7 +246,7 @@ def __new__(cls, ws: WorkspaceClient, path: str | Path):
         return this.__from_raw_parts(this, ws, this._flavour, drv, root, parts)
 
     @staticmethod
-    def __from_raw_parts(this, ws: WorkspaceClient, flavour: _DatabricksFlavour, drv, root, parts) -> "WorkspacePath":
+    def __from_raw_parts(this, ws: WorkspaceClient, flavour: _DatabricksFlavour, drv, root, parts) -> WorkspacePath:
         # pylint: disable=protected-access
         this._accessor = _DatabricksAccessor(ws)
         this._flavour = flavour

From 261c67c88b10f068661d31021a787b2311f3d1ff Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 9 Jul 2024 15:16:50 +0200
Subject: [PATCH 02/24] Enable type checking for the existing tests and fix problems.

---
 tests/unit/test_paths.py | 99 ++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 49 deletions(-)

diff --git a/tests/unit/test_paths.py b/tests/unit/test_paths.py
index ca9c200..e622bcd 100644
--- a/tests/unit/test_paths.py
+++ b/tests/unit/test_paths.py
@@ -13,167 +13,168 @@
 from databricks.labs.blueprint.paths import WorkspacePath
 
 
-def test_exists_when_path_exists():
+def test_exists_when_path_exists() -> None:
     ws = create_autospec(WorkspaceClient)
     workspace_path = WorkspacePath(ws, "/test/path")
     ws.workspace.get_status.return_value = True
     assert workspace_path.exists()
 
 
-def test_exists_when_path_does_not_exist():
+def test_exists_when_path_does_not_exist() -> None:
     ws = create_autospec(WorkspaceClient)
     workspace_path = WorkspacePath(ws, "/test/path")
-    ws.workspace.get_status.side_effect = NotFound(...)
+ ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") assert not workspace_path.exists() -def test_mkdir_creates_directory(): +def test_mkdir_creates_directory() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.mkdir() ws.workspace.mkdirs.assert_called_once_with("/test/path") -def test_rmdir_removes_directory(): +def test_rmdir_removes_directory() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.rmdir() ws.workspace.delete.assert_called_once_with("/test/path", recursive=False) -def test_is_dir_when_path_is_directory(): +def test_is_dir_when_path_is_directory() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.DIRECTORY) assert workspace_path.is_dir() -def test_is_dir_when_path_is_not_directory(): +def test_is_dir_when_path_is_not_directory() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.FILE) assert not workspace_path.is_dir() -def test_is_file_when_path_is_file(): +def test_is_file_when_path_is_file() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.FILE) assert workspace_path.is_file() -def test_is_file_when_path_is_not_file(): +def test_is_file_when_path_is_not_file() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.DIRECTORY) assert not workspace_path.is_file() -def test_is_notebook_when_path_is_notebook(): +def test_is_notebook_when_path_is_notebook() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.NOTEBOOK) assert workspace_path.is_notebook() -def test_is_notebook_when_path_is_not_notebook(): +def test_is_notebook_when_path_is_not_notebook() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.FILE) assert not workspace_path.is_notebook() -def test_open_file_in_read_binary_mode(): +def test_open_file_in_read_binary_mode() -> None: ws = create_autospec(WorkspaceClient) ws.workspace.download.return_value.__enter__.return_value.read.return_value = b"test" workspace_path = WorkspacePath(ws, "/test/path") assert workspace_path.read_bytes() == b"test" -def test_open_file_in_write_binary_mode(): +def test_open_file_in_write_binary_mode() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.write_bytes(b"test") ws.workspace.upload.assert_called_with("/test/path", b"test", format=ImportFormat.AUTO) -def test_open_file_in_read_text_mode(): +def test_open_file_in_read_text_mode() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.download.return_value.__enter__.return_value.read.return_value = b"test" assert workspace_path.read_text() == "test" -def test_open_file_in_write_text_mode(): +def test_open_file_in_write_text_mode() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") 
workspace_path.write_text("test") ws.workspace.upload.assert_called_with("/test/path", "test", format=ImportFormat.AUTO) -def test_open_file_in_invalid_mode(): +def test_open_file_in_invalid_mode() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") with pytest.raises(ValueError): workspace_path.open(mode="invalid") -def test_suffix_when_file_has_extension(): +def test_suffix_when_file_has_extension() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path.py") assert workspace_path.suffix == ".py" -def test_suffix_when_file_is_notebook_and_language_matches(): +def test_suffix_when_file_is_notebook_and_language_matches() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info = ObjectInfo(language=Language.PYTHON, object_type=ObjectType.NOTEBOOK) assert workspace_path.suffix == ".py" -def test_suffix_when_file_is_notebook_and_language_does_not_match(): +def test_suffix_when_file_is_notebook_and_language_does_not_match() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") - workspace_path._object_info.language = "unknown" + workspace_path._object_info.language = None assert workspace_path.suffix == "" -def test_suffix_when_file_is_not_notebook(): +def test_suffix_when_file_is_not_notebook() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") - workspace_path.is_notebook = lambda: False - assert workspace_path.suffix == "" + with patch("databricks.labs.blueprint.paths.WorkspacePath.is_notebook") as mock_is_notebook: + mock_is_notebook.return_value = False + assert workspace_path.suffix == "" -def test_mkdir_creates_directory_with_valid_mode(): +def test_mkdir_creates_directory_with_valid_mode() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.mkdir(mode=0o600) ws.workspace.mkdirs.assert_called_once_with("/test/path") -def test_mkdir_raises_error_with_invalid_mode(): +def test_mkdir_raises_error_with_invalid_mode() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") with pytest.raises(ValueError): workspace_path.mkdir(mode=0o700) -def test_rmdir_removes_directory_non_recursive(): +def test_rmdir_removes_directory_non_recursive() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.rmdir() ws.workspace.delete.assert_called_once_with("/test/path", recursive=False) -def test_rmdir_removes_directory_recursive(): +def test_rmdir_removes_directory_recursive() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.rmdir(recursive=True) ws.workspace.delete.assert_called_once_with("/test/path", recursive=True) -def test_rename_file_without_overwrite(): +def test_rename_file_without_overwrite() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.download.return_value.__enter__.return_value.read.return_value = b"test" @@ -181,7 +182,7 @@ def test_rename_file_without_overwrite(): ws.workspace.upload.assert_called_once_with("/new/path", b"test", format=ImportFormat.AUTO, overwrite=False) -def test_rename_file_with_overwrite(): +def test_rename_file_with_overwrite() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") 
ws.workspace.download.return_value.__enter__.return_value.read.return_value = b"test" @@ -189,7 +190,7 @@ def test_rename_file_with_overwrite(): ws.workspace.upload.assert_called_once_with("/new/path", b"test", format=ImportFormat.AUTO, overwrite=True) -def test_unlink_existing_file(): +def test_unlink_existing_file() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = True @@ -197,22 +198,22 @@ def test_unlink_existing_file(): ws.workspace.delete.assert_called_once_with("/test/path") -def test_unlink_non_existing_file(): +def test_unlink_non_existing_file() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") - ws.workspace.get_status.side_effect = NotFound(...) + ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") with pytest.raises(FileNotFoundError): workspace_path.unlink() -def test_relative_to(): +def test_relative_to() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path/subpath") result = workspace_path.relative_to("/test/path") assert str(result) == "subpath" -def test_as_fuse_in_databricks_runtime(): +def test_as_fuse_in_databricks_runtime() -> None: with patch.dict("os.environ", {"DATABRICKS_RUNTIME_VERSION": "14.3"}): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") @@ -220,7 +221,7 @@ def test_as_fuse_in_databricks_runtime(): assert str(result) == "/Workspace/test/path" -def test_as_fuse_outside_databricks_runtime(): +def test_as_fuse_outside_databricks_runtime() -> None: with patch.dict("os.environ", {}, clear=True): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") @@ -228,7 +229,7 @@ def test_as_fuse_outside_databricks_runtime(): assert str(result) == "/Workspace/test/path" -def test_home_directory(): +def test_home_directory() -> None: ws = create_autospec(WorkspaceClient) ws.current_user.me.return_value.user_name = "test_user" workspace_path = WorkspacePath(ws, "/test/path") @@ -236,70 +237,70 @@ def test_home_directory(): assert str(result) == "/Users/test_user" -def test_is_dir_when_object_type_is_directory(): +def test_is_dir_when_object_type_is_directory() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.DIRECTORY assert workspace_path.is_dir() is True -def test_is_dir_when_object_type_is_not_directory(): +def test_is_dir_when_object_type_is_not_directory() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.FILE assert workspace_path.is_dir() is False -def test_is_dir_when_databricks_error_occurs(): +def test_is_dir_when_databricks_error_occurs() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") - ws.workspace.get_status.side_effect = NotFound(...) 
+ ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") assert workspace_path.is_dir() is False -def test_is_file_when_object_type_is_file(): +def test_is_file_when_object_type_is_file() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.FILE assert workspace_path.is_file() is True -def test_is_file_when_object_type_is_not_file(): +def test_is_file_when_object_type_is_not_file() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.DIRECTORY assert workspace_path.is_file() is False -def test_is_file_when_databricks_error_occurs(): +def test_is_file_when_databricks_error_occurs() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") - ws.workspace.get_status.side_effect = NotFound(...) + ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") assert workspace_path.is_file() is False -def test_is_notebook_when_object_type_is_notebook(): +def test_is_notebook_when_object_type_is_notebook() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.NOTEBOOK assert workspace_path.is_notebook() is True -def test_is_notebook_when_object_type_is_not_notebook(): +def test_is_notebook_when_object_type_is_not_notebook() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.FILE assert workspace_path.is_notebook() is False -def test_is_notebook_when_databricks_error_occurs(): +def test_is_notebook_when_databricks_error_occurs() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") - ws.workspace.get_status.side_effect = NotFound(...) + ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") assert workspace_path.is_notebook() is False -def test_globbing_when_nested_json_files_exist(): +def test_globbing_when_nested_json_files_exist() -> None: ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.DIRECTORY) From 25c50c4cb49cda3276004a61220c611a1d1edf5d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 9 Jul 2024 17:55:29 +0200 Subject: [PATCH 03/24] Update internals to (mostly) work across Python 3.10-3.12, and test more functionality. Work-in-progress: not yet complete. 
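
For orientation, a rough sketch of the behaviour the reworked class is meant to
provide (illustrative only; the authoritative checks are the unit tests below,
and the user path used here is a made-up placeholder):

    from databricks.sdk import WorkspaceClient
    from databricks.labs.blueprint.paths import WorkspacePath

    ws = WorkspaceClient()
    nb = WorkspacePath(ws, "/Users/someone@example.com").joinpath("notebook.py")
    nb.name                      # 'notebook.py'
    nb.suffix                    # '.py'
    nb.parent                    # WorkspacePath('/Users/someone@example.com')
    nb.is_relative_to("/Users")  # True
    nb.write_text("print(1)")    # uploads via ws.workspace.upload(...)
    nb.read_text()               # downloads via ws.workspace.download(...)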
--- src/databricks/labs/blueprint/paths.py | 463 ++++++++++++++++++++----- tests/unit/test_paths.py | 382 +++++++++++++++++++- 2 files changed, 741 insertions(+), 104 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 53d2327..f8dd4ba 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -5,13 +5,12 @@ import locale import logging import os -import pathlib import posixpath import re import sys -from functools import cached_property from io import BytesIO, StringIO -from pathlib import Path +from pathlib import Path, PurePath +from typing import NoReturn from urllib.parse import quote_from_bytes as urlquote_from_bytes from databricks.sdk import WorkspaceClient @@ -217,11 +216,51 @@ def __init__(self, ws: WorkspaceClient, path: str): class WorkspacePath(Path): """Experimental implementation of pathlib.Path for Databricks Workspace.""" - _SUFFIXES = {".py": Language.PYTHON, ".sql": Language.SQL, ".scala": Language.SCALA, ".R": Language.R} - - _ws: WorkspaceClient - _flavour: _DatabricksFlavour - _accessor: _DatabricksAccessor + # Implementation notes: + # - The builtin Path classes are not designed for extension, which in turn makes everything a little cumbersome. + # - The internals of the builtin pathlib have changed dramatically across supported Python versions (3.10-3.12 at + # the time of writing) so relying on those details is brittle. (Python 3.13 also includes a significant + # refactoring.) + # - Until 3.11 the implementation was decomposed and delegated to two internal interfaces: + # 1. Flavour (scope=class) which encapsulates the path style and manipulation. + # 2. Accessor (scope=instance) to which I/O-related calls are delegated. + # These interfaces are internal/protected. + # - Since 3.12 the implementation of these interfaces have been removed: + # 1. Flavour has been replaced with posixpath and ntpath (normally imported as os.path). Still class-scoped. + # 2. Accessor has been replaced with inline implementations based directly on the 'os' module. + # + # This implementation for Workspace paths does the following: + # 1. Flavour is basically posix-style, with the caveat that we don't bother with the special //-prefix handling. + # 2. The Accessor is delegated to existing routines available via the workspace client. + # 3. Python 3.12 introduces some new API elements. Because these are sourec-compatible with earlier versions + # these are forward-ported and implemented. + # + __slots__ = ( + # For us this is always the empty string. Consistent with the superclass attribute for Python 3.10-3.13b. + "_drv", + # The (normalized) root property for the path. Consistent with the superclass attribute for Python 3.10-3.13b. + "_root", + # The (normalized) path components (relative to the root) for the path. + # - For python <=3.11 this supersedes _parts + # - For python 3.12+ this supersedes _raw_paths + "_path_parts", + # The cached str() value of the instance. Consistent with the superclass attribute for Python 3.10-3.13b. + "_str", + # The cached hash() value for the instance. Consistent with the superclass attribute for Python 3.10-3.13b. + "_hash", + # The workspace client that we use to perform I/O operations on the path. + "_ws", + # The cached _object_info value for the instance. + "_cached_object_info", + ) + + _SUFFIXES = {Language.PYTHON: ".py", Language.SQL: ".sql", Language.SCALA: ".scala", Language.R: ".R"} + + # Path semantics are posix-like. 
+ parser = posixpath + + # Compatibility attribute, for when superclass implementations get invoked on python <= 3.11. + _flavour = object() cwd = _na("cwd") stat = _na("stat") @@ -237,66 +276,320 @@ class WorkspacePath(Path): link_to = _na("link_to") samefile = _na("samefile") - def __new__(cls, ws: WorkspaceClient, path: str | Path): - this = object.__new__(cls) - # pathlib does a lot of clever performance tricks, and it's not designed to be subclassed, - # so we need to set the attributes directly, bypassing the most of a common sense. - this._flavour = _DatabricksFlavour(ws) - drv, root, parts = this._parse_args([path]) - return this.__from_raw_parts(this, ws, this._flavour, drv, root, parts) + def __new__(cls, *args, **kwargs) -> WorkspacePath: + # Force all initialisation to go via __init__() irrespective of the (Python-specific) base version. + return object.__new__(cls) - @staticmethod - def __from_raw_parts(this, ws: WorkspaceClient, flavour: _DatabricksFlavour, drv, root, parts) -> WorkspacePath: - # pylint: disable=protected-access - this._accessor = _DatabricksAccessor(ws) - this._flavour = flavour - this._drv = drv - this._root = root - this._parts = parts - this._ws = ws - return this - - def _make_child_relpath(self, part): - # used in dir walking - path = self._flavour.join(self._parts + [part]) - # self._flavour.join duplicates leading '/' (possibly a python bug) - # but we can't override join in _DatabricksFlavour because it's built-in - # and if we remove the leading '/' part then we don't get any - # so let's just do a slow but safe sanity check afterward - if os.sep == path[0] == path[1]: - path = path[1:] - return WorkspacePath(self._ws, path) - - def _parse_args(self, args): # pylint: disable=arguments-differ - # instance method adapted from pathlib.Path - parts = [] - for a in args: - if isinstance(a, pathlib.PurePath): - parts += a._parts # pylint: disable=protected-access - continue - parts.append(str(a)) - return self._flavour.parse_parts(parts) + def __init__(self, ws: WorkspaceClient, *args) -> None: + raw_paths: list[str] = [] + for arg in args: + if isinstance(arg, PurePath): + raw_paths.extend(arg.parts) + else: + try: + path = os.fspath(arg) + except TypeError: + path = arg + if not isinstance(path, str): + msg = ( + f"argument should be a str or an os.PathLib object where __fspath__ returns a str, " + f"not {type(path).__name__!r}" + ) + raise TypeError(msg) + raw_paths.append(path) + + # Normalise the paths that we have. + root, path_parts = self._parse_and_normalize(raw_paths) + + self._drv = "" + self._root = root + self._path_parts = path_parts + self._ws = ws - def _format_parsed_parts(self, drv, root, parts): # pylint: disable=arguments-differ - # instance method adapted from pathlib.Path - if drv or root: - return drv + root + self._flavour.join(parts[1:]) - return self._flavour.join(parts) + @classmethod + def _parse_and_normalize(cls, parts: list[str]) -> tuple[str, tuple[str, ...]]: + """Parse and normalize a list of path components. + + Args: + parts: a list of path components to parse and normalize. + Returns: + A tuple containing: + - The normalized drive (always '') + - The normalized root for this path, or '' if there isn't any. + - The normalized path components, if any, (relative) to the root. + """ + match parts: + case []: + path = "" + case [part]: + path = part + case [*parts]: + path = cls.parser.join(*parts) + if path: + root, rel = cls._splitroot(path, sep=cls.parser.sep) + # No need to split drv because we don't support it. 
+ parsed = tuple(str(x) for x in rel.split(cls.parser.sep) if x and x != ".") + else: + root, parsed = "", () + return root, parsed + + @classmethod + def _splitroot(cls, part: str, sep: str) -> tuple[str, str]: + # Based on the upstream implementation, with the '//'-specific bit elided because we don't need to + # bother with Posix semantics. + if part and part[0] == sep: + root, path = sep, part.lstrip(sep) + else: + root, path = "", part + return root, path + + def __reduce__(self) -> NoReturn: + # Cannot support pickling because we can't pickle the workspace client. + msg = "Pickling Workspace paths is not supported." + raise NotImplementedError(msg) - def _from_parsed_parts(self, drv, root, parts): # pylint: disable=arguments-differ - # instance method adapted from pathlib.Path - this = object.__new__(self.__class__) - return self.__from_raw_parts(this, self._ws, self._flavour, drv, root, parts) + def __fspath__(self): + # Cannot support this: Workspace objects aren't accessible via the filesystem. + msg = f"Workspace paths are not path-like: {self}" + raise NotImplementedError(msg) - def _from_parts(self, args): # pylint: disable=arguments-differ - # instance method adapted from pathlib.Path - drv, root, parts = self._parse_args(args) - return self._from_parsed_parts(drv, root, parts) + def as_posix(self): + return str(self) + + def __str__(self): + try: + return self._str + except AttributeError: + self._str = (self._root + self.parser.sep.join(self._path_parts)) or "." + return self._str + + def __bytes__(self): + # Super implementations are fine. + return super(self).__bytes__() + + def __repr__(self): + return f"{self.__class__.__name__}({str(self)!r})" + + def as_uri( + self, + ) -> str: + return self._ws.config.host + "#workspace" + urlquote_from_bytes(bytes(self)) + + def __eq__(self, other): + if not isinstance(other, type(self)): + return NotImplemented + return str(self) == str(other) + + def __hash__(self): + try: + return self._hash + except AttributeError: + self._hash = hash(str(self)) + return self._hash - def relative_to(self, *other) -> pathlib.PurePath: # type: ignore - """Databricks Workspace works only with absolute paths, so we make sure to - return pathlib.Path instead of WorkspacePath to avoid confusion.""" - return pathlib.PurePath(super().relative_to(*other)) + def _parts(self) -> tuple[str, ...]: + """Return a tuple that has the same natural ordering as paths of this type.""" + return self._root, *self._path_parts + + @property + def _cparts(self): + # Compatibility property (python <= 3.11), accessed via reverse equality comparison. This can't be avoided. + return self._parts() + + @property + def _str_normcase(self): + # Compatibility property (python 3.12+), accessed via equality comparison. This can't be avoided. 
+ return str(self) + + def __lt__(self, other): + if not isinstance(other, type(self)): + return NotImplemented + return self._path_parts < other._path_parts + + def __le__(self, other): + if not isinstance(other, type(self)): + return NotImplemented + return self._path_parts <= other._path_parts + + def __gt__(self, other): + if not isinstance(other, type(self)): + return NotImplemented + return self._path_parts > other._path_parts + + def __ge__(self, other): + if not isinstance(other, type(self)): + return NotImplemented + return self._path_parts >= other._path_parts + + def with_segments(self, *pathsegments): + return type(self)(self._ws, *pathsegments) + + @property + def drive(self) -> str: + return self._drv + + @property + def root(self): + return self._root + + @property + def anchor(self): + return self.drive + self.root + + @property + def name(self): + path_parts = self._path_parts + return path_parts[-1] if path_parts else "" + + @property + def parts(self): + if self.drive or self.root: + parts = (self.drive + self.root, *self._path_parts) + else: + parts = self._path_parts + return parts + + @property + def suffix(self): + # Super implementations are mostly fine... + suffix = super().suffix + # ...but if there is no suffix and this path is for a notebook then infer the extension based on the notebook + # language. + if not suffix and self.is_notebook(): + try: + suffix = self._SUFFIXES.get(self._object_info.language, "") + except DatabricksError: + pass + return suffix + + @property + def suffixes(self): + # Super implementations are all fine. + # TODO: Make this consistent with .suffix for notebooks. + return super().suffixes + + @property + def stem(self): + # Super implementations are all fine. + return super().stem + + def with_name(self, name): + parser = self.parser + if not name or parser.sep in name or name == ".": + msg = f"Invalid name: {name!r}" + raise ValueError(msg) + path_parts = list(self._path_parts) + if not path_parts: + raise ValueError(f"{self!r} has an empty name") + path_parts[-1] = name + return type(self)(self._ws, self.anchor, *path_parts) + + def with_stem(self, stem): + # Super implementations are all fine. + return super().with_stem(stem) + + def with_suffix(self, suffix): + stem = self.stem + if not stem: + msg = f"{self!r} has an empty name" + raise ValueError(msg) + elif suffix and not suffix.startswith("."): + msg = f"{self!r} invalid suffix: {suffix}" + raise ValueError(msg) + else: + return self.with_name(stem + suffix) + + @property + def _stack(self): + return self.anchor, list(reversed(self._path_parts)) + + def relative_to(self, other, *more_other, walk_up=False): + other = self.with_segments(other, *more_other) + anchor0, parts0 = self._stack + anchor1, parts1 = other._stack + if anchor0 != anchor1: + msg = f"{str(self)!r} and {str(other)!r} have different anchors" + raise ValueError(msg) + while parts0 and parts1 and parts0[-1] == parts1[-1]: + parts0.pop() + parts1.pop() + for part in parts1: + if not walk_up: + msg = f"{str(self)!r} is not in the subpath of {str(other)!r}" + raise ValueError(msg) + elif part == "..": + raise ValueError(f"'..' 
segment in {str(other)!r} cannot be walked") + else: + parts0.append("..") + return self.with_segments("", *reversed(parts0)) + + def is_relative_to(self, other, *more_other): + other = self.with_segments(other, *more_other) + if self.anchor != other.anchor: + return False + parts0 = list(reversed(self._path_parts)) + parts1 = list(reversed(other._path_parts)) + while parts0 and parts1 and parts0[-1] == parts1[-1]: + parts0.pop() + parts1.pop() + for part in parts1: + if part and part != ".": + return False + return True + + @property + def parent(self): + rel_path = self._path_parts + return self.with_segments(self.anchor, *rel_path[:-1]) if rel_path else self + + @property + def parents(self): + parents = [] + path = self + parent = path.parent + while path != parent: + parents.append(parent) + path = parent + parent = path.parent + return tuple(parents) + + def is_absolute(self): + return bool(self.anchor) + + def is_reserved(self): + return False + + def joinpath(self, *pathsegments): + return self.with_segments(self, *pathsegments) + + @classmethod + def _compile_pattern(cls, pattern: str, case_sensitive: bool) -> re.Pattern: + flags = 0 if case_sensitive else re.IGNORECASE + regex = fnmatch.translate(pattern) + return re.compile(regex, flags=flags) + + def match(self, path_pattern, *, case_sensitive=None): + # Convert the pattern to a fake path (with globs) to help with matching parts. + if not isinstance(path_pattern, PurePath): + path_pattern = self.with_segments(path_pattern) + # Default to false if not specified. + if case_sensitive is None: + case_sensitive = True + # Reverse the parts. + path_parts = self.parts + pattern_parts = path_pattern.parts + # Error + if not pattern_parts: + raise ValueError("empty pattern") + # Impossible matches. + if len(path_parts) < len(pattern_parts) or len(path_parts) > len(pattern_parts) and path_pattern.anchor: + return False + # Check each part. + for path_part, pattern_part in zip(reversed(path_parts), reversed(pattern_parts)): + pattern = self._compile_pattern(pattern_part, case_sensitive=case_sensitive) + if not pattern.match(path_part): + return False + return True def as_fuse(self): """Return FUSE-mounted path in Databricks Runtime.""" @@ -365,33 +658,17 @@ def open(self, mode="r", buffering=-1, encoding=None, errors=None, newline=None) raise ValueError(f"invalid mode: {mode}") @property - def suffix(self): - """Return the file extension. If the file is a notebook, return the suffix based on the language.""" - suffix = super().suffix - if suffix: - return suffix - if not self.is_notebook(): - return "" - for sfx, lang in self._SUFFIXES.items(): - try: - if self._object_info.language == lang: - return sfx - except DatabricksError: - return "" - return "" - - def __lt__(self, other: pathlib.PurePath): - if not isinstance(other, pathlib.PurePath): - return NotImplemented - return self.as_posix() < other.as_posix() - - @cached_property def _object_info(self) -> ObjectInfo: # this method is cached because it is used in multiple is_* methods. # DO NOT use this method in methods, where fresh result is required. 
- return self._ws.workspace.get_status(self.as_posix()) + try: + return self._cached_object_info + except AttributeError: + self._cached_object_info = self._ws.workspace.get_status(self.as_posix()) + return self._object_info - def _return_false(self) -> bool: + @staticmethod + def _return_false() -> bool: return False is_symlink = _return_false @@ -427,17 +704,17 @@ def _scandir(self): def expanduser(self): # Expand ~ (but NOT ~user) constructs. - if not (self._drv or self._root) and self._parts and self._parts[0][:1] == "~": - if self._parts[0] == "~": + if not (self._drv or self._root) and self._path_parts and self._path_parts[0][:1] == "~": + if self._path_parts[0] == "~": user_name = self._ws.current_user.me().user_name else: - other_user = self._parts[0][1:] + other_user = self._path_parts[0][1:] msg = f"Cannot determine home directory for: {other_user}" raise RuntimeError(msg) if user_name is None: raise RuntimeError("Could not determine home directory.") homedir = f"/Users/{user_name}" - return self._from_parts([homedir, *self._parts[1:]]) + return self.with_segments(homedir, *self._path_parts[1:]) return self def is_notebook(self): @@ -446,9 +723,3 @@ def is_notebook(self): return self._object_info.object_type == ObjectType.NOTEBOOK except DatabricksError: return False - - def __eq__(self, other): - return isinstance(other, Path) and self.as_posix() == other.as_posix() - - def __hash__(self): - return Path.__hash__(self) diff --git a/tests/unit/test_paths.py b/tests/unit/test_paths.py index e622bcd..5b9eda2 100644 --- a/tests/unit/test_paths.py +++ b/tests/unit/test_paths.py @@ -1,3 +1,5 @@ +import os +from pathlib import Path, PurePath, PurePosixPath, PureWindowsPath from unittest.mock import create_autospec, patch import pytest @@ -13,6 +15,376 @@ from databricks.labs.blueprint.paths import WorkspacePath +def test_empty_init() -> None: + """Ensure that basic initialization works.""" + ws = create_autospec(WorkspaceClient) + + # Simple initialisation; the empty path is valid. + WorkspacePath(ws) + + +@pytest.mark.parametrize( + ("args", "expected"), + [ + # Some absolute paths. + (["/foo/bar"], ("/", ("/", "foo", "bar"))), + (["/", "foo", "bar"], ("/", ("/", "foo", "bar"))), + (["/", "foo/bar"], ("/", ("/", "foo", "bar"))), + (["/", "foo", "bar/baz"], ("/", ("/", "foo", "bar", "baz"))), + # Some relative paths. + (["foo/bar"], ("", ("foo", "bar"))), + (["foo", "bar"], ("", ("foo", "bar"))), + (["foo", "/bar"], ("/", ("/", "bar"))), + # Some paths with mixed components of Path types. + (["/", "foo", PurePosixPath("bar/baz")], ("/", ("/", "foo", "bar", "baz"))), + (["/", "foo", PureWindowsPath("config.sys")], ("/", ("/", "foo", "config.sys"))), + (["/", "foo", PurePath("bar")], ("/", ("/", "foo", "bar"))), + # Some corner cases: empty, root and trailing '/'. + ([], ("", ())), + (["/"], ("/", ("/",))), + (["/foo/"], ("/", ("/", "foo"))), + # Intermediate '.' are supposed to be dropped during normalization. + (["/foo/./bar"], ("/", ("/", "foo", "bar"))), + ], + ids=lambda param: f"WorkspacePath({param!r})" if isinstance(param, list) else repr(param), +) +def test_init(args: list[str | PurePath], expected: tuple[str, list[str]]) -> None: + """Ensure that initialization with various combinations of segments works as expected.""" + ws = create_autospec(WorkspaceClient) + + # Run the test. + p = WorkspacePath(ws, Path(*args)) + + # Validate the initialisation results. 
+ assert (p.drive, p.root, p.parts) == ("", *expected) + + +def test_init_error() -> None: + """Ensure that we detect initialisation with non-string or path-like path components.""" + ws = create_autospec(WorkspaceClient) + + expected_msg = "argument should be a str or an os.PathLib object where __fspath__ returns a str, not 'int'" + with pytest.raises(TypeError, match=expected_msg): + WorkspacePath(ws, 12) # type: ignore[arg-type] + + +def test_equality() -> None: + """Test that Workspace paths can be compared with each other for equality.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/foo/bar") == WorkspacePath(ws, Path("/", "foo", "bar")) + assert WorkspacePath(ws, "foo/bar") == WorkspacePath(ws, Path("foo", "bar")) + assert WorkspacePath(ws, "/foo/bar") != WorkspacePath(ws, Path("foo", "bar")) + + assert WorkspacePath(ws, "/foo/bar") != Path("foo", "bar") + assert Path("/foo/bar") != WorkspacePath(ws, "/foo/bar") + + +def test_hash() -> None: + """Test that equal Workspace paths have the same hash value.""" + ws = create_autospec(WorkspaceClient) + + assert hash(WorkspacePath(ws, "/foo/bar")) == hash(WorkspacePath(ws, Path("/", "foo", "bar"))) + assert hash(WorkspacePath(ws, "foo/bar")) == hash(WorkspacePath(ws, Path("foo", "bar"))) + + +@pytest.mark.parametrize( + "increasing_paths", + [ + ("/foo", "/foo/bar", "/foo/baz"), + ("foo", "foo/bar", "foo/baz"), + ], +) +def test_comparison(increasing_paths: tuple[str | list[str], str | list[str], str | list[str]]) -> None: + """Test that comparing paths works as expected.""" + ws = create_autospec(WorkspaceClient) + + p1, p2, p3 = (WorkspacePath(ws, p) for p in increasing_paths) + assert p1 < p2 < p3 + assert p1 <= p2 <= p2 <= p3 + assert p3 > p2 > p1 + assert p3 >= p2 >= p2 > p1 + + +def test_comparison_errors() -> None: + """Test that comparing Workspace paths with other types yields the error we expect, irrespective of comparison order.""" + ws = create_autospec(WorkspaceClient) + + with pytest.raises(TypeError, match="'<' not supported between instances"): + _ = WorkspacePath(ws, "foo") < PurePosixPath("foo") + with pytest.raises(TypeError, match="'>' not supported between instances"): + _ = WorkspacePath(ws, "foo") > PurePosixPath("foo") + with pytest.raises(TypeError, match="'<=' not supported between instances"): + _ = WorkspacePath(ws, "foo") <= PurePosixPath("foo") + with pytest.raises(TypeError, match="'>=' not supported between instances"): + _ = WorkspacePath(ws, "foo") >= PurePosixPath("foo") + with pytest.raises(TypeError, match="'<' not supported between instances"): + _ = PurePosixPath("foo") < WorkspacePath(ws, "foo") + with pytest.raises(TypeError, match="'>' not supported between instances"): + _ = PurePosixPath("foo") > WorkspacePath(ws, "foo") + with pytest.raises(TypeError, match="'<=' not supported between instances"): + _ = PurePosixPath("foo") <= WorkspacePath(ws, "foo") + with pytest.raises(TypeError, match="'>=' not supported between instances"): + _ = PurePosixPath("foo") >= WorkspacePath(ws, "foo") + + +def test_drive() -> None: + """Test that the drive is empty for our paths.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/foo").drive == "" + assert WorkspacePath(ws, "foo").root == "" + + +def test_root() -> None: + """Test that absolute paths have the '/' root and relative paths do not.""" + # More comprehensive tests are part of test_init() + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/foo").root == "/" + assert WorkspacePath(ws, 
"foo").root == "" + + +def test_anchor() -> None: + """Test that the anchor for absolute paths is '/' and empty for relative paths.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/foo").anchor == "/" + assert WorkspacePath(ws, "foo").anchor == "" + + +def test_pathlike_error() -> None: + """Paths are "path-like" but Workspace ones aren't, so verify that triggers an error.""" + ws = create_autospec(WorkspaceClient) + p = WorkspacePath(ws, "/some/path") + + with pytest.raises(NotImplementedError, match="Workspace paths are not path-like"): + _ = os.fspath(p) + + +def test_name() -> None: + """Test that the last part of the path is properly noted as the name.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/foo/bar").name == "bar" + assert WorkspacePath(ws, "/foo/").name == "foo" + assert WorkspacePath(ws, "/").name == "" + assert WorkspacePath(ws, Path()).name == "" + + +def test_parts() -> None: + """Test that parts returns the anchor and path components.""" + # More comprehensive tests are part of test_init() + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/foo/bar").parts == ("/", "foo", "bar") + assert WorkspacePath(ws, "/foo/").parts == ("/", "foo") + assert WorkspacePath(ws, "/").parts == ("/",) + assert WorkspacePath(ws, "foo/bar").parts == ("foo", "bar") + assert WorkspacePath(ws, "foo/").parts == ("foo",) + + +def test_suffix() -> None: + """Test that the suffix is correctly extracted.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/path/to/distribution.tar.gz").suffix == ".gz" + assert WorkspacePath(ws, "/no/suffix/here").suffix == "" + + +def test_suffixes() -> None: + """Test that multiple suffixes are correctly extracted.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/path/to/distribution.tar.gz").suffixes == [".tar", ".gz"] + assert WorkspacePath(ws, "/path/to/file.txt").suffixes == [".txt"] + assert WorkspacePath(ws, "/no/suffix/here").suffixes == [] + + +def test_stem() -> None: + """Test that the stem is correctly extracted.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/path/to/distribution.tar.gz").stem == "distribution.tar" + assert WorkspacePath(ws, "/path/to/file.txt").stem == "file" + assert WorkspacePath(ws, "/no/suffix/here").stem == "here" + + +def test_with_name() -> None: + """Test that the name in a path can be replaced.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/path/to/notebook.py").with_name("requirements.txt") == WorkspacePath( + ws, "/path/to/requirements.txt" + ) + assert WorkspacePath(ws, "relative/notebook.py").with_name("requirements.txt") == WorkspacePath( + ws, "relative/requirements.txt" + ) + + +@pytest.mark.parametrize( + ("path", "name"), + [ + # Invalid names. 
+ ("/a/path", "invalid/replacement"), + ("/a/path", ""), + ("/a/path", "."), + # Invalid paths for using with_name() + ("/", "file.txt"), + ("", "file.txt"), + ], +) +def test_with_name_errors(path, name) -> None: + """Test that various forms of invalid .with_name() invocations are detected.""" + ws = create_autospec(WorkspaceClient) + + with pytest.raises(ValueError): + _ = WorkspacePath(ws, path).with_name(name) + + +def test_with_stem() -> None: + """Test that the stem in a path can be replaced.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/dir/file.txt").with_stem("README") == WorkspacePath(ws, "/dir/README.txt") + + +def test_with_suffix() -> None: + """Test that the suffix of a path can be replaced, and that some errors are handled.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/dir/README.txt").with_suffix(".md") == WorkspacePath(ws, "/dir/README.md") + with pytest.raises(ValueError, match="[Ii]nvalid suffix"): + _ = WorkspacePath(ws, "/dir/README.txt").with_suffix("txt") + with pytest.raises(ValueError, match="empty name"): + _ = WorkspacePath(ws, "/").with_suffix(".txt") + + +def test_relative_to() -> None: + """Test that it is possible to get the relative path between two paths.""" + ws = create_autospec(WorkspaceClient) + + # Basics. + assert WorkspacePath(ws, "/home/bob").relative_to("/") == WorkspacePath(ws, "home/bob") + assert WorkspacePath(ws, "/home/bob").relative_to("/home") == WorkspacePath(ws, "bob") + assert WorkspacePath(ws, "/home/bob").relative_to("/./home") == WorkspacePath(ws, "bob") + assert WorkspacePath(ws, "foo/bar/baz").relative_to("foo") == WorkspacePath(ws, "bar/baz") + assert WorkspacePath(ws, "foo/bar/baz").relative_to("foo/bar") == WorkspacePath(ws, "baz") + assert WorkspacePath(ws, "foo/bar/baz").relative_to("foo/./bar") == WorkspacePath(ws, "baz") + + # Walk-up (3.12+) behaviour. + assert WorkspacePath(ws, "/home/bob").relative_to("/usr", walk_up=True) == WorkspacePath(ws, "../home/bob") + + # Check some errors. + with pytest.raises(ValueError, match="different anchors"): + _ = WorkspacePath(ws, "/home/bob").relative_to("home") + with pytest.raises(ValueError, match="not in the subpath"): + _ = WorkspacePath(ws, "/home/bob").relative_to("/usr") + with pytest.raises(ValueError, match="cannot be walked"): + _ = WorkspacePath(ws, "/home/bob").relative_to("/home/../usr", walk_up=True) + + +@pytest.mark.parametrize( + ("path", "parent"), + [ + ("/foo/bar/baz", "/foo/bar"), + ("/", "/"), + (".", "."), + ("foo/bar", "foo"), + ("foo", "."), + ], +) +def test_parent(path, parent) -> None: + """Test that the parent of a path is properly calculated.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, path).parent == WorkspacePath(ws, parent) + + +@pytest.mark.parametrize( + ("path", "parents"), + [ + ("/foo/bar/baz", ("/foo/bar", "/foo", "/")), + ("/", ()), + (".", ()), + ("foo/bar", ("foo", ".")), + ("foo", (".",)), + ], +) +def test_parents(path, parents) -> None: + """Test that each of the parents of a path is returned.""" + ws = create_autospec(WorkspaceClient) + + expected_parents = tuple(WorkspacePath(ws, parent) for parent in parents) + assert tuple(WorkspacePath(ws, path).parents) == expected_parents + + +def test_is_relative_to() -> None: + """Test detection of whether a path is relative to the target.""" + ws = create_autospec(WorkspaceClient) + + # Basics where it's true. 
+ assert WorkspacePath(ws, "/home/bob").is_relative_to("/") + assert WorkspacePath(ws, "/home/bob").is_relative_to("/home") + assert WorkspacePath(ws, "/home/bob").is_relative_to("/./home") + assert WorkspacePath(ws, "foo/bar/baz").is_relative_to("foo") + assert WorkspacePath(ws, "foo/bar/baz").is_relative_to("foo/bar") + assert WorkspacePath(ws, "foo/bar/baz").is_relative_to("foo/./bar") + + # Some different situations where it isn't. + assert not WorkspacePath(ws, "/home/bob").is_relative_to("home") # Different anchor. + assert not WorkspacePath(ws, "/home/bob").is_relative_to("/usr") # Not a prefix. + + +def test_is_absolute() -> None: + """Test detection of absolute versus relative paths.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/foo/bar").is_absolute() + assert WorkspacePath(ws, "/").is_absolute() + assert not WorkspacePath(ws, "foo/bar").is_absolute() + assert not WorkspacePath(ws, ".").is_absolute() + + +def test_is_reserved() -> None: + """Test detection of reserved paths (which don't exist with Workspace paths).""" + ws = create_autospec(WorkspaceClient) + + assert not WorkspacePath(ws, "NUL").is_reserved() + + +def test_joinpath() -> None: + """Test that paths can be joined.""" + ws = create_autospec(WorkspaceClient) + + assert WorkspacePath(ws, "/home").joinpath("bob") == WorkspacePath(ws, "/home/bob") + assert WorkspacePath(ws, "/home").joinpath(WorkspacePath(ws, "bob")) == WorkspacePath(ws, "/home/bob") + assert WorkspacePath(ws, "/home").joinpath(PurePosixPath("bob")) == WorkspacePath(ws, "/home/bob") + assert WorkspacePath(ws, "/usr").joinpath("local", "bin") == WorkspacePath(ws, "/usr/local/bin") + assert WorkspacePath(ws, "home").joinpath("jane") == WorkspacePath(ws, "home/jane") + + +def test_match() -> None: + """Test that glob matching works.""" + ws = create_autospec(WorkspaceClient) + + # Relative patterns, match from the right. + assert WorkspacePath(ws, "foo/bar/file.txt").match("*.txt") + assert WorkspacePath(ws, "/foo/bar/file.txt").match("bar/*.txt") + assert not WorkspacePath(ws, "/foo/bar/file.txt").match("foo/*.txt") + + # Absolute patterns, match from the left (and only against absolute paths) + assert WorkspacePath(ws, "/file.txt").match("/*.txt") + assert not WorkspacePath(ws, "foo/bar/file.txt").match("/*.txt") + + # Case-sensitive by default, but can be overridden. + assert not WorkspacePath(ws, "file.txt").match("*.TXT") + # assert WorkspacePath(ws, "file.txt").match("*.TXT", case_sensitive=False) + + # No recursive globs. 
+    assert not WorkspacePath(ws, "/foo/bar/file.txt").match("/**/*.txt")
+
+
 def test_exists_when_path_exists() -> None:
     ws = create_autospec(WorkspaceClient)
     workspace_path = WorkspacePath(ws, "/test/path")
     ws.workspace.get_status.return_value = True
@@ -127,7 +499,7 @@ def test_suffix_when_file_has_extension() -> None:
 
 
 def test_suffix_when_file_is_notebook_and_language_matches() -> None:
     ws = create_autospec(WorkspaceClient)
     workspace_path = WorkspacePath(ws, "/test/path")
-    workspace_path._object_info = ObjectInfo(language=Language.PYTHON, object_type=ObjectType.NOTEBOOK)
+    workspace_path._cached_object_info = ObjectInfo(language=Language.PYTHON, object_type=ObjectType.NOTEBOOK)
     assert workspace_path.suffix == ".py"
@@ -206,13 +578,6 @@ def test_unlink_non_existing_file() -> None:
         workspace_path.unlink()
 
 
-def test_relative_to() -> None:
-    ws = create_autospec(WorkspaceClient)
-    workspace_path = WorkspacePath(ws, "/test/path/subpath")
-    result = workspace_path.relative_to("/test/path")
-    assert str(result) == "subpath"
-
-
 def test_as_fuse_in_databricks_runtime() -> None:
     with patch.dict("os.environ", {"DATABRICKS_RUNTIME_VERSION": "14.3"}):
         ws = create_autospec(WorkspaceClient)
@@ -300,6 +665,7 @@ def test_is_notebook_when_databricks_error_occurs() -> None:
     assert workspace_path.is_notebook() is False
 
 
+@pytest.mark.xfail(reason="Implementation pending.")
 def test_globbing_when_nested_json_files_exist() -> None:
     ws = create_autospec(WorkspaceClient)
     workspace_path = WorkspacePath(ws, "/test/path")
     ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.DIRECTORY)

From 8065bc575451bffc5cc4d16d704275af8077fb11 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 9 Jul 2024 18:00:52 +0200
Subject: [PATCH 04/24] Update project metadata to reflect Python 3.12 support.

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 5aeb537..288c08a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ classifiers = [
   "Programming Language :: Python",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
   "Programming Language :: Python :: Implementation :: CPython",
 ]
 dependencies = ["databricks-sdk>=0.16.0"]

From f9fc5ab6c6f5bd3660eb0b412349859fdd307a09 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 9 Jul 2024 21:25:56 +0200
Subject: [PATCH 05/24] Fix typo.

---
 src/databricks/labs/blueprint/paths.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index f8dd4ba..2467d96 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -232,7 +232,7 @@ class WorkspacePath(Path):
     # This implementation for Workspace paths does the following:
     # 1. Flavour is basically posix-style, with the caveat that we don't bother with the special //-prefix handling.
     # 2. The Accessor is delegated to existing routines available via the workspace client.
-    # 3. Python 3.12 introduces some new API elements. Because these are sourec-compatible with earlier versions
+    # 3. Python 3.12 introduces some new API elements. Because these are source-compatible with earlier versions
     #    these are forward-ported and implemented.
     #
     __slots__ = (

From 4ec3ff75c79e263fdd5aa4f67f40340ab1f67f32 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 9 Jul 2024 21:26:31 +0200
Subject: [PATCH 06/24] Fix broken super() call.
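
super(self) passes an instance where super() expects a type, so the call fails
at runtime. A minimal reproduction, outside of this module and purely for
illustration:

    class Base:
        def __bytes__(self) -> bytes:
            return b"base"

    class Broken(Base):
        def __bytes__(self) -> bytes:
            # Wrong: raises TypeError because super() needs a type, not an instance.
            return super(self).__bytes__()

    class Fixed(Base):
        def __bytes__(self) -> bytes:
            # Zero-argument form resolves the class and instance implicitly.
            return super().__bytes__()

    bytes(Fixed())   # b'base'
    bytes(Broken())  # TypeError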
--- src/databricks/labs/blueprint/paths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 2467d96..40ffdbb 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -365,7 +365,7 @@ def __str__(self): def __bytes__(self): # Super implementations are fine. - return super(self).__bytes__() + return super().__bytes__() def __repr__(self): return f"{self.__class__.__name__}({str(self)!r})" From 39c80db772b78d9b70a739c4db32e1cd9cf22b3b Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 9 Jul 2024 21:26:48 +0200 Subject: [PATCH 07/24] Formatting. --- src/databricks/labs/blueprint/paths.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 40ffdbb..f4d6485 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -370,9 +370,7 @@ def __bytes__(self): def __repr__(self): return f"{self.__class__.__name__}({str(self)!r})" - def as_uri( - self, - ) -> str: + def as_uri(self) -> str: return self._ws.config.host + "#workspace" + urlquote_from_bytes(bytes(self)) def __eq__(self, other): From c8fab614465e6994f81e9701bd59684651eab1f3 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 9 Jul 2024 21:29:32 +0200 Subject: [PATCH 08/24] Adjust code to satisfy the linter. --- src/databricks/labs/blueprint/paths.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index f4d6485..967531a 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -253,6 +253,7 @@ class WorkspacePath(Path): # The cached _object_info value for the instance. "_cached_object_info", ) + _cached_object_info: ObjectInfo _SUFFIXES = {Language.PYTHON: ".py", Language.SQL: ".sql", Language.SCALA: ".scala", Language.R: ".R"} @@ -442,10 +443,8 @@ def name(self): @property def parts(self): if self.drive or self.root: - parts = (self.drive + self.root, *self._path_parts) - else: - parts = self._path_parts - return parts + return self.drive + self.root, *self._path_parts + return self._path_parts @property def suffix(self): @@ -491,11 +490,10 @@ def with_suffix(self, suffix): if not stem: msg = f"{self!r} has an empty name" raise ValueError(msg) - elif suffix and not suffix.startswith("."): + if suffix and not suffix.startswith("."): msg = f"{self!r} invalid suffix: {suffix}" raise ValueError(msg) - else: - return self.with_name(stem + suffix) + return self.with_name(stem + suffix) @property def _stack(self): @@ -515,10 +513,9 @@ def relative_to(self, other, *more_other, walk_up=False): if not walk_up: msg = f"{str(self)!r} is not in the subpath of {str(other)!r}" raise ValueError(msg) - elif part == "..": + if part == "..": raise ValueError(f"'..' 
segment in {str(other)!r} cannot be walked") - else: - parts0.append("..") + parts0.append("..") return self.with_segments("", *reversed(parts0)) def is_relative_to(self, other, *more_other): @@ -665,8 +662,7 @@ def _object_info(self) -> ObjectInfo: self._cached_object_info = self._ws.workspace.get_status(self.as_posix()) return self._object_info - @staticmethod - def _return_false() -> bool: + def _return_false(self) -> bool: return False is_symlink = _return_false @@ -696,9 +692,9 @@ def is_file(self): return False def _scandir(self): - # Python 3.10: Accesses _accessor.scandir() directly. - # Python 3.11: Instead invokes this (which normally dispatches to os.scandir()) - return self._accessor.scandir(self) + # TODO: Not yet invoked; work-in-progress. + objects = self._ws.workspace.list(self.as_posix()) + return _ScandirIterator(objects) def expanduser(self): # Expand ~ (but NOT ~user) constructs. From 4c7ef4094b71cab20210ec2b51d489e82fa88ce1 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 9 Jul 2024 21:31:39 +0200 Subject: [PATCH 09/24] Suppress linting warnings that are unavoidable, or where avoidance is worse than the problem. --- src/databricks/labs/blueprint/paths.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 967531a..8fe9bc4 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -213,7 +213,7 @@ def __init__(self, ws: WorkspaceClient, path: str): StringIO.__init__(self) -class WorkspacePath(Path): +class WorkspacePath(Path): # pylint: disable=too-many-public-methods """Experimental implementation of pathlib.Path for Databricks Workspace.""" # Implementation notes: @@ -235,7 +235,7 @@ class WorkspacePath(Path): # 3. Python 3.12 introduces some new API elements. Because these are source-compatible with earlier versions # these are forward-ported and implemented. # - __slots__ = ( + __slots__ = ( # pylint: disable=redefined-slots-in-subclass # For us this is always the empty string. Consistent with the superclass attribute for Python 3.10-3.13b. "_drv", # The (normalized) root property for the path. Consistent with the superclass attribute for Python 3.10-3.13b. @@ -281,7 +281,9 @@ def __new__(cls, *args, **kwargs) -> WorkspacePath: # Force all initialisation to go via __init__() irrespective of the (Python-specific) base version. return object.__new__(cls) - def __init__(self, ws: WorkspaceClient, *args) -> None: + def __init__(self, ws: WorkspaceClient, *args) -> None: # pylint: disable=super-init-not-called,useless-suppression + # We deliberately do _not_ call the super initializer because we're taking over complete responsibility for the + # implementation of the public API. 
raw_paths: list[str] = [] for arg in args: if isinstance(arg, PurePath): @@ -499,10 +501,10 @@ def with_suffix(self, suffix): def _stack(self): return self.anchor, list(reversed(self._path_parts)) - def relative_to(self, other, *more_other, walk_up=False): + def relative_to(self, other, *more_other, walk_up=False): # pylint: disable=arguments-differ other = self.with_segments(other, *more_other) anchor0, parts0 = self._stack - anchor1, parts1 = other._stack + anchor1, parts1 = other._stack # pylint: disable=protected-access if anchor0 != anchor1: msg = f"{str(self)!r} and {str(other)!r} have different anchors" raise ValueError(msg) @@ -518,12 +520,12 @@ def relative_to(self, other, *more_other, walk_up=False): parts0.append("..") return self.with_segments("", *reversed(parts0)) - def is_relative_to(self, other, *more_other): + def is_relative_to(self, other, *more_other): # pylint: disable=arguments-differ other = self.with_segments(other, *more_other) if self.anchor != other.anchor: return False parts0 = list(reversed(self._path_parts)) - parts1 = list(reversed(other._path_parts)) + parts1 = list(reversed(other._path_parts)) # pylint: disable=protected-access while parts0 and parts1 and parts0[-1] == parts1[-1]: parts0.pop() parts1.pop() From df396606654c920a0232513e389a8c92c900a519 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 9 Jul 2024 21:33:58 +0200 Subject: [PATCH 10/24] Mark some code where a decision is pending. The code as-is is temporary. --- src/databricks/labs/blueprint/paths.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 8fe9bc4..5ee7f5b 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -368,6 +368,7 @@ def __str__(self): def __bytes__(self): # Super implementations are fine. + # TODO: Decide before PR merge whether to: a) remove; b) allow as marker that we checked it; c) inline to be independent. return super().__bytes__() def __repr__(self): @@ -483,8 +484,9 @@ def with_name(self, name): path_parts[-1] = name return type(self)(self._ws, self.anchor, *path_parts) - def with_stem(self, stem): + def with_stem(self, stem): # pylint: disable=useless-parent-delegation # Super implementations are all fine. + # TODO: Decide before PR merge whether to: a) remove; b) allow as marker that we checked it; c) inline to be independent. return super().with_stem(stem) def with_suffix(self, suffix): From 7a1e4500ed560004b365db0d3559866ac4c59494 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 14:09:25 +0200 Subject: [PATCH 11/24] Factor out init logic into a method. --- src/databricks/labs/blueprint/paths.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 5ee7f5b..7439316 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -284,6 +284,19 @@ def __new__(cls, *args, **kwargs) -> WorkspacePath: def __init__(self, ws: WorkspaceClient, *args) -> None: # pylint: disable=super-init-not-called,useless-suppression # We deliberately do _not_ call the super initializer because we're taking over complete responsibility for the # implementation of the public API. + # Convert the arguments into string-based path segments, irrespective of their type. + raw_paths = self._to_raw_paths(*args) + + # Normalise the paths that we have. 
+ root, path_parts = self._parse_and_normalize(raw_paths) + + self._drv = "" + self._root = root + self._path_parts = path_parts + self._ws = ws + + @staticmethod + def _to_raw_paths(*args) -> list[str]: raw_paths: list[str] = [] for arg in args: if isinstance(arg, PurePath): @@ -300,14 +313,7 @@ def __init__(self, ws: WorkspaceClient, *args) -> None: # pylint: disable=super ) raise TypeError(msg) raw_paths.append(path) - - # Normalise the paths that we have. - root, path_parts = self._parse_and_normalize(raw_paths) - - self._drv = "" - self._root = root - self._path_parts = path_parts - self._ws = ws + return raw_paths @classmethod def _parse_and_normalize(cls, parts: list[str]) -> tuple[str, tuple[str, ...]]: From b536d3f80f25b6d3d728501e54ea3a59d0a65afe Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 14:10:57 +0200 Subject: [PATCH 12/24] Early return. --- src/databricks/labs/blueprint/paths.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 7439316..625a516 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -334,12 +334,11 @@ def _parse_and_normalize(cls, parts: list[str]) -> tuple[str, tuple[str, ...]]: path = part case [*parts]: path = cls.parser.join(*parts) - if path: - root, rel = cls._splitroot(path, sep=cls.parser.sep) - # No need to split drv because we don't support it. - parsed = tuple(str(x) for x in rel.split(cls.parser.sep) if x and x != ".") - else: - root, parsed = "", () + if not path: + return "", () + root, rel = cls._splitroot(path, sep=cls.parser.sep) + # No need to split drv because we don't support it. + parsed = tuple(str(x) for x in rel.split(cls.parser.sep) if x and x != ".") return root, parsed @classmethod From af2f118bb93d34f75f968b1f603dbc3d85b11f58 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 14:12:12 +0200 Subject: [PATCH 13/24] Support for /-style building of paths. --- src/databricks/labs/blueprint/paths.py | 31 ++++++++++++++++++++++++++ tests/unit/test_paths.py | 25 +++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 625a516..bc3cc52 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -408,6 +408,18 @@ def _str_normcase(self): # Compatibility property (python 3.12+), accessed via equality comparison. This can't be avoided. return str(self) + @classmethod + def _from_parts(cls, *args) -> NoReturn: + # Compatibility method (python <= 3.11), accessed via reverse /-style building. This can't be avoided. + # See __rtruediv__ for more information. + raise TypeError("trigger NotImplemented") + + @property + def _raw_paths(self) -> NoReturn: + # Compatibility method (python 3.12+), accessed via reverse /-style building. This can't be avoided. + # See __rtruediv__ for more information. + raise TypeError("trigger NotImplemented") + def __lt__(self, other): if not isinstance(other, type(self)): return NotImplemented @@ -566,6 +578,25 @@ def is_reserved(self): def joinpath(self, *pathsegments): return self.with_segments(self, *pathsegments) + def __truediv__(self, other): + try: + return self.with_segments(*self._parts(), other) + except TypeError: + return NotImplemented + + def __rtruediv__(self, other): + # Note: this is only invoked if __truediv__ has already returned NotImplemented. 
+ # For the case of Path / WorkspacePath this means the underlying __truediv__ is invoked. + # The base-class implementations all access internals but yield NotImplemented if TypeError is raised. As + # such we stub those internals (_from_parts and _raw_path) to trigger the NotImplemented path and ensure that + # control ends up here. + try: + if isinstance(other, PurePath): + return type(other)(other, *self._parts()) + return self.with_segments(other, *self._parts()) + except TypeError: + return NotImplemented + @classmethod def _compile_pattern(cls, pattern: str, case_sensitive: bool) -> re.Pattern: flags = 0 if case_sensitive else re.IGNORECASE diff --git a/tests/unit/test_paths.py b/tests/unit/test_paths.py index 5b9eda2..dd032a6 100644 --- a/tests/unit/test_paths.py +++ b/tests/unit/test_paths.py @@ -364,6 +364,31 @@ def test_joinpath() -> None: assert WorkspacePath(ws, "home").joinpath("jane") == WorkspacePath(ws, "home/jane") +def test_join_dsl() -> None: + """Test that the /-based DSL can be used to build new paths.""" + ws = create_autospec(WorkspaceClient) + + # First the forward style options. + assert WorkspacePath(ws, "/home/bob") / "data" == WorkspacePath(ws, "/home/bob/data") + assert WorkspacePath(ws, "/home/bob") / "data" / "base" == WorkspacePath(ws, "/home/bob/data/base") + assert WorkspacePath(ws, "/home/bob") / "data/base" == WorkspacePath(ws, "/home/bob/data/base") + assert WorkspacePath(ws, "home") / "bob" == WorkspacePath(ws, "home/bob") + # New root + assert WorkspacePath(ws, "whatever") / "/home" == WorkspacePath(ws, "/home") + # Mix types: eventual type is less-associative + assert WorkspacePath(ws, "/home/bob") / PurePosixPath("data") == WorkspacePath(ws, "/home/bob/data") + + # Building from the other direction; same as above. + assert "/home/bob" / WorkspacePath(ws, "data") == WorkspacePath(ws, "/home/bob/data") + assert "/home/bob" / WorkspacePath(ws, "data") / "base" == WorkspacePath(ws, "/home/bob/data/base") + assert "/home/bob" / WorkspacePath(ws, "data/base") == WorkspacePath(ws, "/home/bob/data/base") + assert "home" / WorkspacePath(ws, "bob") == WorkspacePath(ws, "home/bob") + # New root + assert "whatever" / WorkspacePath(ws, "/home") == WorkspacePath(ws, "/home") + # Mix types: eventual type is less-associative + assert PurePosixPath("/home/bob") / WorkspacePath(ws, "data") == PurePosixPath("/home/bob/data") + + def test_match() -> None: """Test that glob matching works.""" ws = create_autospec(WorkspaceClient) From afad4d7393acd9d4e3d65a67867a7843bd900d62 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 14:18:14 +0200 Subject: [PATCH 14/24] Early return. --- src/databricks/labs/blueprint/paths.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index bc3cc52..4d7405c 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -346,10 +346,8 @@ def _splitroot(cls, part: str, sep: str) -> tuple[str, str]: # Based on the upstream implementation, with the '//'-specific bit elided because we don't need to # bother with Posix semantics. if part and part[0] == sep: - root, path = sep, part.lstrip(sep) - else: - root, path = "", part - return root, path + return sep, part.lstrip(sep) + return "", part def __reduce__(self) -> NoReturn: # Cannot support pickling because we can't pickle the workspace client. 
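
As context for the patches above: the /-style joining that [PATCH 13/24] introduces is exercised as sketched below. This is only an illustration that mirrors the test_join_dsl cases from that patch; the mocked WorkspaceClient follows the same pattern as the unit tests, so no workspace calls are made.

    from pathlib import PurePosixPath
    from unittest.mock import create_autospec

    from databricks.sdk import WorkspaceClient

    from databricks.labs.blueprint.paths import WorkspacePath

    ws = create_autospec(WorkspaceClient)  # stand-in client, as used throughout the unit tests

    # Forward joining dispatches to __truediv__ on WorkspacePath.
    assert WorkspacePath(ws, "/home/bob") / "data" == WorkspacePath(ws, "/home/bob/data")
    # Reverse joining from a plain string is handled by __rtruediv__.
    assert "/home/bob" / WorkspacePath(ws, "data") == WorkspacePath(ws, "/home/bob/data")
    # Mixing with PurePosixPath keeps the left-hand type: the stubbed internals make the
    # base-class __truediv__ yield NotImplemented so that __rtruediv__ gets control.
    assert PurePosixPath("/home/bob") / WorkspacePath(ws, "data") == PurePosixPath("/home/bob/data")
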
From f880a4e7f8b2ebccad0586e964abaa5c4b5ebd70 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 15:20:05 +0200 Subject: [PATCH 15/24] Replace relative_to/is_relative_to implementation to use our internals. --- src/databricks/labs/blueprint/paths.py | 40 +++++++++++--------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 4d7405c..3a4547c 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -514,42 +514,36 @@ def with_suffix(self, suffix): raise ValueError(msg) return self.with_name(stem + suffix) - @property - def _stack(self): - return self.anchor, list(reversed(self._path_parts)) - def relative_to(self, other, *more_other, walk_up=False): # pylint: disable=arguments-differ other = self.with_segments(other, *more_other) - anchor0, parts0 = self._stack - anchor1, parts1 = other._stack # pylint: disable=protected-access - if anchor0 != anchor1: + if self.anchor != other.anchor: msg = f"{str(self)!r} and {str(other)!r} have different anchors" raise ValueError(msg) - while parts0 and parts1 and parts0[-1] == parts1[-1]: - parts0.pop() - parts1.pop() - for part in parts1: + path_parts0 = self._path_parts + path_parts1 = other._path_parts # pylint: disable=protected-access + # Find the length of the common prefix. + i = 0 + while i < len(path_parts0) and i < len(path_parts1) and path_parts0[i] == path_parts1[i]: + i += 1 + relative_parts = path_parts0[i:] + # Handle walking up. + if i < len(path_parts1): if not walk_up: msg = f"{str(self)!r} is not in the subpath of {str(other)!r}" raise ValueError(msg) - if part == "..": + if ".." in path_parts1[i:]: raise ValueError(f"'..' segment in {str(other)!r} cannot be walked") - parts0.append("..") - return self.with_segments("", *reversed(parts0)) + walkup_parts = [".."] * (len(path_parts1) - i) + relative_parts = (*walkup_parts, *relative_parts) + return self.with_segments("", *relative_parts) def is_relative_to(self, other, *more_other): # pylint: disable=arguments-differ other = self.with_segments(other, *more_other) if self.anchor != other.anchor: return False - parts0 = list(reversed(self._path_parts)) - parts1 = list(reversed(other._path_parts)) # pylint: disable=protected-access - while parts0 and parts1 and parts0[-1] == parts1[-1]: - parts0.pop() - parts1.pop() - for part in parts1: - if part and part != ".": - return False - return True + path_parts0 = self._path_parts + path_parts1 = other._path_parts # pylint: disable=protected-access + return path_parts0[: len(path_parts1)] == path_parts1 @property def parent(self): From e66804edf971f9e914adcfb8f4cd3eb8ed43a2b0 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 15:43:40 +0200 Subject: [PATCH 16/24] Remove methods that only delegate to the superclass. --- src/databricks/labs/blueprint/paths.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 3a4547c..310e7af 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -369,11 +369,6 @@ def __str__(self): self._str = (self._root + self.parser.sep.join(self._path_parts)) or "." return self._str - def __bytes__(self): - # Super implementations are fine. - # TODO: Decide before PR merge whether to: a) remove; b) allow as marker that we checked it; c) inline to be independent. 
- return super().__bytes__() - def __repr__(self): return f"{self.__class__.__name__}({str(self)!r})" @@ -477,17 +472,6 @@ def suffix(self): pass return suffix - @property - def suffixes(self): - # Super implementations are all fine. - # TODO: Make this consistent with .suffix for notebooks. - return super().suffixes - - @property - def stem(self): - # Super implementations are all fine. - return super().stem - def with_name(self, name): parser = self.parser if not name or parser.sep in name or name == ".": @@ -499,11 +483,6 @@ def with_name(self, name): path_parts[-1] = name return type(self)(self._ws, self.anchor, *path_parts) - def with_stem(self, stem): # pylint: disable=useless-parent-delegation - # Super implementations are all fine. - # TODO: Decide before PR merge whether to: a) remove; b) allow as marker that we checked it; c) inline to be independent. - return super().with_stem(stem) - def with_suffix(self, suffix): stem = self.stem if not stem: From c89b4facda690169acfb36c93a8527adbea8f372 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 17:02:28 +0200 Subject: [PATCH 17/24] Drop type annotations to reduce changes in this PR. --- tests/unit/test_paths.py | 80 ++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/tests/unit/test_paths.py b/tests/unit/test_paths.py index dd032a6..59f4bc2 100644 --- a/tests/unit/test_paths.py +++ b/tests/unit/test_paths.py @@ -410,132 +410,132 @@ def test_match() -> None: assert not WorkspacePath(ws, "/foo/bar/file.txt").match("/**/*.txt") -def test_exists_when_path_exists() -> None: +def test_exists_when_path_exists(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = True assert workspace_path.exists() -def test_exists_when_path_does_not_exist() -> None: +def test_exists_when_path_does_not_exist(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") assert not workspace_path.exists() -def test_mkdir_creates_directory() -> None: +def test_mkdir_creates_directory(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.mkdir() ws.workspace.mkdirs.assert_called_once_with("/test/path") -def test_rmdir_removes_directory() -> None: +def test_rmdir_removes_directory(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.rmdir() ws.workspace.delete.assert_called_once_with("/test/path", recursive=False) -def test_is_dir_when_path_is_directory() -> None: +def test_is_dir_when_path_is_directory(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.DIRECTORY) assert workspace_path.is_dir() -def test_is_dir_when_path_is_not_directory() -> None: +def test_is_dir_when_path_is_not_directory(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.FILE) assert not workspace_path.is_dir() -def test_is_file_when_path_is_file() -> None: +def test_is_file_when_path_is_file(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.FILE) assert workspace_path.is_file() -def test_is_file_when_path_is_not_file() -> 
None: +def test_is_file_when_path_is_not_file(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.DIRECTORY) assert not workspace_path.is_file() -def test_is_notebook_when_path_is_notebook() -> None: +def test_is_notebook_when_path_is_notebook(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.NOTEBOOK) assert workspace_path.is_notebook() -def test_is_notebook_when_path_is_not_notebook() -> None: +def test_is_notebook_when_path_is_not_notebook(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.FILE) assert not workspace_path.is_notebook() -def test_open_file_in_read_binary_mode() -> None: +def test_open_file_in_read_binary_mode(): ws = create_autospec(WorkspaceClient) ws.workspace.download.return_value.__enter__.return_value.read.return_value = b"test" workspace_path = WorkspacePath(ws, "/test/path") assert workspace_path.read_bytes() == b"test" -def test_open_file_in_write_binary_mode() -> None: +def test_open_file_in_write_binary_mode(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.write_bytes(b"test") ws.workspace.upload.assert_called_with("/test/path", b"test", format=ImportFormat.AUTO) -def test_open_file_in_read_text_mode() -> None: +def test_open_file_in_read_text_mode(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.download.return_value.__enter__.return_value.read.return_value = b"test" assert workspace_path.read_text() == "test" -def test_open_file_in_write_text_mode() -> None: +def test_open_file_in_write_text_mode(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.write_text("test") ws.workspace.upload.assert_called_with("/test/path", "test", format=ImportFormat.AUTO) -def test_open_file_in_invalid_mode() -> None: +def test_open_file_in_invalid_mode(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") with pytest.raises(ValueError): workspace_path.open(mode="invalid") -def test_suffix_when_file_has_extension() -> None: +def test_suffix_when_file_has_extension(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path.py") assert workspace_path.suffix == ".py" -def test_suffix_when_file_is_notebook_and_language_matches() -> None: +def test_suffix_when_file_is_notebook_and_language_matches(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._cached_object_info = ObjectInfo(language=Language.PYTHON, object_type=ObjectType.NOTEBOOK) assert workspace_path.suffix == ".py" -def test_suffix_when_file_is_notebook_and_language_does_not_match() -> None: +def test_suffix_when_file_is_notebook_and_language_does_not_match(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.language = None assert workspace_path.suffix == "" -def test_suffix_when_file_is_not_notebook() -> None: +def test_suffix_when_file_is_not_notebook(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") with patch("databricks.labs.blueprint.paths.WorkspacePath.is_notebook") as mock_is_notebook: @@ -543,35 +543,35 @@ 
def test_suffix_when_file_is_not_notebook() -> None: assert workspace_path.suffix == "" -def test_mkdir_creates_directory_with_valid_mode() -> None: +def test_mkdir_creates_directory_with_valid_mode(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.mkdir(mode=0o600) ws.workspace.mkdirs.assert_called_once_with("/test/path") -def test_mkdir_raises_error_with_invalid_mode() -> None: +def test_mkdir_raises_error_with_invalid_mode(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") with pytest.raises(ValueError): workspace_path.mkdir(mode=0o700) -def test_rmdir_removes_directory_non_recursive() -> None: +def test_rmdir_removes_directory_non_recursive(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.rmdir() ws.workspace.delete.assert_called_once_with("/test/path", recursive=False) -def test_rmdir_removes_directory_recursive() -> None: +def test_rmdir_removes_directory_recursive(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path.rmdir(recursive=True) ws.workspace.delete.assert_called_once_with("/test/path", recursive=True) -def test_rename_file_without_overwrite() -> None: +def test_rename_file_without_overwrite(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.download.return_value.__enter__.return_value.read.return_value = b"test" @@ -579,7 +579,7 @@ def test_rename_file_without_overwrite() -> None: ws.workspace.upload.assert_called_once_with("/new/path", b"test", format=ImportFormat.AUTO, overwrite=False) -def test_rename_file_with_overwrite() -> None: +def test_rename_file_with_overwrite(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.download.return_value.__enter__.return_value.read.return_value = b"test" @@ -587,7 +587,7 @@ def test_rename_file_with_overwrite() -> None: ws.workspace.upload.assert_called_once_with("/new/path", b"test", format=ImportFormat.AUTO, overwrite=True) -def test_unlink_existing_file() -> None: +def test_unlink_existing_file(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = True @@ -595,7 +595,7 @@ def test_unlink_existing_file() -> None: ws.workspace.delete.assert_called_once_with("/test/path") -def test_unlink_non_existing_file() -> None: +def test_unlink_non_existing_file(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") @@ -603,7 +603,7 @@ def test_unlink_non_existing_file() -> None: workspace_path.unlink() -def test_as_fuse_in_databricks_runtime() -> None: +def test_as_fuse_in_databricks_runtime(): with patch.dict("os.environ", {"DATABRICKS_RUNTIME_VERSION": "14.3"}): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") @@ -611,7 +611,7 @@ def test_as_fuse_in_databricks_runtime() -> None: assert str(result) == "/Workspace/test/path" -def test_as_fuse_outside_databricks_runtime() -> None: +def test_as_fuse_outside_databricks_runtime(): with patch.dict("os.environ", {}, clear=True): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") @@ -619,7 +619,7 @@ def test_as_fuse_outside_databricks_runtime() -> None: assert str(result) == "/Workspace/test/path" -def test_home_directory() -> None: +def 
test_home_directory(): ws = create_autospec(WorkspaceClient) ws.current_user.me.return_value.user_name = "test_user" workspace_path = WorkspacePath(ws, "/test/path") @@ -627,63 +627,63 @@ def test_home_directory() -> None: assert str(result) == "/Users/test_user" -def test_is_dir_when_object_type_is_directory() -> None: +def test_is_dir_when_object_type_is_directory(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.DIRECTORY assert workspace_path.is_dir() is True -def test_is_dir_when_object_type_is_not_directory() -> None: +def test_is_dir_when_object_type_is_not_directory(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.FILE assert workspace_path.is_dir() is False -def test_is_dir_when_databricks_error_occurs() -> None: +def test_is_dir_when_databricks_error_occurs(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") assert workspace_path.is_dir() is False -def test_is_file_when_object_type_is_file() -> None: +def test_is_file_when_object_type_is_file(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.FILE assert workspace_path.is_file() is True -def test_is_file_when_object_type_is_not_file() -> None: +def test_is_file_when_object_type_is_not_file(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.DIRECTORY assert workspace_path.is_file() is False -def test_is_file_when_databricks_error_occurs() -> None: +def test_is_file_when_databricks_error_occurs(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") assert workspace_path.is_file() is False -def test_is_notebook_when_object_type_is_notebook() -> None: +def test_is_notebook_when_object_type_is_notebook(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.NOTEBOOK assert workspace_path.is_notebook() is True -def test_is_notebook_when_object_type_is_not_notebook() -> None: +def test_is_notebook_when_object_type_is_not_notebook(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") workspace_path._object_info.object_type = ObjectType.FILE assert workspace_path.is_notebook() is False -def test_is_notebook_when_databricks_error_occurs() -> None: +def test_is_notebook_when_databricks_error_occurs(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.side_effect = NotFound("Simulated NotFound") @@ -691,7 +691,7 @@ def test_is_notebook_when_databricks_error_occurs() -> None: @pytest.mark.xfail(reason="Implementation pending.") -def test_globbing_when_nested_json_files_exist() -> None: +def test_globbing_when_nested_json_files_exist(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.DIRECTORY) From b94ad64ef5cdd6cd445dcf39b6d1aa4bfb0569d6 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 17:03:30 +0200 Subject: [PATCH 18/24] Whitespace. 
--- src/databricks/labs/blueprint/paths.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 310e7af..1303cf4 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -284,6 +284,7 @@ def __new__(cls, *args, **kwargs) -> WorkspacePath: def __init__(self, ws: WorkspaceClient, *args) -> None: # pylint: disable=super-init-not-called,useless-suppression # We deliberately do _not_ call the super initializer because we're taking over complete responsibility for the # implementation of the public API. + # Convert the arguments into string-based path segments, irrespective of their type. raw_paths = self._to_raw_paths(*args) From fde8a7ffa4871267cc4de1ac5a5b1a43fca62b2b Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 17:17:04 +0200 Subject: [PATCH 19/24] Revert to original .suffix implementation. --- src/databricks/labs/blueprint/paths.py | 31 ++++++++++++++------------ 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 1303cf4..78ba3cb 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -255,7 +255,7 @@ class WorkspacePath(Path): # pylint: disable=too-many-public-methods ) _cached_object_info: ObjectInfo - _SUFFIXES = {Language.PYTHON: ".py", Language.SQL: ".sql", Language.SCALA: ".scala", Language.R: ".R"} + _SUFFIXES = {".py": Language.PYTHON, ".sql": Language.SQL, ".scala": Language.SCALA, ".R": Language.R} # Path semantics are posix-like. parser = posixpath @@ -460,19 +460,6 @@ def parts(self): return self.drive + self.root, *self._path_parts return self._path_parts - @property - def suffix(self): - # Super implementations are mostly fine... - suffix = super().suffix - # ...but if there is no suffix and this path is for a notebook then infer the extension based on the notebook - # language. - if not suffix and self.is_notebook(): - try: - suffix = self._SUFFIXES.get(self._object_info.language, "") - except DatabricksError: - pass - return suffix - def with_name(self, name): parser = self.parser if not name or parser.sep in name or name == ".": @@ -664,6 +651,22 @@ def open(self, mode="r", buffering=-1, encoding=None, errors=None, newline=None) return _TextUploadIO(self._ws, self.as_posix()) raise ValueError(f"invalid mode: {mode}") + @property + def suffix(self): + """Return the file extension. If the file is a notebook, return the suffix based on the language.""" + suffix = super().suffix + if suffix: + return suffix + if not self.is_notebook(): + return "" + for sfx, lang in self._SUFFIXES.items(): + try: + if self._object_info.language == lang: + return sfx + except DatabricksError: + return "" + return "" + @property def _object_info(self) -> ObjectInfo: # this method is cached because it is used in multiple is_* methods. From 1f5f138158fe75242eff6f4624623d9f02e1102f Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 17:27:07 +0200 Subject: [PATCH 20/24] Move test to site of prior test with the same name. This will mean it's clearer in the PR what has changed. 
--- tests/unit/test_paths.py | 48 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/unit/test_paths.py b/tests/unit/test_paths.py index 59f4bc2..2e44ff6 100644 --- a/tests/unit/test_paths.py +++ b/tests/unit/test_paths.py @@ -260,30 +260,6 @@ def test_with_suffix() -> None: _ = WorkspacePath(ws, "/").with_suffix(".txt") -def test_relative_to() -> None: - """Test that it is possible to get the relative path between two paths.""" - ws = create_autospec(WorkspaceClient) - - # Basics. - assert WorkspacePath(ws, "/home/bob").relative_to("/") == WorkspacePath(ws, "home/bob") - assert WorkspacePath(ws, "/home/bob").relative_to("/home") == WorkspacePath(ws, "bob") - assert WorkspacePath(ws, "/home/bob").relative_to("/./home") == WorkspacePath(ws, "bob") - assert WorkspacePath(ws, "foo/bar/baz").relative_to("foo") == WorkspacePath(ws, "bar/baz") - assert WorkspacePath(ws, "foo/bar/baz").relative_to("foo/bar") == WorkspacePath(ws, "baz") - assert WorkspacePath(ws, "foo/bar/baz").relative_to("foo/./bar") == WorkspacePath(ws, "baz") - - # Walk-up (3.12+) behaviour. - assert WorkspacePath(ws, "/home/bob").relative_to("/usr", walk_up=True) == WorkspacePath(ws, "../home/bob") - - # Check some errors. - with pytest.raises(ValueError, match="different anchors"): - _ = WorkspacePath(ws, "/home/bob").relative_to("home") - with pytest.raises(ValueError, match="not in the subpath"): - _ = WorkspacePath(ws, "/home/bob").relative_to("/usr") - with pytest.raises(ValueError, match="cannot be walked"): - _ = WorkspacePath(ws, "/home/bob").relative_to("/home/../usr", walk_up=True) - - @pytest.mark.parametrize( ("path", "parent"), [ @@ -603,6 +579,30 @@ def test_unlink_non_existing_file(): workspace_path.unlink() +def test_relative_to() -> None: + """Test that it is possible to get the relative path between two paths.""" + ws = create_autospec(WorkspaceClient) + + # Basics. + assert WorkspacePath(ws, "/home/bob").relative_to("/") == WorkspacePath(ws, "home/bob") + assert WorkspacePath(ws, "/home/bob").relative_to("/home") == WorkspacePath(ws, "bob") + assert WorkspacePath(ws, "/home/bob").relative_to("/./home") == WorkspacePath(ws, "bob") + assert WorkspacePath(ws, "foo/bar/baz").relative_to("foo") == WorkspacePath(ws, "bar/baz") + assert WorkspacePath(ws, "foo/bar/baz").relative_to("foo/bar") == WorkspacePath(ws, "baz") + assert WorkspacePath(ws, "foo/bar/baz").relative_to("foo/./bar") == WorkspacePath(ws, "baz") + + # Walk-up (3.12+) behaviour. + assert WorkspacePath(ws, "/home/bob").relative_to("/usr", walk_up=True) == WorkspacePath(ws, "../home/bob") + + # Check some errors. + with pytest.raises(ValueError, match="different anchors"): + _ = WorkspacePath(ws, "/home/bob").relative_to("home") + with pytest.raises(ValueError, match="not in the subpath"): + _ = WorkspacePath(ws, "/home/bob").relative_to("/usr") + with pytest.raises(ValueError, match="cannot be walked"): + _ = WorkspacePath(ws, "/home/bob").relative_to("/home/../usr", walk_up=True) + + def test_as_fuse_in_databricks_runtime(): with patch.dict("os.environ", {"DATABRICKS_RUNTIME_VERSION": "14.3"}): ws = create_autospec(WorkspaceClient) From 5cf0b7115226f30809b6f4ec618ee2e9be82e51d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 10 Jul 2024 17:41:12 +0200 Subject: [PATCH 21/24] Avoid monkey-patching the class under test. Instead modify behaviour via the constructor-injected mock. 
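
For clarity, the pattern being moved to is sketched here; it mirrors the updated test_suffix_when_file_is_not_notebook in the diff below, so treat it as an illustration rather than an addition:

    from unittest.mock import create_autospec

    from databricks.sdk import WorkspaceClient
    from databricks.sdk.service.workspace import Language, ObjectInfo, ObjectType

    from databricks.labs.blueprint.paths import WorkspacePath

    # Behaviour is induced through the constructor-injected mock rather than by
    # monkey-patching WorkspacePath.is_notebook on the class under test.
    ws = create_autospec(WorkspaceClient)
    ws.workspace.get_status.return_value = ObjectInfo(language=Language.PYTHON, object_type=ObjectType.FILE)

    workspace_path = WorkspacePath(ws, "/test/path")
    assert workspace_path.suffix == ""  # a FILE without an extension reports no suffix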
---
 tests/unit/test_paths.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/test_paths.py b/tests/unit/test_paths.py
index 2e44ff6..4efccd5 100644
--- a/tests/unit/test_paths.py
+++ b/tests/unit/test_paths.py
@@ -513,10 +513,10 @@ def test_suffix_when_file_is_notebook_and_language_does_not_match():
 
 def test_suffix_when_file_is_not_notebook():
     ws = create_autospec(WorkspaceClient)
+    ws.workspace.get_status.return_value = ObjectInfo(language=Language.PYTHON, object_type=ObjectType.FILE)
+
     workspace_path = WorkspacePath(ws, "/test/path")
-    with patch("databricks.labs.blueprint.paths.WorkspacePath.is_notebook") as mock_is_notebook:
-        mock_is_notebook.return_value = False
-        assert workspace_path.suffix == ""
+    assert workspace_path.suffix == ""
 
 
 def test_mkdir_creates_directory_with_valid_mode():

From 3c970d94c2fe637fd9e06c2696fb6bc844f4635e Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 10 Jul 2024 17:42:04 +0200
Subject: [PATCH 22/24] Adjust test fixture to induce behaviour via constructor-injected mock.

Previously, to produce the desired results, these tests mutated the internal state of the object under test.
---
 tests/unit/test_paths.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test_paths.py b/tests/unit/test_paths.py
index 4efccd5..76747f7 100644
--- a/tests/unit/test_paths.py
+++ b/tests/unit/test_paths.py
@@ -499,15 +499,17 @@ def test_suffix_when_file_has_extension():
 
 def test_suffix_when_file_is_notebook_and_language_matches():
     ws = create_autospec(WorkspaceClient)
+    ws.workspace.get_status.return_value = ObjectInfo(language=Language.PYTHON, object_type=ObjectType.NOTEBOOK)
+
     workspace_path = WorkspacePath(ws, "/test/path")
-    workspace_path._cached_object_info = ObjectInfo(language=Language.PYTHON, object_type=ObjectType.NOTEBOOK)
     assert workspace_path.suffix == ".py"
 
 
 def test_suffix_when_file_is_notebook_and_language_does_not_match():
     ws = create_autospec(WorkspaceClient)
+    ws.workspace.get_status.return_value = ObjectInfo(language=None, object_type=ObjectType.NOTEBOOK)
+
     workspace_path = WorkspacePath(ws, "/test/path")
-    workspace_path._object_info.language = None
     assert workspace_path.suffix == ""
 
 

From 1f534461047d1fb1cef9c5633565411c859cd605 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 10 Jul 2024 18:10:17 +0200
Subject: [PATCH 23/24] Document why __fspath__() isn't supported.

---
 src/databricks/labs/blueprint/paths.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index 78ba3cb..d6b6fe5 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -357,6 +357,15 @@ def __reduce__(self) -> NoReturn:
 
     def __fspath__(self):
         # Cannot support this: Workspace objects aren't accessible via the filesystem.
+        #
+        # This method is part of the os.PathLike protocol. Functions which accept a PathLike argument use os.fspath()
+        # to convert (via this method) the object into a file system path that can be used with the low-level os.*
+        # methods.
+        #
+        # Relevant online documentation:
+        # - PEP 519 (https://peps.python.org/pep-0519/)
+        # - os.fspath (https://docs.python.org/3/library/os.html#os.fspath)
+        # TODO: Allow this to work when within an appropriate Databricks Runtime that mounts Workspace paths via FUSE.
msg = f"Workspace paths are not path-like: {self}" raise NotImplementedError(msg) From 4c177146309bd5296fc8d4ee28cec5c26a62e2f2 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 12 Jul 2024 19:53:58 +0200 Subject: [PATCH 24/24] Update `WorkspaceClient` to support globbing (#125) This PR is stacked on top of #122 and updates `WorkspaceClient` to support: - `.glob()` - `.rglob()` - `.iterdir()` --- src/databricks/labs/blueprint/paths.py | 350 +++++++++++++------------ tests/unit/test_paths.py | 246 ++++++++++++++++- 2 files changed, 423 insertions(+), 173 deletions(-) diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index d6b6fe5..875cc93 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -7,10 +7,11 @@ import os import posixpath import re -import sys +from abc import abstractmethod +from collections.abc import Iterable, Sequence from io import BytesIO, StringIO from pathlib import Path, PurePath -from typing import NoReturn +from typing import NoReturn, TypeVar from urllib.parse import quote_from_bytes as urlquote_from_bytes from databricks.sdk import WorkspaceClient @@ -26,90 +27,6 @@ logger = logging.getLogger(__name__) -class _DatabricksFlavour: - # adapted from pathlib._Flavour, where we ignore support for drives, as we - # don't have that concept in Databricks. We also ignore support for Windows - # paths, as we only support POSIX paths in Databricks. - - sep = "/" - altsep = "" - has_drv = False - pathmod = posixpath - is_supported = True - - def __init__(self, ws: WorkspaceClient): - self.join = self.sep.join - self._ws = ws - - def parse_parts(self, parts: list[str]) -> tuple[str, str, list[str]]: - # adapted from pathlib._Flavour.parse_parts, - # where we ignore support for drives, as we - # don't have that concept in Databricks - parsed = [] - drv = root = "" - for part in reversed(parts): - if not part: - continue - drv, root, rel = self.splitroot(part) - if self.sep not in rel: - if rel and rel != ".": - parsed.append(sys.intern(rel)) - continue - for part_ in reversed(rel.split(self.sep)): - if part_ and part_ != ".": - parsed.append(sys.intern(part_)) - if drv or root: - parsed.append(drv + root) - parsed.reverse() - return drv, root, parsed - - @staticmethod - def join_parsed_parts( - drv: str, - root: str, - parts: list[str], - _, - root2: str, - parts2: list[str], - ) -> tuple[str, str, list[str]]: - # adapted from pathlib.PurePosixPath, where we ignore support for drives, - # as we don't have that concept in Databricks - if root2: - return drv, root2, [drv + root2] + parts2[1:] - return drv, root, parts + parts2 - - @staticmethod - def splitroot(part, sep=sep) -> tuple[str, str, str]: - if part and part[0] == sep: - stripped_part = part.lstrip(sep) - if len(part) - len(stripped_part) == 2: - return "", sep * 2, stripped_part - return "", sep, stripped_part - return "", "", part - - @staticmethod - def casefold(value: str) -> str: - return value - - @staticmethod - def casefold_parts(parts: list[str]) -> list[str]: - return parts - - @staticmethod - def compile_pattern(pattern: str): - return re.compile(fnmatch.translate(pattern)).fullmatch - - @staticmethod - def is_reserved(_) -> bool: - return False - - def make_uri(self, path) -> str: - return self._ws.config.host + "#workspace" + urlquote_from_bytes(bytes(path)) - - def __repr__(self): - return f"<{self.__class__.__name__} for {self._ws}>" - - def _na(fn: str): def _inner(*_, **__): __tracebackhide__ = True # pylint: 
disable=unused-variable @@ -118,75 +35,6 @@ def _inner(*_, **__): return _inner -class _ScandirItem: - def __init__(self, object_info): - self._object_info = object_info - - def __fspath__(self): - return self._object_info.path - - def is_dir(self, follow_symlinks=False): # pylint: disable=unused-argument - # follow_symlinks is for compatibility with Python 3.11 - return self._object_info.object_type == ObjectType.DIRECTORY - - def is_file(self, follow_symlinks=False): # pylint: disable=unused-argument - # follow_symlinks is for compatibility with Python 3.11 - # TODO: check if we want to show notebooks as files - return self._object_info.object_type == ObjectType.FILE - - def is_symlink(self): - return False - - @property - def name(self): - return os.path.basename(self._object_info.path) - - -class _ScandirIterator: - def __init__(self, objects): - self._it = objects - - def __iter__(self): - for object_info in self._it: - yield _ScandirItem(object_info) - - def __enter__(self): - return self - - def __exit__(self, *args): - pass - - -class _DatabricksAccessor: - chmod = _na("accessor.chmod") - getcwd = _na("accessor.getcwd") - group = _na("accessor.group") - link = _na("accessor.link") - mkdir = _na("accessor.mkdir") - owner = _na("accessor.owner") - readlink = _na("accessor.readlink") - realpath = _na("accessor.realpath") - rename = _na("accessor.rename") - replace = _na("accessor.replace") - rmdir = _na("accessor.rmdir") - stat = _na("accessor.stat") - symlink = _na("accessor.symlink") - unlink = _na("accessor.unlink") - - def __init__(self, ws: WorkspaceClient): - self._ws = ws - - def __repr__(self): - return f"<{self.__class__.__name__} for {self._ws}>" - - def scandir(self, path): - objects = self._ws.workspace.list(path) - return _ScandirIterator(objects) - - def listdir(self, path): - return [item.name for item in self.scandir(path)] - - class _UploadIO(abc.ABC): def __init__(self, ws: WorkspaceClient, path: str): self._ws = ws @@ -296,6 +144,16 @@ def __init__(self, ws: WorkspaceClient, *args) -> None: # pylint: disable=super self._path_parts = path_parts self._ws = ws + @classmethod + def _from_object_info(cls, ws: WorkspaceClient, object_info: ObjectInfo): + """Special (internal-only) constructor that creates an instance based on ObjectInfo.""" + if not object_info.path: + msg = f"Cannot initialise within object path: {object_info}" + raise ValueError(msg) + path = cls(ws, object_info.path) + path._cached_object_info = object_info + return path + @staticmethod def _to_raw_paths(*args) -> list[str]: raw_paths: list[str] = [] @@ -565,12 +423,6 @@ def __rtruediv__(self, other): except TypeError: return NotImplemented - @classmethod - def _compile_pattern(cls, pattern: str, case_sensitive: bool) -> re.Pattern: - flags = 0 if case_sensitive else re.IGNORECASE - regex = fnmatch.translate(pattern) - return re.compile(regex, flags=flags) - def match(self, path_pattern, *, case_sensitive=None): # Convert the pattern to a fake path (with globs) to help with matching parts. if not isinstance(path_pattern, PurePath): @@ -578,18 +430,17 @@ def match(self, path_pattern, *, case_sensitive=None): # Default to false if not specified. if case_sensitive is None: case_sensitive = True - # Reverse the parts. - path_parts = self.parts + pattern_parts = path_pattern.parts - # Error if not pattern_parts: raise ValueError("empty pattern") - # Impossible matches. + # Short-circuit on situations where a match is logically impossible. 
+ path_parts = self.parts if len(path_parts) < len(pattern_parts) or len(path_parts) > len(pattern_parts) and path_pattern.anchor: return False - # Check each part. + # Check each part, starting from the end. for path_part, pattern_part in zip(reversed(path_parts), reversed(pattern_parts)): - pattern = self._compile_pattern(pattern_part, case_sensitive=case_sensitive) + pattern = _PatternSelector.compile_pattern(pattern_part, case_sensitive=case_sensitive) if not pattern.match(path_part): return False return True @@ -715,11 +566,6 @@ def is_file(self): except DatabricksError: return False - def _scandir(self): - # TODO: Not yet invoked; work-in-progress. - objects = self._ws.workspace.list(self.as_posix()) - return _ScandirIterator(objects) - def expanduser(self): # Expand ~ (but NOT ~user) constructs. if not (self._drv or self._root) and self._path_parts and self._path_parts[0][:1] == "~": @@ -741,3 +587,163 @@ def is_notebook(self): return self._object_info.object_type == ObjectType.NOTEBOOK except DatabricksError: return False + + def iterdir(self): + for child in self._ws.workspace.list(self.as_posix()): + yield self._from_object_info(self._ws, child) + + def _prepare_pattern(self, pattern) -> Sequence[str]: + if not pattern: + raise ValueError("Glob pattern must not be empty.") + parsed_pattern = self.with_segments(pattern) + if parsed_pattern.anchor: + msg = f"Non-relative patterns are unsupported: {pattern}" + raise NotImplementedError(msg) + pattern_parts = parsed_pattern._path_parts # pylint: disable=protected-access + if ".." in pattern_parts: + msg = f"Parent traversal is not supported: {pattern}" + raise ValueError(msg) + if pattern[-1] == self.parser.sep: + pattern_parts = (*pattern_parts, "") + return pattern_parts + + def glob(self, pattern, *, case_sensitive=None): + pattern_parts = self._prepare_pattern(pattern) + selector = _Selector.parse(pattern_parts, case_sensitive=case_sensitive if case_sensitive is not None else True) + yield from selector(self) + + def rglob(self, pattern, *, case_sensitive=None): + pattern_parts = ("**", *self._prepare_pattern(pattern)) + selector = _Selector.parse(pattern_parts, case_sensitive=case_sensitive if case_sensitive is not None else True) + yield from selector(self) + + +T = TypeVar("T", bound="Path") + + +class _Selector(abc.ABC): + @classmethod + def parse(cls, pattern_parts: Sequence[str], *, case_sensitive: bool) -> _Selector: + # The pattern language is: + # - '**' matches any number (including zero) of file or directory segments. Must be the entire segment. + # - '*' match any number of characters within a single segment. + # - '?' match a single character within a segment. + # - '[seq]' match a single character against the class (within a segment). + # - '[!seq]' negative match for a single character against the class (within a segment). + # - A trailing '/' (which presents here as a trailing empty segment) matches only directories. + # There is no explicit escaping mechanism; literal matches against special characters above are possible as + # character classes, for example: [*] + # + # Some sharp edges: + # - Multiple '**' segments are allowed. + # - Normally the '..' segment is allowed. (This can be used to match against siblings, for + # example: /home/bob/../jane/) However WorspacePath (and DBFS) do not support '..' traversal in paths. + # - Normally '.' is allowed, but eliminated before we reach this method. 
+ match pattern_parts: + case ["**", *tail]: + return _RecursivePatternSelector(tail, case_sensitive=case_sensitive) + case [head, *tail] if case_sensitive and not _PatternSelector.needs_pattern(head): + return _LiteralSelector(head, tail, case_sensitive=case_sensitive) + case [head, *tail]: + if "**" in head: + raise ValueError("Invalid pattern: '**' can only be a complete path component") + return _PatternSelector(head, tail, case_sensitive=case_sensitive) + case []: + return _TerminalSelector() + raise ValueError(f"Glob pattern unsupported: {pattern_parts}") + + @abstractmethod + def __call__(self, path: T) -> Iterable[T]: + raise NotImplementedError() + + +class _TerminalSelector(_Selector): + def __call__(self, path: T) -> Iterable[T]: + yield path + + +class _NonTerminalSelector(_Selector): + __slots__ = ( + "_dir_only", + "_child_selector", + ) + + def __init__(self, child_pattern_parts: Sequence[str], *, case_sensitive: bool) -> None: + super().__init__() + if child_pattern_parts: + self._child_selector = self.parse(child_pattern_parts, case_sensitive=case_sensitive) + self._dir_only = True + else: + self._child_selector = _TerminalSelector() + self._dir_only = False + + def __call__(self, path: T) -> Iterable[T]: + if path.is_dir(): + yield from self._select_children(path) + + @abstractmethod + def _select_children(self, path: T) -> Iterable[T]: + raise NotImplementedError() + + +class _LiteralSelector(_NonTerminalSelector): + __slots__ = ("_literal_path",) + + def __init__(self, path: str, child_pattern_parts: Sequence[str], case_sensitive: bool) -> None: + super().__init__(child_pattern_parts, case_sensitive=case_sensitive) + self._literal_path = path + + def _select_children(self, path: T) -> Iterable[T]: + candidate = path / self._literal_path + if self._dir_only and candidate.is_dir() or candidate.exists(): + yield from self._child_selector(candidate) + + +class _PatternSelector(_NonTerminalSelector): + __slots__ = ("_pattern",) + + # The special set of characters that indicate a glob pattern isn't a trivial literal. + # Ref: https://docs.python.org/3/library/fnmatch.html#module-fnmatch + _glob_specials = re.compile("[*?\\[\\]]") + + @classmethod + def needs_pattern(cls, pattern: str) -> bool: + return cls._glob_specials.search(pattern) is not None + + @classmethod + def compile_pattern(cls, pattern: str, case_sensitive: bool) -> re.Pattern: + flags = 0 if case_sensitive else re.IGNORECASE + regex = fnmatch.translate(pattern) + return re.compile(regex, flags=flags) + + def __init__(self, pattern: str, child_pattern_parts: Sequence[str], case_sensitive: bool) -> None: + super().__init__(child_pattern_parts, case_sensitive=case_sensitive) + self._pattern = self.compile_pattern(pattern, case_sensitive=case_sensitive) + + def _select_children(self, path: T) -> Iterable[T]: + candidates = list(path.iterdir()) + for candidate in candidates: + if self._dir_only and not candidate.is_dir(): + continue + if self._pattern.match(candidate.name): + yield from self._child_selector(candidate) + + +class _RecursivePatternSelector(_NonTerminalSelector): + def __init__(self, child_pattern_parts: Sequence[str], case_sensitive: bool) -> None: + super().__init__(child_pattern_parts, case_sensitive=case_sensitive) + + def _all_directories(self, path: T) -> Iterable[T]: + # Depth-first traversal of directory tree, visiting this node first. 
+ yield path + children = [child for child in path.iterdir() if child.is_dir()] + for child in children: + yield from self._all_directories(child) + + def _select_children(self, path: T) -> Iterable[T]: + yielded = set() + for starting_point in self._all_directories(path): + for candidate in self._child_selector(starting_point): + if candidate not in yielded: + yielded.add(candidate) + yield candidate diff --git a/tests/unit/test_paths.py b/tests/unit/test_paths.py index 76747f7..1a16f14 100644 --- a/tests/unit/test_paths.py +++ b/tests/unit/test_paths.py @@ -1,10 +1,12 @@ import os +from collections.abc import Iterator from pathlib import Path, PurePath, PurePosixPath, PureWindowsPath from unittest.mock import create_autospec, patch import pytest from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound +from databricks.sdk.mixins.workspace import WorkspaceExt from databricks.sdk.service.workspace import ( ImportFormat, Language, @@ -386,6 +388,28 @@ def test_match() -> None: assert not WorkspacePath(ws, "/foo/bar/file.txt").match("/**/*.txt") +def test_iterdir() -> None: + """Test that iterating through a directory works.""" + ws = create_autospec(WorkspaceClient) + ws.workspace.list.return_value = iter( + ( + ObjectInfo(path="/home/bob"), + ObjectInfo(path="/home/jane"), + ObjectInfo(path="/home/ted"), + ObjectInfo(path="/home/fred"), + ) + ) + + children = set(WorkspacePath(ws, "/home").iterdir()) + + assert children == { + WorkspacePath(ws, "/home/bob"), + WorkspacePath(ws, "/home/jane"), + WorkspacePath(ws, "/home/ted"), + WorkspacePath(ws, "/home/fred"), + } + + def test_exists_when_path_exists(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") @@ -692,7 +716,214 @@ def test_is_notebook_when_databricks_error_occurs(): assert workspace_path.is_notebook() is False -@pytest.mark.xfail(reason="Implementation pending.") +class StubWorkspaceFilesystem: + """Stub the basic Workspace filesystem operations.""" + + __slots__ = ("_paths",) + + def __init__(self, *known_paths: str | ObjectInfo) -> None: + """Initialize a virtual filesystem with a set of known paths. + + Each known path can either be a string or a complete ObjectInfo instance; if a string then the path is + treated as a directory if it has a trailing-/ or a file otherwise. 
+ """ + fs_entries = [self._normalize_path(p) for p in known_paths] + keyed_entries = {o.path: o for o in fs_entries if o.path is not None} + self._normalize_paths(keyed_entries) + self._paths = keyed_entries + + @classmethod + def _normalize_path(cls, path: str | ObjectInfo) -> ObjectInfo: + if isinstance(path, ObjectInfo): + return path + return ObjectInfo( + path=path.rstrip("/") if path != "/" else path, + object_type=ObjectType.DIRECTORY if path.endswith("/") else ObjectType.FILE, + ) + + @classmethod + def _normalize_paths(cls, paths: dict[str, ObjectInfo]) -> None: + """Validate entries are absolute and that intermediate directories are both present and typed correctly.""" + for p in list(paths): + for parent in PurePath(p).parents: + path = str(parent) + paths.setdefault(path, ObjectInfo(path=path, object_type=ObjectType.DIRECTORY)) + + def _stub_get_status(self, path: str) -> ObjectInfo: + object_info = self._paths.get(path) + if object_info is None: + msg = f"Simulated path not found: {path}" + raise NotFound(msg) + return object_info + + def _stub_list( + self, path: str, *, notebooks_modified_after: int | None = None, recursive: bool | None = False, **kwargs + ) -> Iterator[ObjectInfo]: + path = path.rstrip("/") + path_len = len(path) + for candidate, object_info in self._paths.items(): + # Only direct children, and excluding the path itself. + if ( + len(candidate) > (path_len + 1) + and candidate[:path_len] == path + and candidate[path_len] == "/" + and "/" not in candidate[path_len + 1 :] + ): + yield object_info + + def mock(self) -> WorkspaceExt: + m = create_autospec(WorkspaceExt) + m.get_status.side_effect = self._stub_get_status + m.list.side_effect = self._stub_list + return m + + +def test_globbing_literals() -> None: + """Verify that trivial (literal) globs for one or more path segments match (or doesn't).""" + ws = create_autospec(WorkspaceClient) + ws.workspace = StubWorkspaceFilesystem("/home/bob/bin/labs", "/etc/passwd").mock() + + assert set(WorkspacePath(ws, "/home").glob("jane")) == set() + assert set(WorkspacePath(ws, "/home").glob("bob")) == {WorkspacePath(ws, "/home/bob")} + assert set(WorkspacePath(ws, "/home").glob("bob/")) == {WorkspacePath(ws, "/home/bob")} + assert set(WorkspacePath(ws, "/home").glob("bob/bin")) == {WorkspacePath(ws, "/home/bob/bin")} + assert set(WorkspacePath(ws, "/home").glob("bob/bin/labs/")) == set() + assert set(WorkspacePath(ws, "/etc").glob("passwd")) == {WorkspacePath(ws, "/etc/passwd")} + + +def test_globbing_empty_error() -> None: + """Verify that an empty glob triggers an immediate error.""" + ws = create_autospec(WorkspaceClient) + + with pytest.raises(ValueError, match="must not be empty"): + _ = set(WorkspacePath(ws, "/etc/passwd").glob("")) + + +def test_globbing_absolute_error() -> None: + """Verify that absolute-path globs triggers an immediate error.""" + ws = create_autospec(WorkspaceClient) + + with pytest.raises(NotImplementedError, match="Non-relative patterns are unsupported"): + _ = set(WorkspacePath(ws, "/").glob("/tmp/*")) + + +def test_globbing_patterns() -> None: + """Verify that globbing with globs works as expected, including across multiple path segments.""" + ws = create_autospec(WorkspaceClient) + ws.workspace = StubWorkspaceFilesystem( + "/home/bob/bin/labs", + "/home/bob/bin/databricks", + "/home/bot/", + "/home/bat/", + "/etc/passwd", + ).mock() + + assert set(WorkspacePath(ws, "/home").glob("*")) == { + WorkspacePath(ws, "/home/bob"), + WorkspacePath(ws, "/home/bot"), + WorkspacePath(ws, 
"/home/bat"), + } + assert set(WorkspacePath(ws, "/home/bob/bin").glob("*")) == { + WorkspacePath(ws, "/home/bob/bin/databricks"), + WorkspacePath(ws, "/home/bob/bin/labs"), + } + assert set(WorkspacePath(ws, "/home/bob").glob("*/*")) == { + WorkspacePath(ws, "/home/bob/bin/databricks"), + WorkspacePath(ws, "/home/bob/bin/labs"), + } + assert set(WorkspacePath(ws, "/home/bob/bin").glob("*a*")) == { + WorkspacePath(ws, "/home/bob/bin/databricks"), + WorkspacePath(ws, "/home/bob/bin/labs"), + } + assert set(WorkspacePath(ws, "/home").glob("bo[bt]")) == { + WorkspacePath(ws, "/home/bob"), + WorkspacePath(ws, "/home/bot"), + } + assert set(WorkspacePath(ws, "/home").glob("b[!o]t")) == {WorkspacePath(ws, "/home/bat")} + + +def test_glob_trailing_slash() -> None: + """Verify that globs with a trailing slash only match directories.""" + ws = create_autospec(WorkspaceClient) + ws.workspace = StubWorkspaceFilesystem( + "/home/bob/bin/labs", + "/home/bob/bin/databricks", + "/home/bob/.profile", + ).mock() + + assert set(WorkspacePath(ws, "/home/bob").glob("*/")) == {WorkspacePath(ws, "/home/bob/bin")} + assert set(WorkspacePath(ws, "/home").glob("bob/*/")) == {WorkspacePath(ws, "/home/bob/bin")} + # Negative test. + assert WorkspacePath(ws, "/home/bob/.profile") in set(WorkspacePath(ws, "/home").glob("bob/*")) + + +def test_glob_parent_path_traversal_error() -> None: + """Globs are normally allowed to include /../ segments to traverse directories; these aren't supported though.""" + ws = create_autospec(WorkspaceClient) + + with pytest.raises(ValueError, match="Parent traversal is not supported"): + _ = set(WorkspacePath(ws, "/usr").glob("sbin/../bin")) + + +def test_recursive_glob() -> None: + """Verify that recursive globs work properly.""" + ws = create_autospec(WorkspaceClient) + ws.workspace = StubWorkspaceFilesystem( + "/home/bob/bin/labs", + "/usr/local/bin/labs", + "/usr/local/sbin/labs", + ).mock() + + assert set(WorkspacePath(ws, "/").glob("**/bin/labs")) == { + WorkspacePath(ws, "/home/bob/bin/labs"), + WorkspacePath(ws, "/usr/local/bin/labs"), + } + assert set(WorkspacePath(ws, "/").glob("usr/**/labs")) == { + WorkspacePath(ws, "/usr/local/bin/labs"), + WorkspacePath(ws, "/usr/local/sbin/labs"), + } + assert set(WorkspacePath(ws, "/").glob("usr/**")) == { + WorkspacePath(ws, "/usr"), + WorkspacePath(ws, "/usr/local"), + WorkspacePath(ws, "/usr/local/bin"), + WorkspacePath(ws, "/usr/local/sbin"), + } + + +def test_double_recursive_glob() -> None: + """Verify that double-recursive globs work as expected without duplicate results.""" + ws = create_autospec(WorkspaceClient) + ws.workspace = StubWorkspaceFilesystem( + "/some/long/path/with/repeated/path/segments/present", + ).mock() + + assert tuple(WorkspacePath(ws, "/").glob("**/path/**/present")) == ( + WorkspacePath(ws, "/some/long/path/with/repeated/path/segments/present"), + ) + + +def test_glob_case_insensitive() -> None: + """As of python 3.12, globbing is allowed to be case-insensitive irrespective of the underlying filesystem. 
Check this.""" + ws = create_autospec(WorkspaceClient) + ws.workspace = StubWorkspaceFilesystem( + "/home/bob/bin/labs", + "/home/bob/bin/databricks", + "/home/bot/", + "/home/bat/", + "/etc/passwd", + ).mock() + + assert set(WorkspacePath(ws, "/home").glob("B*t", case_sensitive=False)) == { + WorkspacePath(ws, "/home/bot"), + WorkspacePath(ws, "/home/bat"), + } + assert set(WorkspacePath(ws, "/home").glob("bO[TB]", case_sensitive=False)) == { + WorkspacePath(ws, "/home/bot"), + WorkspacePath(ws, "/home/bob"), + } + assert set(WorkspacePath(ws, "/etc").glob("PasSWd", case_sensitive=False)) == {WorkspacePath(ws, "/etc/passwd")} + + def test_globbing_when_nested_json_files_exist(): ws = create_autospec(WorkspaceClient) workspace_path = WorkspacePath(ws, "/test/path") @@ -711,3 +942,16 @@ def test_globbing_when_nested_json_files_exist(): ] result = [str(p) for p in workspace_path.glob("*/*.json")] assert result == ["/test/path/dir1/file1.json", "/test/path/dir2/file2.json"] + + +def test_rglob() -> None: + ws = create_autospec(WorkspaceClient) + ws.workspace = StubWorkspaceFilesystem( + "/test/path/dir1/file1.json", + "/test/path/dir2/file2.json", + ).mock() + + assert set(WorkspacePath(ws, "/test").rglob("*.json")) == { + WorkspacePath(ws, "/test/path/dir1/file1.json"), + WorkspacePath(ws, "/test/path/dir2/file2.json"), + }
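
Taken together, the final patch leaves WorkspacePath with pathlib-style traversal (.iterdir(), .glob() and .rglob()). A minimal usage sketch follows, reusing the mocked-client pattern from the unit tests above; against a real workspace the same calls go through ws.workspace.list().

    from unittest.mock import create_autospec

    from databricks.sdk import WorkspaceClient
    from databricks.sdk.service.workspace import ObjectInfo

    from databricks.labs.blueprint.paths import WorkspacePath

    # Mocked client, exactly as in test_iterdir above.
    ws = create_autospec(WorkspaceClient)
    ws.workspace.list.return_value = iter(
        (
            ObjectInfo(path="/home/bob"),
            ObjectInfo(path="/home/jane"),
        )
    )

    children = set(WorkspacePath(ws, "/home").iterdir())
    assert children == {WorkspacePath(ws, "/home/bob"), WorkspacePath(ws, "/home/jane")}

    # Globbing follows the pattern language documented in _Selector.parse, for example:
    #   WorkspacePath(ws, "/home").glob("*")         -> direct children
    #   WorkspacePath(ws, "/home").rglob("*.json")   -> recursive search for *.json files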