Skip to content

Commit

Permalink
pythonGH-113528: Deoptimise pathlib._abc.PurePathBase (python#113559)
Browse files Browse the repository at this point in the history
Apply pathlib's normalization and performance tuning in `pathlib.PurePath`, but not `pathlib._abc.PurePathBase`.

With this change, the pathlib ABCs do not normalize away alternate path separators, empty segments, or dot segments. A single string given to the initialiser will round-trip by default, i.e. `str(PurePathBase(my_string)) == my_string`. Implementors can apply their own domain-specific path normalization scheme by overriding `__str__()`.

Eliminating path normalization makes maintaining and caching the path's parts and string representation both optional and not very useful, so this commit moves the `_drv`, `_root`, `_tail_cached` and `_str` slots from `PurePathBase` to `PurePath`. Only `_raw_paths` and `_resolving` slots remain in `PurePathBase`. This frees the ABCs from the burden of some of pathlib's hardest-to-understand code.
  • Loading branch information
barneygale authored Jan 9, 2024
1 parent 57bdc6c commit beb80d1
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 140 deletions.
104 changes: 103 additions & 1 deletion Lib/pathlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,20 @@ class PurePath(_abc.PurePathBase):
"""

__slots__ = (
# The `_drv`, `_root` and `_tail_cached` slots store parsed and
# normalized parts of the path. They are set when any of the `drive`,
# `root` or `_tail` properties are accessed for the first time. The
# three-part division corresponds to the result of
# `os.path.splitroot()`, except that the tail is further split on path
# separators (i.e. it is a list of strings), and that the root and
# tail are normalized.
'_drv', '_root', '_tail_cached',

# The `_str` slot stores the string representation of the path,
# computed from the drive, root and tail when `__str__()` is called
# for the first time. It's used to implement `_str_normcase`
'_str',

# The `_str_normcase_cached` slot stores the string path with
# normalized case. It is set when the `_str_normcase` property is
# accessed for the first time. It's used to implement `__eq__()`
Expand Down Expand Up @@ -196,6 +210,94 @@ def __ge__(self, other):
return NotImplemented
return self._parts_normcase >= other._parts_normcase

def __str__(self):
    """Return the path as a string, suitable for passing to system calls.

    The string is built from the drive, root and tail on first use and
    then kept in the `_str` slot. A path that formats to an empty string
    is rendered as '.'.
    """
    try:
        text = self._str
    except AttributeError:
        # First call: format the parsed parts and remember the result.
        text = self._format_parsed_parts(self.drive, self.root,
                                         self._tail) or '.'
        self._str = text
    return text

@classmethod
def _format_parsed_parts(cls, drv, root, tail):
    """Join a parsed drive, root and tail back into a single path string.

    *tail* is a list of path segments; they are joined with the path
    module's separator. Returns an empty string when all three parts are
    empty.
    """
    pathmod = cls.pathmod
    if not (drv or root):
        # Relative path without a drive: if the first segment would itself
        # be read as a drive by splitdrive(), put a '.' segment in front
        # of it before joining.
        if tail and pathmod.splitdrive(tail[0])[0]:
            tail = ['.'] + tail
        return pathmod.sep.join(tail)
    return drv + root + pathmod.sep.join(tail)

def _from_parsed_parts(self, drv, root, tail):
    """Create a new path from already-parsed parts.

    The new object is made via `with_segments()` from the formatted
    string, and its `_str`, `_drv`, `_root` and `_tail_cached` slots are
    filled in directly from the arguments.
    """
    formatted = self._format_parsed_parts(drv, root, tail)
    new_path = self.with_segments(formatted)
    # An empty formatted string is stored as '.'.
    new_path._str = formatted or '.'
    new_path._drv, new_path._root, new_path._tail_cached = drv, root, tail
    return new_path

@classmethod
def _parse_path(cls, path):
    """Split the string *path* into a `(drive, root, tail)` triple.

    Alternate separators are replaced with the primary separator before
    splitting. The tail is a list of interned segment strings with empty
    and '.' segments removed. An empty *path* gives `('', '', [])`.
    """
    if not path:
        return '', '', []
    pathmod = cls.pathmod
    sep = pathmod.sep
    altsep = pathmod.altsep
    if altsep:
        path = path.replace(altsep, sep)
    drv, root, rel = pathmod.splitroot(path)
    if not root and drv.startswith(sep) and not drv.endswith(sep):
        # splitroot() gave no root for a drive that starts with a
        # separator. Supply one for the two UNC shapes:
        #   4 parts, e.g. //server/share
        #   6 parts, e.g. //?/unc/server/share
        drv_parts = drv.split(sep)
        count = len(drv_parts)
        if count == 6 or (count == 4 and drv_parts[2] not in '?.'):
            root = sep
    tail = [sys.intern(str(segment))
            for segment in rel.split(sep)
            if segment and segment != '.']
    return drv, root, tail

def _load_parts(self):
    """Parse the raw path segments and fill in the parsed-part slots.

    The segments in `_raw_paths` are combined into one string (joined
    with `pathmod.join()` when there is more than one), which is parsed
    by `_parse_path()`. The result is stored in `_drv`, `_root` and
    `_tail_cached`.
    """
    segments = self._raw_paths
    if not segments:
        joined = ''
    elif len(segments) == 1:
        # A single segment needs no joining.
        joined = segments[0]
    else:
        joined = self.pathmod.join(*segments)
    parsed = self._parse_path(joined)
    self._drv, self._root, self._tail_cached = parsed

@property
def drive(self):
    """The drive prefix (letter or UNC path), if any."""
    try:
        drv = self._drv
    except AttributeError:
        # Not parsed yet: _load_parts() sets _drv, _root and _tail_cached.
        self._load_parts()
        drv = self._drv
    return drv

@property
def root(self):
    """The root of the path, if any."""
    try:
        value = self._root
    except AttributeError:
        # Not parsed yet: _load_parts() sets _drv, _root and _tail_cached.
        self._load_parts()
        value = self._root
    return value

@property
def _tail(self):
    """The list of parsed path segments held in the `_tail_cached` slot."""
    try:
        segments = self._tail_cached
    except AttributeError:
        # Not parsed yet: _load_parts() sets _drv, _root and _tail_cached.
        self._load_parts()
        segments = self._tail_cached
    return segments

@property
def anchor(self):
    """The drive followed by the root; '' when the path has neither."""
    drv = self.drive
    return drv + self.root

@property
def parts(self):
"""An object providing sequence-like access to the
Expand Down Expand Up @@ -416,7 +518,7 @@ def iterdir(self):
def _scandir(self):
return os.scandir(self)

def _make_child_entry(self, entry):
def _make_child_entry(self, entry, is_dir=False):
# Transform an entry yielded from _scandir() into a path object.
path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str)
Expand Down
142 changes: 29 additions & 113 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import functools
import ntpath
import posixpath
import sys
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO

Expand Down Expand Up @@ -82,7 +81,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
except OSError:
continue
if match(entry.name):
yield parent_path._make_child_entry(entry)
yield parent_path._make_child_entry(entry, dir_only)


def _select_recursive(parent_paths, dir_only, follow_symlinks):
Expand All @@ -105,7 +104,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
for entry in entries:
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
paths.append(path._make_child_entry(entry))
paths.append(path._make_child_entry(entry, dir_only))
continue
except OSError:
pass
Expand Down Expand Up @@ -147,20 +146,6 @@ class PurePathBase:
# in the `__init__()` method.
'_raw_paths',

# The `_drv`, `_root` and `_tail_cached` slots store parsed and
# normalized parts of the path. They are set when any of the `drive`,
# `root` or `_tail` properties are accessed for the first time. The
# three-part division corresponds to the result of
# `os.path.splitroot()`, except that the tail is further split on path
# separators (i.e. it is a list of strings), and that the root and
# tail are normalized.
'_drv', '_root', '_tail_cached',

# The `_str` slot stores the string representation of the path,
# computed from the drive, root and tail when `__str__()` is called
# for the first time. It's used to implement `_str_normcase`
'_str',

# The '_resolving' slot stores a boolean indicating whether the path
# is being processed by `PathBase.resolve()`. This prevents duplicate
# work from occurring when `resolve()` calls `stat()` or `readlink()`.
Expand All @@ -179,65 +164,16 @@ def with_segments(self, *pathsegments):
"""
return type(self)(*pathsegments)

@classmethod
def _parse_path(cls, path):
if not path:
return '', '', []
sep = cls.pathmod.sep
altsep = cls.pathmod.altsep
if altsep:
path = path.replace(altsep, sep)
drv, root, rel = cls.pathmod.splitroot(path)
if not root and drv.startswith(sep) and not drv.endswith(sep):
drv_parts = drv.split(sep)
if len(drv_parts) == 4 and drv_parts[2] not in '?.':
# e.g. //server/share
root = sep
elif len(drv_parts) == 6:
# e.g. //?/unc/server/share
root = sep
parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
return drv, root, parsed

def _load_parts(self):
paths = self._raw_paths
if len(paths) == 0:
path = ''
elif len(paths) == 1:
path = paths[0]
else:
path = self.pathmod.join(*paths)
drv, root, tail = self._parse_path(path)
self._drv = drv
self._root = root
self._tail_cached = tail

def _from_parsed_parts(self, drv, root, tail):
path_str = self._format_parsed_parts(drv, root, tail)
path = self.with_segments(path_str)
path._str = path_str or '.'
path._drv = drv
path._root = root
path._tail_cached = tail
return path

@classmethod
def _format_parsed_parts(cls, drv, root, tail):
if drv or root:
return drv + root + cls.pathmod.sep.join(tail)
elif tail and cls.pathmod.splitdrive(tail[0])[0]:
tail = ['.'] + tail
return cls.pathmod.sep.join(tail)

def __str__(self):
"""Return the string representation of the path, suitable for
passing to system calls."""
try:
return self._str
except AttributeError:
self._str = self._format_parsed_parts(self.drive, self.root,
self._tail) or '.'
return self._str
paths = self._raw_paths
if len(paths) == 1:
return paths[0]
elif paths:
return self.pathmod.join(*paths)
else:
return ''

def as_posix(self):
"""Return the string representation of the path with forward (/)
Expand All @@ -247,42 +183,23 @@ def as_posix(self):
@property
def drive(self):
"""The drive prefix (letter or UNC path), if any."""
try:
return self._drv
except AttributeError:
self._load_parts()
return self._drv
return self.pathmod.splitdrive(str(self))[0]

@property
def root(self):
"""The root of the path, if any."""
try:
return self._root
except AttributeError:
self._load_parts()
return self._root

@property
def _tail(self):
try:
return self._tail_cached
except AttributeError:
self._load_parts()
return self._tail_cached
return self.pathmod.splitroot(str(self))[1]

@property
def anchor(self):
"""The concatenation of the drive and root, or ''."""
anchor = self.drive + self.root
return anchor
drive, root, _ = self.pathmod.splitroot(str(self))
return drive + root

@property
def name(self):
"""The final path component, if any."""
path_str = str(self)
if not path_str or path_str == '.':
return ''
return self.pathmod.basename(path_str)
return self.pathmod.basename(str(self))

@property
def suffix(self):
Expand Down Expand Up @@ -323,13 +240,10 @@ def stem(self):

def with_name(self, name):
"""Return a new path with the file name changed."""
m = self.pathmod
if not name or m.sep in name or (m.altsep and m.altsep in name) or name == '.':
dirname = self.pathmod.dirname
if dirname(name):
raise ValueError(f"Invalid name {name!r}")
parent, old_name = m.split(str(self))
if not old_name or old_name == '.':
raise ValueError(f"{self!r} has an empty name")
return self.with_segments(parent, name)
return self.with_segments(dirname(str(self)), name)

def with_stem(self, stem):
"""Return a new path with the stem changed."""
Expand Down Expand Up @@ -480,7 +394,7 @@ def is_absolute(self):
def is_reserved(self):
"""Return True if the path contains one of the special names reserved
by the system, if any."""
if self.pathmod is posixpath or not self._tail:
if self.pathmod is posixpath or not self.name:
return False

# NOTE: the rules for reserved names seem somewhat complicated
Expand All @@ -490,7 +404,7 @@ def is_reserved(self):
if self.drive.startswith('\\\\'):
# UNC paths are never reserved.
return False
name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ')
name = self.name.partition('.')[0].partition(':')[0].rstrip(' ')
return name.upper() in _WIN_RESERVED_NAMES

def match(self, path_pattern, *, case_sensitive=None):
Expand All @@ -503,9 +417,9 @@ def match(self, path_pattern, *, case_sensitive=None):
case_sensitive = _is_case_sensitive(self.pathmod)
sep = path_pattern.pathmod.sep
pattern_str = str(path_pattern)
if path_pattern.drive or path_pattern.root:
if path_pattern.anchor:
pass
elif path_pattern._tail:
elif path_pattern.parts:
pattern_str = f'**{sep}{pattern_str}'
else:
raise ValueError("empty pattern")
Expand Down Expand Up @@ -780,8 +694,10 @@ def _scandir(self):
from contextlib import nullcontext
return nullcontext(self.iterdir())

def _make_child_entry(self, entry):
def _make_child_entry(self, entry, is_dir=False):
# Transform an entry yielded from _scandir() into a path object.
if is_dir:
return entry.joinpath('')
return entry

def _make_child_relpath(self, name):
Expand All @@ -792,13 +708,13 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
kind, including directories) matching the given relative pattern.
"""
path_pattern = self.with_segments(pattern)
if path_pattern.drive or path_pattern.root:
if path_pattern.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
elif not path_pattern._tail:
elif not path_pattern.parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))

pattern_parts = path_pattern._tail.copy()
if pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
pattern_parts = list(path_pattern.parts)
if not self.pathmod.basename(pattern):
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
pattern_parts.append('')

Expand All @@ -816,7 +732,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self] if self.is_dir() else [])
paths = iter([self.joinpath('')] if self.is_dir() else [])
part_idx = 0
while part_idx < len(pattern_parts):
part = pattern_parts[part_idx]
Expand Down
Loading

0 comments on commit beb80d1

Please sign in to comment.