From d7f13d488fac7f95421ab7a83e609ac65cea9bda Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 28 Oct 2024 01:23:36 +0000 Subject: [PATCH] GH-125413: Add `pathlib.Path.scandir()` method Add `pathlib.Path.scandir()` as a trivial wrapper of `os.scandir()`. In the private `pathlib._abc.PathBase` class, we can rework the `iterdir()`, `glob()`, `walk()` and `copy()` methods to call `scandir()` and make use of cached directory entry information, and thereby improve performance. Because the `Path.copy()` method is provided by `PathBase`, this also speeds up traversal when copying local files and directories. --- Doc/library/pathlib.rst | 29 ++++++ Doc/whatsnew/3.14.rst | 6 ++ Lib/glob.py | 13 +-- Lib/pathlib/_abc.py | 64 ++++++------- Lib/pathlib/_local.py | 8 ++ Lib/test/test_pathlib/test_pathlib_abc.py | 94 ++++++++++++++----- ...-10-28-01-24-52.gh-issue-125413.Jat5kq.rst | 3 + 7 files changed, 154 insertions(+), 63 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-10-28-01-24-52.gh-issue-125413.Jat5kq.rst diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 4380122eb1be7d..b6fb36554f7cec 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -1289,6 +1289,35 @@ Reading directories raised. +.. method:: Path.scandir() + + When the path points to a directory, return an iterator of + :class:`os.DirEntry` objects corresponding to entries in the directory. The + returned iterator supports the :term:`context manager` protocol. It is + implemented using :func:`os.scandir` and gives the same guarantees. + + Using :meth:`~Path.scandir` instead of :meth:`~Path.iterdir` can + significantly increase the performance of code that also needs file type or + file attribute information, because :class:`os.DirEntry` objects expose + this information if the operating system provides it when scanning a + directory. + + The following example displays the names of subdirectories. The + ``entry.is_dir()`` check will generally not make an additional system call:: + + >>> p = Path('docs') + >>> with p.scandir() as entries: + ... for entry in entries: + ... if entry.is_dir(): + ... entry.name + ... + '_templates' + '_build' + '_static' + + .. versionadded:: 3.14 + + .. method:: Path.glob(pattern, *, case_sensitive=None, recurse_symlinks=False) Glob the given relative *pattern* in the directory represented by this path, diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index a6f595ccf08bf4..8caac8e83a06e3 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -380,6 +380,12 @@ pathlib (Contributed by Barney Gale in :gh:`73991`.) +* Add :meth:`pathlib.Path.scandir` to scan a directory and return an iterator + of :class:`os.DirEntry` objects. This is exactly equivalent to calling + :func:`os.scandir` on a path object. + + (Contributed by Barney Gale in :gh:`125413`.) + pdb --- diff --git a/Lib/glob.py b/Lib/glob.py index 574e5ad51b601d..ce9b3698888dd9 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -364,12 +364,6 @@ def concat_path(path, text): """ raise NotImplementedError - @staticmethod - def parse_entry(entry): - """Returns the path of an entry yielded from scandir(). - """ - raise NotImplementedError - # High-level methods def compile(self, pat): @@ -438,6 +432,7 @@ def select_wildcard(path, exists=False): except OSError: pass else: + prefix = self.add_slash(path) for entry in entries: if match is None or match(entry.name): if dir_only: @@ -446,7 +441,7 @@ def select_wildcard(path, exists=False): continue except OSError: continue - entry_path = self.parse_entry(entry) + entry_path = self.concat_path(prefix, entry.name) if dir_only: yield from select_next(entry_path, exists=True) else: @@ -495,6 +490,7 @@ def select_recursive_step(stack, match_pos): except OSError: pass else: + prefix = self.add_slash(path) for entry in entries: is_dir = False try: @@ -504,7 +500,7 @@ def select_recursive_step(stack, match_pos): pass if is_dir or not dir_only: - entry_path = self.parse_entry(entry) + entry_path = self.concat_path(prefix, entry.name) if match is None or match(str(entry_path), match_pos): if dir_only: yield from select_next(entry_path, exists=True) @@ -533,7 +529,6 @@ class _StringGlobber(_GlobberBase): """ lexists = staticmethod(os.path.lexists) scandir = staticmethod(os.scandir) - parse_entry = operator.attrgetter('path') concat_path = operator.add if os.name == 'nt': diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 11c8018b28f26b..4620a319bc5369 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -94,25 +94,13 @@ class PathGlobber(_GlobberBase): lexists = operator.methodcaller('exists', follow_symlinks=False) add_slash = operator.methodcaller('joinpath', '') - - @staticmethod - def scandir(path): - """Emulates os.scandir(), which returns an object that can be used as - a context manager. This method is called by walk() and glob(). - """ - import contextlib - return contextlib.nullcontext(path.iterdir()) + scandir = operator.methodcaller('scandir') @staticmethod def concat_path(path, text): """Appends text to the given path.""" return path.with_segments(path._raw_path + text) - @staticmethod - def parse_entry(entry): - """Returns the path of an entry yielded from scandir().""" - return entry - class PurePathBase: """Base class for pure path objects. @@ -639,13 +627,23 @@ def write_text(self, data, encoding=None, errors=None, newline=None): with self.open(mode='w', encoding=encoding, errors=errors, newline=newline) as f: return f.write(data) + def scandir(self): + """Yield os.DirEntry objects of the directory contents. + + The children are yielded in arbitrary order, and the + special entries '.' and '..' are not included. + """ + raise UnsupportedOperation(self._unsupported_msg('scandir()')) + def iterdir(self): """Yield path objects of the directory contents. The children are yielded in arbitrary order, and the special entries '.' and '..' are not included. """ - raise UnsupportedOperation(self._unsupported_msg('iterdir()')) + with self.scandir() as entries: + names = [entry.name for entry in entries] + return map(self.joinpath, names) def _glob_selector(self, parts, case_sensitive, recurse_symlinks): if case_sensitive is None: @@ -695,16 +693,17 @@ def walk(self, top_down=True, on_error=None, follow_symlinks=False): if not top_down: paths.append((path, dirnames, filenames)) try: - for child in path.iterdir(): - try: - if child.is_dir(follow_symlinks=follow_symlinks): - if not top_down: - paths.append(child) - dirnames.append(child.name) - else: - filenames.append(child.name) - except OSError: - filenames.append(child.name) + with path.scandir() as entries: + for entry in entries: + try: + if entry.is_dir(follow_symlinks=follow_symlinks): + if not top_down: + paths.append(path.joinpath(entry.name)) + dirnames.append(entry.name) + else: + filenames.append(entry.name) + except OSError: + filenames.append(entry.name) except OSError as error: if on_error is not None: on_error(error) @@ -872,18 +871,19 @@ def copy(self, target, *, follow_symlinks=True, dirs_exist_ok=False, if not isinstance(target, PathBase): target = self.with_segments(target) self._ensure_distinct_path(target) - stack = [(self, target)] + stack = [(self, self, target)] while stack: - src, dst = stack.pop() - if not follow_symlinks and src.is_symlink(): + src_entry, src, dst = stack.pop() + if not follow_symlinks and src_entry.is_symlink(): dst._symlink_to_target_of(src) if preserve_metadata: src._copy_metadata(dst, follow_symlinks=False) - elif src.is_dir(): - children = src.iterdir() - dst.mkdir(exist_ok=dirs_exist_ok) - stack.extend((child, dst.joinpath(child.name)) - for child in children) + elif src_entry.is_dir(): + with src.scandir() as entries: + dst.mkdir(exist_ok=dirs_exist_ok) + stack.extend( + (entry, src.joinpath(entry.name), dst.joinpath(entry.name)) + for entry in entries) if preserve_metadata: src._copy_metadata(dst) else: diff --git a/Lib/pathlib/_local.py b/Lib/pathlib/_local.py index a78997179820b1..ef072b83d96904 100644 --- a/Lib/pathlib/_local.py +++ b/Lib/pathlib/_local.py @@ -615,6 +615,14 @@ def _filter_trailing_slash(self, paths): path_str = path_str[:-1] yield path_str + def scandir(self): + """Yield os.DirEntry objects of the directory contents. + + The children are yielded in arbitrary order, and the + special entries '.' and '..' are not included. + """ + return os.scandir(self) + def iterdir(self): """Yield path objects of the directory contents. diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 08355a71453807..e8556258b92712 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1,4 +1,5 @@ import collections +import contextlib import io import os import errno @@ -1424,6 +1425,24 @@ def close(self): 'st_mode st_ino st_dev st_nlink st_uid st_gid st_size st_atime st_mtime st_ctime') +class DummyDirEntry: + """ + Minimal os.DirEntry-like object. Returned from DummyPath.scandir(). + """ + __slots__ = ('name', '_is_symlink', '_is_dir') + + def __init__(self, name, is_symlink, is_dir): + self.name = name + self._is_symlink = is_symlink + self._is_dir = is_dir + + def is_symlink(self): + return self._is_symlink + + def is_dir(self, *, follow_symlinks=True): + return self._is_dir and (follow_symlinks or not self._is_symlink) + + class DummyPath(PathBase): """ Simple implementation of PathBase that keeps files and directories in @@ -1491,14 +1510,25 @@ def open(self, mode='r', buffering=-1, encoding=None, stream = io.TextIOWrapper(stream, encoding=encoding, errors=errors, newline=newline) return stream - def iterdir(self): - path = str(self.resolve()) - if path in self._files: - raise NotADirectoryError(errno.ENOTDIR, "Not a directory", path) - elif path in self._directories: - return iter([self / name for name in self._directories[path]]) + @contextlib.contextmanager + def scandir(self): + path = self.resolve() + path_str = str(path) + if path_str in self._files: + raise NotADirectoryError(errno.ENOTDIR, "Not a directory", path_str) + elif path_str in self._directories: + yield iter([path.joinpath(name)._dir_entry for name in self._directories[path_str]]) else: - raise FileNotFoundError(errno.ENOENT, "File not found", path) + raise FileNotFoundError(errno.ENOENT, "File not found", path_str) + + @property + def _dir_entry(self): + path_str = str(self) + is_symlink = path_str in self._symlinks + is_directory = (path_str in self._directories + if not is_symlink + else self._symlinks[path_str][1]) + return DummyDirEntry(self.name, is_symlink, is_directory) def mkdir(self, mode=0o777, parents=False, exist_ok=False): path = str(self.parent.resolve() / self.name) @@ -1602,9 +1632,11 @@ def setUp(self): if self.can_symlink: p.joinpath('linkA').symlink_to('fileA') p.joinpath('brokenLink').symlink_to('non-existing') - p.joinpath('linkB').symlink_to('dirB') - p.joinpath('dirA', 'linkC').symlink_to(parser.join('..', 'dirB')) - p.joinpath('dirB', 'linkD').symlink_to(parser.join('..', 'dirB')) + p.joinpath('linkB').symlink_to('dirB', target_is_directory=True) + p.joinpath('dirA', 'linkC').symlink_to( + parser.join('..', 'dirB'), target_is_directory=True) + p.joinpath('dirB', 'linkD').symlink_to( + parser.join('..', 'dirB'), target_is_directory=True) p.joinpath('brokenLinkLoop').symlink_to('brokenLinkLoop') def tearDown(self): @@ -1771,7 +1803,7 @@ def test_copy_symlink_to_existing_symlink(self): source = base / 'copySource' target = base / 'copyTarget' source.symlink_to(base / 'fileA') - target.symlink_to(base / 'dirC') + target.symlink_to(base / 'dirC', target_is_directory=True) self.assertRaises(OSError, source.copy, target) self.assertRaises(OSError, source.copy, target, follow_symlinks=False) @@ -1781,7 +1813,7 @@ def test_copy_symlink_to_existing_directory_symlink(self): source = base / 'copySource' target = base / 'copyTarget' source.symlink_to(base / 'fileA') - target.symlink_to(base / 'dirC') + target.symlink_to(base / 'dirC', target_is_directory=True) self.assertRaises(OSError, source.copy, target) self.assertRaises(OSError, source.copy, target, follow_symlinks=False) @@ -1817,7 +1849,7 @@ def test_copy_directory_symlink_to_existing_symlink(self): base = self.cls(self.base) source = base / 'copySource' target = base / 'copyTarget' - source.symlink_to(base / 'dirC') + source.symlink_to(base / 'dirC', target_is_directory=True) target.symlink_to(base / 'fileA') self.assertRaises(FileExistsError, source.copy, target) self.assertRaises(FileExistsError, source.copy, target, follow_symlinks=False) @@ -1827,8 +1859,8 @@ def test_copy_directory_symlink_to_existing_directory_symlink(self): base = self.cls(self.base) source = base / 'copySource' target = base / 'copyTarget' - source.symlink_to(base / 'dirC' / 'dirD') - target.symlink_to(base / 'dirC') + source.symlink_to(base / 'dirC' / 'dirD', target_is_directory=True) + target.symlink_to(base / 'dirC', target_is_directory=True) self.assertRaises(FileExistsError, source.copy, target) self.assertRaises(FileExistsError, source.copy, target, follow_symlinks=False) @@ -1919,7 +1951,7 @@ def ordered_walk(path): if self.can_symlink: # Add some symlinks source.joinpath('linkC').symlink_to('fileC') - source.joinpath('linkD').symlink_to('dirD') + source.joinpath('linkD').symlink_to('dirD', target_is_directory=True) # Perform the copy target = base / 'copyC' @@ -2187,6 +2219,22 @@ def test_iterdir_nodir(self): self.assertIn(cm.exception.errno, (errno.ENOTDIR, errno.ENOENT, errno.EINVAL)) + def test_scandir(self): + p = self.cls(self.base) + with p.scandir() as entries: + self.assertTrue(list(entries)) + with p.scandir() as entries: + for entry in entries: + child = p / entry.name + self.assertIsNotNone(entry) + self.assertEqual(entry.name, child.name) + self.assertEqual(entry.is_symlink(), + child.is_symlink()) + self.assertEqual(entry.is_dir(follow_symlinks=False), + child.is_dir(follow_symlinks=False)) + if entry.name != 'brokenLinkLoop': + self.assertEqual(entry.is_dir(), child.is_dir()) + def test_glob_common(self): def _check(glob, expected): self.assertEqual(set(glob), { P(self.base, q) for q in expected }) @@ -2432,7 +2480,7 @@ def test_glob_permissions(self): if i % 2: link.symlink_to(P(self.base, "dirE", "nonexistent")) else: - link.symlink_to(P(self.base, "dirC")) + link.symlink_to(P(self.base, "dirC"), target_is_directory=True) self.assertEqual(len(set(base.glob("*"))), 100) self.assertEqual(len(set(base.glob("*/"))), 50) @@ -2515,8 +2563,10 @@ def test_resolve_common(self): self._check_resolve_relative(p, P(self.base, 'foo', 'in', 'spam'), False) # Now create absolute symlinks. d = self.tempdir() - P(self.base, 'dirA', 'linkX').symlink_to(d) - P(self.base, str(d), 'linkY').symlink_to(self.parser.join(self.base, 'dirB')) + P(self.base, 'dirA', 'linkX').symlink_to( + d, target_is_directory=True) + P(self.base, str(d), 'linkY').symlink_to( + self.parser.join(self.base, 'dirB'), target_is_directory=True) p = P(self.base, 'dirA', 'linkX', 'linkY', 'fileB') self._check_resolve_absolute(p, P(self.base, 'dirB', 'fileB')) # Non-strict @@ -2920,7 +2970,7 @@ def setUpWalk(self): f.write(f"I'm {path} and proud of it. Blame test_pathlib.\n") if self.can_symlink: - self.link_path.symlink_to(t2_path) + self.link_path.symlink_to(t2_path, target_is_directory=True) broken_link_path.symlink_to('broken') broken_link2_path.symlink_to(self.cls('tmp3', 'broken')) self.sub2_tree = (self.sub2_path, [], ["broken_link", "broken_link2", "link", "tmp3"]) @@ -3038,7 +3088,7 @@ class DummyPathWithSymlinks(DummyPath): def readlink(self): path = str(self.parent.resolve() / self.name) if path in self._symlinks: - return self.with_segments(self._symlinks[path]) + return self.with_segments(self._symlinks[path][0]) elif path in self._files or path in self._directories: raise OSError(errno.EINVAL, "Not a symlink", path) else: @@ -3050,7 +3100,7 @@ def symlink_to(self, target, target_is_directory=False): if path in self._symlinks: raise FileExistsError(errno.EEXIST, "File exists", path) self._directories[parent].add(self.name) - self._symlinks[path] = str(target) + self._symlinks[path] = str(target), target_is_directory class DummyPathWithSymlinksTest(DummyPathTest): diff --git a/Misc/NEWS.d/next/Library/2024-10-28-01-24-52.gh-issue-125413.Jat5kq.rst b/Misc/NEWS.d/next/Library/2024-10-28-01-24-52.gh-issue-125413.Jat5kq.rst new file mode 100644 index 00000000000000..ddf1f9725d9695 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-28-01-24-52.gh-issue-125413.Jat5kq.rst @@ -0,0 +1,3 @@ +Add :meth:`pathlib.Path.scandir` method to efficiently fetch directory +children and their file attributes. This is a trivial wrapper of +:func:`os.scandir`.