From c223cdaecd13d20c7a6d459b554940448ccaf8c2 Mon Sep 17 00:00:00 2001 From: skshetry <18718008+skshetry@users.noreply.github.com> Date: Thu, 28 Mar 2024 07:30:12 +0545 Subject: [PATCH] params/metrics: do not fail during dir expansion (#10375) params/metrics: assume file when dir expansion raises error Closes https://github.com/iterative/dvc/issues/10018. We have to be careful when expanding paths. This PR fall backs to assume it as a file when either of `isdir` and `find` raises error due to some mishaps. The consequence of this assumption happens when we try to read the file. In that case, if the directory exists, and we happen to try to open it as a file, dvc will raise `IsADirectoryError`, but that is handled correctly and won't fail catastrophically like this did. --- dvc/repo/metrics/show.py | 21 +++++++++++++++++++-- dvc/repo/params/show.py | 6 +++--- dvc/utils/__init__.py | 11 ----------- tests/func/metrics/test_show.py | 18 ++++++++++++++++++ tests/func/params/test_show.py | 20 +++++++++++++++++++- 5 files changed, 59 insertions(+), 17 deletions(-) diff --git a/dvc/repo/metrics/show.py b/dvc/repo/metrics/show.py index ec86f9648e..d1143453ce 100644 --- a/dvc/repo/metrics/show.py +++ b/dvc/repo/metrics/show.py @@ -1,3 +1,4 @@ +import logging import os from collections.abc import Iterable, Iterator from itertools import chain @@ -8,7 +9,7 @@ from dvc.log import logger from dvc.scm import NoSCMError -from dvc.utils import as_posix, expand_paths +from dvc.utils import as_posix from dvc.utils.collections import ensure_list from dvc.utils.serialize import load_path @@ -103,7 +104,7 @@ def _collect_metrics( paths = (fs.from_os_path(metric) for metric in metrics) # make paths absolute for DVCFileSystem repo_paths = (f"{fs.root_marker}{path}" for path in paths) - return ldistinct(expand_paths(fs, repo_paths)) + return ldistinct(try_expand_paths(fs, repo_paths)) class FileResult(TypedDict, total=False): @@ -116,6 +117,22 @@ class Result(TypedDict, total=False): error: Exception +def try_expand_paths(fs: "FileSystem", paths: Iterable[str]) -> Iterator[str]: + for path in paths: + try: + if fs.isdir(path): + yield from fs.find(path) + continue + except Exception as e: # noqa: BLE001 + logger.debug( + "failed to expand %r: %s", + path, + e, + exc_info=logger.isEnabledFor(logging.TRACE), # type: ignore[attr-defined] + ) + yield path + + def to_relpath(fs: "FileSystem", root_dir: str, d: Result) -> Result: relpath = fs.relpath cwd = fs.getcwd() diff --git a/dvc/repo/params/show.py b/dvc/repo/params/show.py index 036a39b847..de69755c94 100644 --- a/dvc/repo/params/show.py +++ b/dvc/repo/params/show.py @@ -6,9 +6,9 @@ from dvc.dependency.param import ParamsDependency, read_param_file from dvc.log import logger -from dvc.repo.metrics.show import FileResult, Result +from dvc.repo.metrics.show import FileResult, Result, try_expand_paths from dvc.stage import PipelineStage -from dvc.utils import as_posix, expand_paths +from dvc.utils import as_posix from dvc.utils.collections import ensure_list if TYPE_CHECKING: @@ -77,7 +77,7 @@ def _collect_params( path = fs.from_os_path(param) # make paths absolute for DVCFileSystem repo_path = f"{fs.root_marker}{path}" - ret.update(dict.fromkeys(expand_paths(fs, [repo_path]), _params)) + ret.update(dict.fromkeys(try_expand_paths(fs, [repo_path]), _params)) return ret diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index c7f222c3c5..a43d11ac79 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -5,7 +5,6 @@ import os import re import sys -from collections.abc import Iterable, Iterator from typing import TYPE_CHECKING, Optional import colorama @@ -13,8 +12,6 @@ if TYPE_CHECKING: from typing import TextIO - from dvc.fs import FileSystem - LARGE_DIR_SIZE = 100 TARGET_REGEX = re.compile(r"(?P.*?)(:(?P[^\\/:]*))??$") @@ -410,14 +407,6 @@ def errored_revisions(rev_data: dict) -> list: return result -def expand_paths(fs: "FileSystem", paths: Iterable[str]) -> Iterator[str]: - for path in paths: - if fs.isdir(path): - yield from fs.find(path) - else: - yield path - - def isatty(stream: "Optional[TextIO]") -> bool: if stream is None: return False diff --git a/tests/func/metrics/test_show.py b/tests/func/metrics/test_show.py index d9ea003c28..6962a8db94 100644 --- a/tests/func/metrics/test_show.py +++ b/tests/func/metrics/test_show.py @@ -1,5 +1,6 @@ import json import os +import shutil from os.path import join import pytest @@ -9,8 +10,10 @@ from dvc.dvcfile import PROJECT_FILE from dvc.exceptions import OverlappingOutputPathsError from dvc.repo import Repo +from dvc.repo.metrics.show import FileResult, Result from dvc.utils.fs import remove from dvc.utils.serialize import JSONFileCorruptedError +from dvc_data.index import DataIndexDirError def test_show_simple(tmp_dir, dvc, run_copy_metrics): @@ -351,3 +354,18 @@ def test_top_level_parametrized(tmp_dir, dvc): assert dvc.metrics.show() == { "": {"data": {"metrics.yaml": {"data": {"foo": 3, "bar": 10}}}} } + + +def test_metric_in_a_tracked_directory_with_missing_dir_file(M, tmp_dir, dvc): + tmp_dir.dvc_gen({"dir": {"file": "2"}}) + (tmp_dir / "dvc.yaml").dump({"metrics": [join("dir", "file")]}) + shutil.rmtree(tmp_dir / "dir") # remove from workspace + dvc.cache.local.clear() # remove .dir file + + assert dvc.metrics.show() == { + "": Result( + data={ + join("dir", "file"): FileResult(error=M.instance_of(DataIndexDirError)), + } + ) + } diff --git a/tests/func/params/test_show.py b/tests/func/params/test_show.py index 9b81545f8e..145d40901b 100644 --- a/tests/func/params/test_show.py +++ b/tests/func/params/test_show.py @@ -1,9 +1,12 @@ +import shutil from os.path import join import pytest +from dvc.dvcfile import PROJECT_FILE from dvc.repo import Repo -from dvc.repo.stage import PROJECT_FILE +from dvc.repo.metrics.show import FileResult, Result +from dvc_data.index import DataIndexDirError def test_show_empty(dvc): @@ -192,3 +195,18 @@ def test_top_level_parametrized(tmp_dir, dvc): } } } + + +def test_param_in_a_tracked_directory_with_missing_dir_file(M, tmp_dir, dvc): + tmp_dir.dvc_gen({"dir": {"file": "2"}}) + (tmp_dir / "dvc.yaml").dump({"params": [join("dir", "file")]}) + shutil.rmtree(tmp_dir / "dir") # remove from workspace + dvc.cache.local.clear() # remove .dir file + + assert dvc.params.show() == { + "": Result( + data={ + join("dir", "file"): FileResult(error=M.instance_of(DataIndexDirError)), + } + ) + }