From 6ba6cbad1891c635fdf504f6adc02ff00b52f501 Mon Sep 17 00:00:00 2001 From: bpinsard Date: Tue, 20 Feb 2024 13:50:27 -0500 Subject: [PATCH] mark sensitive only files added in the commit. metadata add rather than init to fix reruns issue --- heudiconv/external/dlad.py | 30 ++++++++++++++++++--------- heudiconv/external/tests/test_dlad.py | 20 ++++++++++++++++++ 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/heudiconv/external/dlad.py b/heudiconv/external/dlad.py index 2d65c2b2..5e1b305a 100644 --- a/heudiconv/external/dlad.py +++ b/heudiconv/external/dlad.py @@ -153,16 +153,16 @@ def add_to_datalad( # annex_add_opts=['--include-dotfiles'] ) - # TODO: filter for only changed files? # Provide metadata for sensitive information - mark_sensitive(ds, "sourcedata") - mark_sensitive(ds, "*_scans.tsv") # top level - mark_sensitive(ds, "*/*_scans.tsv") # within subj - mark_sensitive(ds, "*/*/*_scans.tsv") # within sess/subj - mark_sensitive(ds, "*/anat") # within subj - mark_sensitive(ds, "*/*/anat") # within ses/subj + last_commit = "HEAD" + mark_sensitive(ds, "sourcedata", last_commit) + mark_sensitive(ds, "*_scans.tsv", last_commit) # top level + mark_sensitive(ds, "*/*_scans.tsv", last_commit) # within subj + mark_sensitive(ds, "*/*/*_scans.tsv", last_commit) # within sess/subj + mark_sensitive(ds, "*/anat", last_commit) # within subj + mark_sensitive(ds, "*/*/anat", last_commit) # within ses/subj if dsh_path: - mark_sensitive(ds, ".heudiconv") # entire .heudiconv! + mark_sensitive(ds, ".heudiconv", last_commit) # entire .heudiconv! superds.save(path=ds.path, message=msg, recursive=True) assert not ds.repo.dirty @@ -178,7 +178,7 @@ def add_to_datalad( """ -def mark_sensitive(ds: Dataset, path_glob: str) -> None: +def mark_sensitive(ds: Dataset, path_glob: str, commit: str = None) -> None: """ Parameters @@ -186,18 +186,28 @@ def mark_sensitive(ds: Dataset, path_glob: str) -> None: ds : Dataset to operate on path_glob : str glob of the paths within dataset to work on + commit : str + commit which files to mark Returns ------- None """ paths = glob(op.join(ds.path, path_glob)) + if commit: + paths_in_commit = [ + op.join(ds.path, nf) + for nf in ds.repo.call_git( + ["show", "--name-only", commit, "--format=oneline"] + ).split("\n")[1:] + ] + paths = [p for p in paths if p in paths_in_commit] if not paths: return lgr.debug("Marking %d files with distribution-restrictions field", len(paths)) # set_metadata can be a bloody generator res = ds.repo.set_metadata( - paths, init=dict([("distribution-restrictions", "sensitive")]), recursive=True + paths, add=dict([("distribution-restrictions", "sensitive")]), recursive=True ) if inspect.isgenerator(res): res = list(res) diff --git a/heudiconv/external/tests/test_dlad.py b/heudiconv/external/tests/test_dlad.py index 6518d648..5900311a 100644 --- a/heudiconv/external/tests/test_dlad.py +++ b/heudiconv/external/tests/test_dlad.py @@ -28,3 +28,23 @@ def test_mark_sensitive(tmp_path: Path) -> None: # g2 since the same content assert not all_meta.pop("g1", None) # nothing or empty record assert all_meta == {"f1": target_rec, "f2": target_rec, "g2": target_rec} + + +def test_mark_sensitive_last_commit(tmp_path: Path) -> None: + ds = dl.Dataset(tmp_path).create(force=True) + create_tree( + str(tmp_path), + { + "f1": "d1", + "f2": "d2", + "g1": "d3", + "g2": "d1", + }, + ) + ds.save(".") + mark_sensitive(ds, "f*", "HEAD") + all_meta = dict(ds.repo.get_metadata(".")) + target_rec = {"distribution-restrictions": ["sensitive"]} + # g2 since the same content + assert not all_meta.pop("g1", None) # nothing or empty record + assert all_meta == {"f1": target_rec, "f2": target_rec, "g2": target_rec}