From ff2ee5a3f246c06f2596c19dc8aaa39e30e0e543 Mon Sep 17 00:00:00 2001
From: Taylor Salo <salot@pennmedicine.upenn.edu>
Date: Wed, 24 Jul 2024 08:20:45 -0400
Subject: [PATCH] Allow bids-filter-file terms for mesh and morphometry files
 (#1210)

---
 docs/usage.rst                 |   6 +-
 xcp_d/data/io_spec.yaml        | 152 +++++++++++++++++++++++++
 xcp_d/tests/test_utils_bids.py |  79 ++++++++++++-
 xcp_d/utils/bids.py            | 196 ++++++++++-----------------------
 xcp_d/workflows/base.py        |   4 +-
 5 files changed, 294 insertions(+), 143 deletions(-)
 create mode 100644 xcp_d/data/io_spec.yaml

diff --git a/docs/usage.rst b/docs/usage.rst
index 5bb3523a8..bc10718ec 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -152,6 +152,7 @@ This argument must point to a JSON file, containing filters that will be fed int
 The keys in this JSON file are unique to XCP-D.
 They are our internal terms for different inputs that will be selected from the preprocessed
 dataset.
+The full list of keys can be found in the file ``xcp_d/data/io_spec.yaml``.
 
 ``"bold"`` determines which preprocessed BOLD files will be chosen.
 You can set a number of entities here, including "session", "task", "space", "resolution", and
@@ -166,7 +167,7 @@ We recommend NOT setting the datatype, suffix, or file extension in the filter f
 ``"t1w"`` selects a native T1w-space, preprocessed T1w file.
 
 ``"t2w"`` selects a native T1w-space, preprocessed T2w file.
-If not T1w file is available, this file will be in T2w space.
+If a T1w file is not available, this file will be in T2w space.
 
 ``"anat_dseg"`` selects a segmentation file in the same space as the BOLD data.
 This file is not used for anything.
@@ -182,6 +183,9 @@ The standard space that will be used depends on the ``"bold"`` files that are se
 ``"template_to_anat_xfm"`` selects a transform from standard space to T1w/T2w space.
 Again, the standard space is determined based on other files.
 
+There are additional keys that control how mesh and morphometry files are selected.
+Please refer to ``io_spec.yaml`` for more information on them.
+
 
 Example bids-filter-file
 ========================
diff --git a/xcp_d/data/io_spec.yaml b/xcp_d/data/io_spec.yaml
new file mode 100644
index 000000000..88ce671b2
--- /dev/null
+++ b/xcp_d/data/io_spec.yaml
@@ -0,0 +1,152 @@
+queries:
+    base:
+        # brain mask in same standard space as BOLD data
+        anat_brainmask:
+            datatype: anat
+            desc: brain
+            extension: .nii.gz
+            suffix: mask
+        # native anatomical-space dseg file
+        anat_dseg:
+            datatype: anat
+            desc: null
+            extension: .nii.gz
+            space: null
+            suffix: dseg
+        # transform from native anatomical (T1w or T2w) space to same standard space as BOLD
+        # "to" entity will be set later
+        anat_to_template_xfm:
+            datatype: anat
+            from:
+            - T1w
+            - T2w
+            suffix: xfm
+        # all preprocessed BOLD files in the right space/resolution/density
+        bold:
+            datatype: func
+            desc:
+            - preproc
+            - null
+            suffix: bold
+        # native T1w-space, preprocessed T1w file
+        t1w:
+            datatype: anat
+            desc: preproc
+            extension: .nii.gz
+            space: null
+            suffix: T1w
+        # native T2w-space or T1w-space, preprocessed T2w file
+        t2w:
+            datatype: anat
+            desc: preproc
+            extension: .nii.gz
+            space:
+            - null
+            - T1w
+            suffix: T2w
+        # transform from standard space to anatomical (T1w or T2w) space
+        # "from" entity will be set later
+        template_to_anat_xfm:
+            datatype: anat
+            suffix: xfm
+            to:
+            - T1w
+            - T2w
+    mesh:
+        # Pial surface mesh in fsnative space (fsLR-space mesh will be searched for separately)
+        lh_pial_surf:
+            datatype: anat
+            desc: null
+            extension: .surf.gii
+            hemi: L
+            suffix: pial
+        # Subject's surface sphere to be used to warp fsnative meshes to fsLR space
+        lh_subject_sphere:
+            datatype: anat
+            desc: reg
+            extension: .surf.gii
+            hemi: L
+            space: null
+            suffix: sphere
+        # White matter surface mesh in fsnative space (fsLR-space mesh will be searched for separately)
+        lh_wm_surf:
+            datatype: anat
+            desc: null
+            extension: .surf.gii
+            hemi: L
+            suffix:
+            - smoothwm
+            - white
+        # Pial surface mesh in fsnative space (fsLR-space mesh will be searched for separately)
+        rh_pial_surf:
+            datatype: anat
+            desc: null
+            extension: .surf.gii
+            hemi: R
+            suffix: pial
+        # Subject's surface sphere to be used to warp fsnative meshes to fsLR space
+        rh_subject_sphere:
+            datatype: anat
+            desc: reg
+            extension: .surf.gii
+            hemi: R
+            space: null
+            suffix: sphere
+        # White matter surface mesh in fsnative space (fsLR-space mesh will be searched for separately)
+        rh_wm_surf:
+            datatype: anat
+            desc: null
+            extension: .surf.gii
+            hemi: R
+            suffix:
+            - smoothwm
+            - white
+    morphometry:
+        # Cortical thickness in fsLR space CIFTI
+        cortical_thickness:
+            datatype: anat
+            den: 91k
+            desc: null
+            extension: .dscalar.nii
+            space: fsLR
+            suffix: thickness
+        # Corrected cortical thickness in fsLR space CIFTI
+        cortical_thickness_corr:
+            datatype: anat
+            den: 91k
+            desc: corrected
+            extension: .dscalar.nii
+            space: fsLR
+            suffix: thickness
+        # Myelin map in fsLR space CIFTI
+        myelin:
+            datatype: anat
+            den: 91k
+            desc: null
+            extension: .dscalar.nii
+            space: fsLR
+            suffix: myelinw
+        # Smoothed myelin map in fsLR space CIFTI
+        myelin_smoothed:
+            datatype: anat
+            den: 91k
+            desc: smoothed
+            extension: .dscalar.nii
+            space: fsLR
+            suffix: myelinw
+        # Sulcal curvature in fsLR space CIFTI
+        sulcal_curv:
+            datatype: anat
+            den: 91k
+            desc: null
+            extension: .dscalar.nii
+            space: fsLR
+            suffix: curv
+        # Sulcal depth in fsLR space CIFTI
+        sulcal_depth:
+            datatype: anat
+            den: 91k
+            desc: null
+            extension: .dscalar.nii
+            space: fsLR
+            suffix: sulc
diff --git a/xcp_d/tests/test_utils_bids.py b/xcp_d/tests/test_utils_bids.py
index cc66431ea..7b731a7cd 100644
--- a/xcp_d/tests/test_utils_bids.py
+++ b/xcp_d/tests/test_utils_bids.py
@@ -126,13 +126,17 @@ def test_collect_mesh_data(datasets, tmp_path_factory):
     """Test collect_mesh_data."""
     # Dataset without mesh files
     layout = BIDSLayout(datasets["fmriprep_without_freesurfer"], validate=False)
-    mesh_available, standard_space_mesh, _, _ = xbids.collect_mesh_data(layout, "1648798153")
+    mesh_available, standard_space_mesh, _, _ = xbids.collect_mesh_data(
+        layout, "1648798153", bids_filters={}
+    )
     assert mesh_available is False
     assert standard_space_mesh is False
 
     # Dataset with native-space mesh files (one file matching each query)
     layout = BIDSLayout(datasets["pnc"], validate=False)
-    mesh_available, standard_space_mesh, _, _ = xbids.collect_mesh_data(layout, "1648798153")
+    mesh_available, standard_space_mesh, _, _ = xbids.collect_mesh_data(
+        layout, "1648798153", bids_filters={}
+    )
     assert mesh_available is True
     assert standard_space_mesh is False
 
@@ -153,7 +157,9 @@ def test_collect_mesh_data(datasets, tmp_path_factory):
         (std_mesh_dir / "sub-1648798153/ses-PNC1/anat").joinpath(f).touch()
 
     layout = BIDSLayout(std_mesh_dir, validate=False)
-    mesh_available, standard_space_mesh, _, _ = xbids.collect_mesh_data(layout, "1648798153")
+    mesh_available, standard_space_mesh, _, _ = xbids.collect_mesh_data(
+        layout, "1648798153", bids_filters={}
+    )
     assert mesh_available is True
     assert standard_space_mesh is True
 
@@ -179,7 +185,72 @@ def test_collect_mesh_data(datasets, tmp_path_factory):
 
     layout = BIDSLayout(std_mesh_dir, validate=False)
     with pytest.raises(ValueError, match="More than one surface found"):
-        xbids.collect_mesh_data(layout, "1648798153")
+        xbids.collect_mesh_data(layout, "1648798153", bids_filters={})
+
+    # If we include BIDS filters, we should be able to ignore the existing files
+    layout = BIDSLayout(datasets["pnc"], validate=False)
+    mesh_available, standard_space_mesh, _, _ = xbids.collect_mesh_data(
+        layout,
+        "1648798153",
+        bids_filters={
+            "lh_pial_surf": {"acquisition": "test"},
+            "rh_pial_surf": {"acquisition": "test"},
+            "lh_wm_surf": {"acquisition": "test"},
+            "rh_wm_surf": {"acquisition": "test"},
+            "lh_subject_sphere": {"acquisition": "test"},
+            "rh_subject_sphere": {"acquisition": "test"},
+        },
+    )
+    assert mesh_available is False
+    assert standard_space_mesh is False
+
+
+def test_collect_morphometry_data(datasets, tmp_path_factory):
+    """Test collect_morphometry_data."""
+    # Dataset without morphometry files
+    layout = BIDSLayout(datasets["fmriprep_without_freesurfer"], validate=False)
+    morph_file_types, _ = xbids.collect_morphometry_data(layout, "1648798153", bids_filters={})
+    assert morph_file_types == []
+
+    # Dataset with morphometry files (one file matching each query)
+    layout = BIDSLayout(datasets["pnc"], validate=False)
+    morph_file_types, _ = xbids.collect_morphometry_data(layout, "1648798153", bids_filters={})
+    assert morph_file_types == ["cortical_thickness", "sulcal_curv", "sulcal_depth"]
+
+    # Dataset with multiple files matching each query (raises an error)
+    bad_morph_dir = tmp_path_factory.mktemp("bad_morph")
+    shutil.copyfile(
+        os.path.join(datasets["pnc"], "dataset_description.json"),
+        bad_morph_dir / "dataset_description.json",
+    )
+    os.makedirs(bad_morph_dir / "sub-1648798153/ses-PNC1/anat", exist_ok=True)
+    files = [
+        "sub-1648798153_ses-PNC1_acq-refaced_space-fsLR_den-91k_thickness.dscalar.nii",
+        "sub-1648798153_ses-PNC1_acq-refaced2_space-fsLR_den-91k_thickness.dscalar.nii",
+        "sub-1648798153_ses-PNC1_acq-refaced_space-fsLR_den-91k_sulc.dscalar.nii",
+        "sub-1648798153_ses-PNC1_acq-refaced2_space-fsLR_den-91k_sulc.dscalar.nii",
+        "sub-1648798153_ses-PNC1_acq-refaced_space-fsLR_den-91k_curv.dscalar.nii",
+        "sub-1648798153_ses-PNC1_acq-refaced2_space-fsLR_den-91k_curv.dscalar.nii",
+    ]
+    for f in files:
+        (bad_morph_dir / "sub-1648798153/ses-PNC1/anat").joinpath(f).touch()
+
+    layout = BIDSLayout(bad_morph_dir, validate=False)
+    with pytest.raises(ValueError, match="More than one .* found"):
+        xbids.collect_morphometry_data(layout, "1648798153", bids_filters={})
+
+    # If we include BIDS filters, we should be able to ignore the existing files
+    layout = BIDSLayout(datasets["pnc"], validate=False)
+    morph_file_types, _ = xbids.collect_morphometry_data(
+        layout,
+        "1648798153",
+        bids_filters={
+            "cortical_thickness": {"acquisition": "test"},
+            "sulcal_curv": {"acquisition": "test"},
+            "sulcal_depth": {"acquisition": "test"},
+        },
+    )
+    assert morph_file_types == []
 
 
 def test_write_dataset_description(datasets, tmp_path_factory, caplog):
diff --git a/xcp_d/utils/bids.py b/xcp_d/utils/bids.py
index 904a06044..f89136af8 100644
--- a/xcp_d/utils/bids.py
+++ b/xcp_d/utils/bids.py
@@ -5,6 +5,7 @@
 Most of the code is copied from niworkflows.
 A PR will be submitted to niworkflows at some point.
 """
+
 import os
 import warnings
 from pathlib import Path
@@ -14,6 +15,7 @@
 from nipype import logging
 from packaging.version import Version
 
+from xcp_d.data import load as load_data
 from xcp_d.utils.doc import fill_doc
 from xcp_d.utils.filemanip import ensure_list
 
@@ -165,55 +167,8 @@ def collect_data(
     %(layout)s
     subj_data : dict
     """
-    queries = {
-        # all preprocessed BOLD files in the right space/resolution/density
-        "bold": {"datatype": "func", "suffix": "bold", "desc": ["preproc", None]},
-        # native T1w-space, preprocessed T1w file
-        "t1w": {
-            "datatype": "anat",
-            "space": None,
-            "desc": "preproc",
-            "suffix": "T1w",
-            "extension": ".nii.gz",
-        },
-        # native T2w-space, preprocessed T1w file
-        "t2w": {
-            "datatype": "anat",
-            "space": [None, "T1w"],
-            "desc": "preproc",
-            "suffix": "T2w",
-            "extension": ".nii.gz",
-        },
-        # native T1w-space dseg file
-        "anat_dseg": {
-            "datatype": "anat",
-            "space": None,
-            "desc": None,
-            "suffix": "dseg",
-            "extension": ".nii.gz",
-        },
-        # transform from standard space to T1w or T2w space
-        # "from" entity will be set later
-        "template_to_anat_xfm": {
-            "datatype": "anat",
-            "to": ["T1w", "T2w"],
-            "suffix": "xfm",
-        },
-        # brain mask in same standard space as BOLD data
-        "anat_brainmask": {
-            "datatype": "anat",
-            "desc": "brain",
-            "suffix": "mask",
-            "extension": ".nii.gz",
-        },
-        # transform from T1w or T2w space to standard space
-        # "to" entity will be set later
-        "anat_to_template_xfm": {
-            "datatype": "anat",
-            "from": ["T1w", "T2w"],
-            "suffix": "xfm",
-        },
-    }
+    _spec = yaml.safe_load(load_data.readable("io_spec.yaml").read_text())
+    queries = _spec["queries"]["base"]
     if input_type in ("hcp", "dcan", "ukb"):
         # HCP/DCAN data have anats only in standard space
         queries["t1w"]["space"] = "MNI152NLin6Asym"
@@ -226,8 +181,9 @@ def collect_data(
 
     # Apply filters. These may override anything.
     bids_filters = bids_filters or {}
-    for acq, entities in bids_filters.items():
-        queries[acq].update(entities)
+    for acq in queries.keys():
+        if acq in bids_filters:
+            queries[acq].update(bids_filters[acq])
 
     # Select the best available space.
     if "space" in queries["bold"]:
@@ -347,13 +303,16 @@ def collect_data(
             else:
                 subj_data[field] = None
 
-    LOGGER.log(25, f"Collected data:\n{yaml.dump(subj_data, default_flow_style=False, indent=4)}")
+    LOGGER.log(
+        25,
+        f"Collected data:\n{yaml.dump(subj_data, default_flow_style=False, indent=4)}",
+    )
 
     return subj_data
 
 
 @fill_doc
-def collect_mesh_data(layout, participant_label):
+def collect_mesh_data(layout, participant_label, bids_filters):
     """Collect surface files from preprocessed derivatives.
 
     This function will try to collect fsLR-space, 32k-resolution surface files first.
@@ -381,38 +340,38 @@ def collect_mesh_data(layout, participant_label):
     # Surfaces to use for brainsprite and anatomical workflow
     # The base surfaces can be used to generate the derived surfaces.
     # The base surfaces may be in native or standard space.
-    base_queries = {
-        "pial_surf": {
-            "desc": None,
-            "suffix": "pial",
-        },
-        "wm_surf": {
-            "desc": None,
-            "suffix": ["smoothwm", "white"],
-        },
-    }
+    _spec = yaml.safe_load(load_data.readable("io_spec.yaml").read_text())
+    queries = _spec["queries"]["mesh"]
 
+    # Apply filters. These may override anything.
+    bids_filters = bids_filters or {}
+    for acq in queries.keys():
+        if acq in bids_filters:
+            queries[acq].update(bids_filters[acq])
+
+    # First, try to grab the first base surface file in standard (fsLR) space.
+    # If it's not available, switch to native fsnative-space data.
     standard_space_mesh = True
-    for name, query in base_queries.items():
-        # First, try to grab the first base surface file in standard space.
-        # If it's not available, switch to native T1w-space data.
-        for hemisphere in ["L", "R"]:
-            temp_files = layout.get(
-                return_type="file",
-                subject=participant_label,
-                datatype="anat",
-                hemi=hemisphere,
-                space="fsLR",
-                den="32k",
-                extension=".surf.gii",
-                **query,
-            )
+    for name, query in queries.items():
+        # Don't look for fsLR-space versions of the subject spheres.
+        if "subject_sphere" in name:
+            continue
 
-            if len(temp_files) == 0:
-                LOGGER.info("No standard-space surfaces found.")
-                standard_space_mesh = False
-            elif len(temp_files) > 1:
-                LOGGER.warning(f"{name}: More than one standard-space surface found.")
+        temp_files = layout.get(
+            return_type="file",
+            subject=participant_label,
+            space="fsLR",
+            den="32k",
+            **query,
+        )
+
+        if len(temp_files) == 0:
+            standard_space_mesh = False
+        elif len(temp_files) > 1:
+            LOGGER.warning(f"{name}: More than one standard-space surface found.")
+
+    if not standard_space_mesh:
+        LOGGER.info("No standard-space surfaces found.")
 
     # Now that we know if there are standard-space surfaces available, we can grab the files.
     query_extras = {}
@@ -420,27 +379,15 @@ def collect_mesh_data(layout, participant_label):
         query_extras = {
             "space": None,
         }
-        # We need the subject spheres if we're using native-space surfaces.
-        base_queries["subject_sphere"] = {
-            "space": None,
-            "desc": "reg",
-            "suffix": "sphere",
-        }
 
     initial_mesh_files = {}
-    queries = {}
-    for name, query in base_queries.items():
-        for hemisphere in ["L", "R"]:
-            key = f"{hemisphere.lower()}h_{name}"
-            queries[key] = {
-                "subject": participant_label,
-                "datatype": "anat",
-                "hemi": hemisphere,
-                "extension": ".surf.gii",
-                **query,
-                **query_extras,
-            }
-            initial_mesh_files[key] = layout.get(return_type="file", **queries[key])
+    for name, query in queries.items():
+        queries[name] = {
+            "subject": participant_label,
+            **query,
+            **query_extras,
+        }
+        initial_mesh_files[name] = layout.get(return_type="file", **queries[name])
 
     mesh_files = {}
     mesh_available = True
@@ -449,8 +396,10 @@ def collect_mesh_data(layout, participant_label):
             mesh_files[dtype] = surface_files_[0]
 
         elif len(surface_files_) == 0:
-            mesh_available = False
             mesh_files[dtype] = None
+            # We don't need subject spheres if we have standard-space meshes already
+            if not ("subject_sphere" in dtype and standard_space_mesh):
+                mesh_available = False
 
         else:
             mesh_available = False
@@ -461,9 +410,6 @@ def collect_mesh_data(layout, participant_label):
                 f"Query: {queries[dtype]}"
             )
 
-    mesh_files["lh_subject_sphere"] = mesh_files.get("lh_subject_sphere", None)
-    mesh_files["rh_subject_sphere"] = mesh_files.get("rh_subject_sphere", None)
-
     # Check for *_space-dhcpAsym_desc-reg_sphere.surf.gii
     # If we find it, we assume segmentation was done with MCRIBS. Otherwise, assume FreeSurfer.
     dhcp_file = layout.get(
@@ -489,7 +435,7 @@ def collect_mesh_data(layout, participant_label):
 
 
 @fill_doc
-def collect_morphometry_data(layout, participant_label):
+def collect_morphometry_data(layout, participant_label, bids_filters):
     """Collect morphometry surface files from preprocessed derivatives.
 
     This function will look for fsLR-space, 91k-resolution morphometry CIFTI files.
@@ -510,42 +456,20 @@ def collect_morphometry_data(layout, participant_label):
         Dictionary of surface file identifiers and their paths.
         If the surface files weren't found, then the paths will be Nones.
     """
-    queries = {
-        "sulcal_depth": {
-            "desc": None,
-            "suffix": "sulc",
-        },
-        "sulcal_curv": {
-            "desc": None,
-            "suffix": "curv",
-        },
-        "cortical_thickness": {
-            "desc": None,
-            "suffix": "thickness",
-        },
-        "cortical_thickness_corr": {
-            "desc": "corrected",
-            "suffix": "thickness",
-        },
-        "myelin": {
-            "desc": None,
-            "suffix": "myelinw",
-        },
-        "myelin_smoothed": {
-            "desc": "smoothed",
-            "suffix": "myelinw",
-        },
-    }
+    _spec = yaml.safe_load(load_data.readable("io_spec.yaml").read_text())
+    queries = _spec["queries"]["morphometry"]
+
+    # Apply filters. These may override anything.
+    bids_filters = bids_filters or {}
+    for acq in queries.keys():
+        if acq in bids_filters:
+            queries[acq].update(bids_filters[acq])
 
     morphometry_files = {}
     for name, query in queries.items():
         files = layout.get(
             return_type="file",
             subject=participant_label,
-            datatype="anat",
-            space="fsLR",
-            den="91k",
-            extension=".dscalar.nii",
             **query,
         )
         if len(files) == 1:
diff --git a/xcp_d/workflows/base.py b/xcp_d/workflows/base.py
index e55055f0a..4df526b6b 100644
--- a/xcp_d/workflows/base.py
+++ b/xcp_d/workflows/base.py
@@ -138,10 +138,12 @@ def init_single_subject_wf(subject_id: str):
     mesh_available, standard_space_mesh, software, mesh_files = collect_mesh_data(
         layout=config.execution.layout,
         participant_label=subject_id,
+        bids_filters=config.execution.bids_filters,
     )
     morph_file_types, morphometry_files = collect_morphometry_data(
         layout=config.execution.layout,
         participant_label=subject_id,
+        bids_filters=config.execution.bids_filters,
     )
 
     # determine the appropriate post-processing workflow
@@ -155,7 +157,6 @@ def init_single_subject_wf(subject_id: str):
     inputnode = pe.Node(
         niu.IdentityInterface(
             fields=[
-                "subj_data",  # not currently used, but will be in future
                 "t1w",
                 "t2w",  # optional
                 "anat_brainmask",  # used to estimate head radius and for QC metrics
@@ -180,7 +181,6 @@ def init_single_subject_wf(subject_id: str):
         ),
         name="inputnode",
     )
-    inputnode.inputs.subj_data = subj_data
     inputnode.inputs.t1w = subj_data["t1w"]
     inputnode.inputs.t2w = subj_data["t2w"]
     inputnode.inputs.anat_brainmask = subj_data["anat_brainmask"]