dask-contrib · douglasdavis · Jul 6, 2023 · Jul 6, 2023 · Jul 6, 2023 · Jul 6, 2023
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,23 +42,19 @@ io = [
   "pyarrow",
 ]
 complete = [
-  "aiohttp",
-  "pyarrow",
+  "dask-awkward[io]",
 ]
 # `docs` and `test` are separate from user installs
 docs = [
+  "dask-awkward[complete]",
   "dask-sphinx-theme >=3.0.2",
-  "pyarrow",
   "sphinx-design",
-  "pytest >=6.0",
-  "pytest-cov >=3.0.0",
   "requests >=2.27.1",
 ]
 test = [
-  "aiohttp",
+  "dask-awkward[complete]",
   "distributed",
   "pandas",
-  "pyarrow",
   "pytest >=6.0",
   "pytest-cov >=3.0.0",
   "requests >=2.27.1",

diff --git a/src/dask_awkward/lib/optimize.py b/src/dask_awkward/lib/optimize.py
@@ -349,6 +349,8 @@ def _get_column_reports(dsk: HighLevelGraph) -> dict[str, Any]:
 
     layers = dsk.layers.copy()  # type: ignore
     deps = dsk.dependencies.copy()  # type: ignore
+    dependents = dsk.dependents
+
     reports = {}
 
     # make labelled report
@@ -367,12 +369,30 @@ def _get_column_reports(dsk: HighLevelGraph) -> dict[str, Any]:
         layers[name] = _touch_and_call(layers[name])
 
     hlg = HighLevelGraph(layers, deps)
-    outlayer = hlg.layers[hlg._toposort_layers()[-1]]
 
+    # this loop builds up the possible final leaf nodes by inspecting
+    # the dependents dictionary. If something does not have a
+    # dependent, it must be the end of a graph. These are the things
+    # we need to compute for; we only use a single partition (the
+    # first). For a single collection `.compute()` this list will just
+    # be length 1; but if we are using `dask.compute` to pass in
+    # multiple collections to be computed simultaneously, this list
+    # will increase in length.
+    leaf_layers_keys = [
+        (k, 0) for k, v in dependents.items() if isinstance(v, set) and len(v) == 0
+    ]
+
+    # now we try to compute for each possible output layer key (leaf
+    # node on partition 0); this will cause the typetracer reports to
+    # get correct fields/columns touched. If the result is a record or
+    # an array we of course want to touch all of the data/fields.
     try:
         for layer in hlg.layers.values():
             layer.__dict__.pop("_cached_dict", None)
-        out = get_sync(hlg, list(outlayer.keys())[0])
+        for outlayerkey in leaf_layers_keys:
+            out = get_sync(hlg, outlayerkey)
+            if isinstance(out, (ak.Array, ak.Record)):
+                out.layout._touch_data(recursive=True)
     except Exception as err:
         on_fail = dask.config.get("awkward.optimization.on-fail")
         # this is the default, throw a warning but skip the optimization.
@@ -394,8 +414,6 @@ def _get_column_reports(dsk: HighLevelGraph) -> dict[str, Any]:
                 "Valid options are 'warn', 'pass', or 'raise'."
             )
 
-    if isinstance(out, (ak.Array, ak.Record)):
-        out.layout._touch_data(recursive=True)
     return reports
 
 

diff --git a/tests/test_optimize.py b/tests/test_optimize.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+import awkward as ak
+import dask
+
+import dask_awkward as dak
+
+
+def test_multiple_computes(pq_points_dir) -> None:
+    ds1 = dak.from_parquet(pq_points_dir)
+    # add a columns= argument to force a new tokenize result in
+    # from_parquet so we get two unique collections.
+    ds2 = dak.from_parquet(pq_points_dir, columns=["points"])
+
+    lists = [[[1, 2, 3], [4, 5]], [[], [0, 0, 0]]]
+    ds3 = dak.from_lists(lists)
+
+    assert ds1.name != ds2.name
+    things1 = dask.compute(ds1.points.x, ds2.points.y)
+    things2 = dask.compute(ds1.points)
+    assert things2[0].x.tolist() == things1[0].tolist()
+
+    things3 = dask.compute(ds2.points.y, ds1.points.partitions[0])
+    assert things3[0].tolist() == things1[1].tolist()
+
+    assert len(things3[1]) < len(things3[0])
+
+    things = dask.compute(ds1.points, ds2.points.x, ds2.points.y, ds1.points.y, ds3)
+    assert things[-1].tolist() == ak.Array(lists[0] + lists[1]).tolist()