From eb8909b2a9eff9269b85df100c64a939241d3afd Mon Sep 17 00:00:00 2001 From: John Chilton Date: Fri, 26 Jan 2024 11:32:09 -0500 Subject: [PATCH] Implement cross product collection operation tools. These are available in a list(n) x list(m) -> list(nxm) version (a cross product that produces two flat lists) and a list(n) x list(m) -> list(n):list(m) version (a cross product that produces two nested lists). After two lists have been run through one of these two tools - the result is two new lists that can be passed into another tool to perform all-against-all operations using Galaxy's normal collection mapping semantics. The choice of which to use will depend on how you want to continue to process the all-against-all results after the next step in an analysis. My sense is the flat version is "easier" to think about and pick through manually and the nested version preserves more structure if additional collection operation tools will be used to filter or aggregate the results. Some considerations: Apply Rules? I do not believe the Apply Rules tool semantics would allow these operations but certainly the Apply Rules tool could be used to convert the result of the flat version to the nested version or vice versa - so no metadata is really lost per se between the two versions. I think it is still worth including both versions though - they both have utility (both for instance are baked into CWL's workflow semantics - https://docs.sevenbridges.com/docs/about-parallelizing-tool-executions#nested-cross-product) and avoiding requiring complex Apply Rules programs for simple workflows is probably ideal. One Tool vs Two? Marius and I agree that a few simpler tools for these kinds of operations are better. The tool help can be more focused and avoiding the conditional and conditional outputs makes the static analysis done for instance by the workflow editor simpler. 
class CrossProductFlatCollectionTool(DatabaseOperationTool):
    """Model operation producing the flat cross product of two list collections.

    Given a list(n) ``input_a`` and a list(m) ``input_b``, emits two flat
    list(n*m) collections whose element identifiers pair every element of
    ``input_a`` with every element of ``input_b`` (identifiers joined by the
    ``join_identifier`` parameter).  Mapping a downstream tool over the two
    outputs then performs an all-against-all operation using Galaxy's normal
    collection mapping semantics.
    """

    tool_type = "cross_product_flat"
    # Only database copies of the inputs are made, so the source datasets do
    # not need to be in a terminal/ok state for this operation to run.
    require_terminal_states = False
    require_dataset_ok = False

    def produce_outputs(self, trans, out_data, output_collections, incoming, history, **kwds):
        collection_a = incoming["input_a"]
        collection_b = incoming["input_b"]
        join_identifier = incoming["join_identifier"]

        elements_a = {}
        elements_b = {}
        copies = []  # every copied HDA, flushed to the history in one batch

        for dce_a in collection_a.collection.elements:
            for dce_b in collection_b.collection.elements:
                # Pairwise identifier, e.g. "a_b" for join_identifier "_".
                identifier = f"{dce_a.element_identifier}{join_identifier}{dce_b.element_identifier}"

                copy_a = dce_a.element_object.copy(copy_tags=dce_a.element_object.tags, flush=False)
                copy_b = dce_b.element_object.copy(copy_tags=dce_b.element_object.tags, flush=False)
                copies.extend((copy_a, copy_b))
                elements_a[identifier] = copy_a
                elements_b[identifier] = copy_b

        self._add_datasets_to_history(history, copies)
        for output_name, elements in (("output_a", elements_a), ("output_b", elements_b)):
            output_collections.create_collection(
                self.outputs[output_name], output_name, elements=elements, propagate_hda_tags=False
            )


class CrossProductNestedCollectionTool(DatabaseOperationTool):
    """Model operation producing the nested cross product of two list collections.

    Given a list(n) ``input_a`` and a list(m) ``input_b``, emits two
    list(n):list(m) collections — one outer element per element of
    ``input_a``, each containing one inner element per element of
    ``input_b`` — preserving more structure than the flat variant for
    follow-up collection operations that filter or aggregate the
    all-against-all results.
    """

    tool_type = "cross_product_nested"
    # Only database copies of the inputs are made, so the source datasets do
    # not need to be in a terminal/ok state for this operation to run.
    require_terminal_states = False
    require_dataset_ok = False

    def produce_outputs(self, trans, out_data, output_collections, incoming, history, **kwds):
        collection_a = incoming["input_a"]
        collection_b = incoming["input_b"]

        outer_a = {}
        outer_b = {}
        copies = []  # every copied HDA, flushed to the history in one batch

        for dce_a in collection_a.collection.elements:
            inner_a = {}
            inner_b = {}

            for dce_b in collection_b.collection.elements:
                copy_a = dce_a.element_object.copy(copy_tags=dce_a.element_object.tags, flush=False)
                copy_b = dce_b.element_object.copy(copy_tags=dce_b.element_object.tags, flush=False)
                copies.extend((copy_a, copy_b))
                inner_a[dce_b.element_identifier] = copy_a
                inner_b[dce_b.element_identifier] = copy_b

            # Each outer element is itself a new flat list collection keyed by
            # the inner (input_b) element identifiers.
            outer_a[dce_a.element_identifier] = {
                "src": "new_collection",
                "collection_type": "list",
                "elements": inner_a,
            }
            outer_b[dce_a.element_identifier] = {
                "src": "new_collection",
                "collection_type": "list",
                "elements": inner_b,
            }

        self._add_datasets_to_history(history, copies)
        for output_name, elements in (("output_a", outer_a), ("output_b", outer_b)):
            output_collections.create_collection(
                self.outputs[output_name], output_name, elements=elements, propagate_hda_tags=False
            )
/dev/null +++ b/lib/galaxy/tools/model_operation_macros.xml @@ -0,0 +1,40 @@ + + + + + This tool will create new history datasets copied from your input collections but your quota usage will not increase. + + + operation_3436 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/run_tests.sh b/run_tests.sh index 7fbd26bc813c..2e04eab5eee9 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -82,6 +82,19 @@ Run a selenium test against a running server while watching client (fastest iter . .venv/bin/activate # source the virtualenv so can skip run_tests.sh. pytest lib/galaxy_test/selenium/test_workflow_editor.py::TestWorkflowEditor::test_data_input +To run the tool tests for a specific framework test tool +listed in test/functional/tools/sample_tool_conf.xml. + + ./run_tests.sh -framework -id + +If you'd like to skip this script and run it with pytest +directly a command like the following can be used. Note +the framework tools run with conda installation on but 99% +of the tools do not require this so this example includes +disabling that. + + GALAXY_TEST_TOOL_CONF="test/functional/tools/sample_tool_conf.xml" GALAXY_CONFIG_OVERRIDE_CONDA_AUTO_INIT=false pytest test/functional/test_toolbox_pytest.py -k -m tool + Note About Selenium Tests: If using a local selenium driver such as a Chrome or Firefox based one diff --git a/test/functional/tools/sample_tool_conf.xml b/test/functional/tools/sample_tool_conf.xml index ae0f15a5a7bc..53bb418e7bd3 100644 --- a/test/functional/tools/sample_tool_conf.xml +++ b/test/functional/tools/sample_tool_conf.xml @@ -295,6 +295,8 @@ + +