From eb8909b2a9eff9269b85df100c64a939241d3afd Mon Sep 17 00:00:00 2001 From: John Chilton Date: Fri, 26 Jan 2024 11:32:09 -0500 Subject: [PATCH] Implement cross product collection operation tools. These are available in a list(n) x list(m) -> list(nxm) version (a cross product that produces two flat lists) and a list(n) x list(m) -> list(n):list(m) version (a cross product that produces two nested lists). After two lists have been run through one of these two tools - the result is two new lists that can be passed into another tool to perform all-against-all operations using Galaxy's normal collection mapping semantics. The choice of which to use will depend on how you want to continue to process the all-against-all results after the next step in an analysis. My sense is the flat version is "easier" to think about and pick through manually and the nested version preserves more structure if additional collection operation tools will be used to filter or aggregate the results. Some considerations: Apply Rules? I do not believe the Apply Rules tool semantics would allow these operations but certainly the Apply Rules tool could be used to convert the result of the flat version to the nested version or vice versa - so no metadata is really lost per se between the two versions. I think it is still worth including both versions though - they both have utility (both for instance are baked into CWL's workflow semantics - https://docs.sevenbridges.com/docs/about-parallelizing-tool-executions#nested-cross-product) and avoiding requiring complex Apply Rules programs for simple workflows is probably ideal. One Tool vs Two? Marius and I agree that a few simpler tools for these kinds of operations are better. The tool help can be more focused and avoiding the conditional and conditional outputs makes the static analysis done for instance by the workflow editor simpler. 
class CrossProductFlatCollectionTool(DatabaseOperationTool):
    """Model operation producing the flat cross product of two list collections.

    Given a list(n) ``input_a`` and a list(m) ``input_b``, emits two flat
    list(n*m) collections whose element identifiers pair every element of
    ``input_a`` with every element of ``input_b`` (identifiers joined by the
    ``join_identifier`` parameter).  Mapping a downstream tool over the two
    outputs then performs an all-against-all operation using Galaxy's normal
    collection mapping semantics.
    """

    tool_type = "cross_product_flat"
    # Only database copies of the inputs are made, so the source datasets do
    # not need to be in a terminal/ok state for this operation to run.
    require_terminal_states = False
    require_dataset_ok = False

    def produce_outputs(self, trans, out_data, output_collections, incoming, history, **kwds):
        collection_a = incoming["input_a"]
        collection_b = incoming["input_b"]
        join_identifier = incoming["join_identifier"]

        elements_a = {}
        elements_b = {}
        copies = []  # every copied HDA, flushed to the history in one batch

        for dce_a in collection_a.collection.elements:
            for dce_b in collection_b.collection.elements:
                # Pairwise identifier, e.g. "a_b" for join_identifier "_".
                identifier = f"{dce_a.element_identifier}{join_identifier}{dce_b.element_identifier}"

                copy_a = dce_a.element_object.copy(copy_tags=dce_a.element_object.tags, flush=False)
                copy_b = dce_b.element_object.copy(copy_tags=dce_b.element_object.tags, flush=False)
                copies.extend((copy_a, copy_b))
                elements_a[identifier] = copy_a
                elements_b[identifier] = copy_b

        self._add_datasets_to_history(history, copies)
        for output_name, elements in (("output_a", elements_a), ("output_b", elements_b)):
            output_collections.create_collection(
                self.outputs[output_name], output_name, elements=elements, propagate_hda_tags=False
            )


class CrossProductNestedCollectionTool(DatabaseOperationTool):
    """Model operation producing the nested cross product of two list collections.

    Given a list(n) ``input_a`` and a list(m) ``input_b``, emits two
    list(n):list(m) collections — one outer element per element of
    ``input_a``, each containing one inner element per element of
    ``input_b`` — preserving more structure than the flat variant for
    follow-up collection operations that filter or aggregate the
    all-against-all results.
    """

    tool_type = "cross_product_nested"
    # Only database copies of the inputs are made, so the source datasets do
    # not need to be in a terminal/ok state for this operation to run.
    require_terminal_states = False
    require_dataset_ok = False

    def produce_outputs(self, trans, out_data, output_collections, incoming, history, **kwds):
        collection_a = incoming["input_a"]
        collection_b = incoming["input_b"]

        outer_a = {}
        outer_b = {}
        copies = []  # every copied HDA, flushed to the history in one batch

        for dce_a in collection_a.collection.elements:
            inner_a = {}
            inner_b = {}

            for dce_b in collection_b.collection.elements:
                copy_a = dce_a.element_object.copy(copy_tags=dce_a.element_object.tags, flush=False)
                copy_b = dce_b.element_object.copy(copy_tags=dce_b.element_object.tags, flush=False)
                copies.extend((copy_a, copy_b))
                inner_a[dce_b.element_identifier] = copy_a
                inner_b[dce_b.element_identifier] = copy_b

            # Each outer element is itself a new flat list collection keyed by
            # the inner (input_b) element identifiers.
            outer_a[dce_a.element_identifier] = {
                "src": "new_collection",
                "collection_type": "list",
                "elements": inner_a,
            }
            outer_b[dce_a.element_identifier] = {
                "src": "new_collection",
                "collection_type": "list",
                "elements": inner_b,
            }

        self._add_datasets_to_history(history, copies)
        for output_name, elements in (("output_a", outer_a), ("output_b", outer_b)):
            output_collections.create_collection(
                self.outputs[output_name], output_name, elements=elements, propagate_hda_tags=False
            )
/dev/null +++ b/lib/galaxy/tools/model_operation_macros.xml @@ -0,0 +1,40 @@ + + + + + This tool will create new history datasets copied from your input collections but your quota usage will not increase. + + + operation_3436 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/run_tests.sh b/run_tests.sh index 7fbd26bc813c..2e04eab5eee9 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -82,6 +82,19 @@ Run a selenium test against a running server while watching client (fastest iter . .venv/bin/activate # source the virtualenv so can skip run_tests.sh. pytest lib/galaxy_test/selenium/test_workflow_editor.py::TestWorkflowEditor::test_data_input +To run the tool tests for a specific framework test tool +listed in test/functional/tools/sample_tool_conf.xml. + + ./run_tests.sh -framework -id + +If you'd like to skip this script and run it with pytest +directly a command like the following can be used. Note +the framework tools run with conda installation on but 99% +of the tools do not require this so this example includes +disabling that. + + GALAXY_TEST_TOOL_CONF="test/functional/tools/sample_tool_conf.xml" GALAXY_CONFIG_OVERRIDE_CONDA_AUTO_INIT=false pytest test/functional/test_toolbox_pytest.py -k -m tool + Note About Selenium Tests: If using a local selenium driver such as a Chrome or Firefox based one diff --git a/test/functional/tools/sample_tool_conf.xml b/test/functional/tools/sample_tool_conf.xml index ae0f15a5a7bc..53bb418e7bd3 100644 --- a/test/functional/tools/sample_tool_conf.xml +++ b/test/functional/tools/sample_tool_conf.xml @@ -295,6 +295,8 @@ + +