From 69a3cda018d4a7f4e7fbe8604949185753808b0f Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 22 Sep 2024 10:35:13 +0200 Subject: [PATCH] Fix history import when parent_hda not serialized and make sure we include the parent_hda when exporting histories. Fixes https://sentry.galaxyproject.org/share/issue/20ee27f675ef476588bbe8ff273eaee7/: ``` KeyError: '31e7840b5aedca43523bfcc8e2e9e56f' File "galaxy/jobs/__init__.py", line 2039, in finish task_wrapper = self.tool.exec_after_process( File "galaxy/tools/__init__.py", line 3100, in exec_after_process JobImportHistoryArchiveWrapper(self.app, job.id).cleanup_after_job() File "galaxy/tools/imp_exp/__init__.py", line 81, in cleanup_after_job model_store.perform_import(new_history, job=job, new_history=True) File "galaxy/model/store/__init__.py", line 416, in perform_import self._import_implicit_dataset_conversions(object_import_tracker) File "galaxy/model/store/__init__.py", line 1280, in _import_implicit_dataset_conversions idc.parent_hda = object_import_tracker.hdas_by_key[idc_attrs["parent_hda"]] ``` --- lib/galaxy/model/__init__.py | 1 + lib/galaxy/model/store/__init__.py | 30 +++++++++--- test/unit/data/model/test_model_store.py | 62 +++++++++++++++++++++--- 3 files changed, 78 insertions(+), 15 deletions(-) diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 8c04e2ecefb5..6354b3654f0e 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -4546,6 +4546,7 @@ class DatasetInstance(RepresentById, UsesCreateAndUpdateTime, _HasTable): copied_from_history_dataset_association: Optional["HistoryDatasetAssociation"] copied_from_library_dataset_dataset_association: Optional["LibraryDatasetDatasetAssociation"] implicitly_converted_datasets: List["ImplicitlyConvertedDatasetAssociation"] + implicitly_converted_parent_datasets: List["ImplicitlyConvertedDatasetAssociation"] validated_states = DatasetValidatedState diff --git a/lib/galaxy/model/store/__init__.py 
b/lib/galaxy/model/store/__init__.py index a27e8c57b10c..a3bf6a6ec896 100644 --- a/lib/galaxy/model/store/__init__.py +++ b/lib/galaxy/model/store/__init__.py @@ -1023,7 +1023,7 @@ def _reassign_hids(self, object_import_tracker: "ObjectImportTracker", history: if object_import_tracker.copy_hid_for: # in an if to avoid flush if unneeded - for from_dataset, to_dataset in object_import_tracker.copy_hid_for.items(): + for from_dataset, to_dataset in object_import_tracker.copy_hid_for: to_dataset.hid = from_dataset.hid self._session_add(to_dataset) self._flush() @@ -1276,18 +1276,24 @@ def _import_implicit_dataset_conversions(self, object_import_tracker: "ObjectImp metadata_safe = False idc = model.ImplicitlyConvertedDatasetAssociation(metadata_safe=metadata_safe, for_import=True) idc.type = idc_attrs["file_type"] - if idc_attrs.get("parent_hda"): - idc.parent_hda = object_import_tracker.hdas_by_key[idc_attrs["parent_hda"]] + # We may not have exported the parent, so only set the parent_hda attribute if we did. + if (parent_hda_id := idc_attrs.get("parent_hda")) and ( + parent_hda := object_import_tracker.hdas_by_key.get(parent_hda_id) + ): + # exports created prior to 24.2 may not have a parent if the parent had been purged + idc.parent_hda = parent_hda if idc_attrs.get("hda"): idc.dataset = object_import_tracker.hdas_by_key[idc_attrs["hda"]] - # we have a the dataset and the parent, lets ensure they land up with the same HID - if idc.dataset and idc.parent_hda and idc.parent_hda in object_import_tracker.requires_hid: + # we have the dataset and the parent, lets ensure they land up with the same HID + if idc.dataset and idc.parent_hda: try: object_import_tracker.requires_hid.remove(idc.dataset) except ValueError: pass # we wanted to remove it anyway. 
- object_import_tracker.copy_hid_for[idc.parent_hda] = idc.dataset + # A HDA can be the parent of multiple implicitly converted datasets, + # that's why we use [(source, target)] here + object_import_tracker.copy_hid_for.append((idc.parent_hda, idc.dataset)) self._session_add(idc) @@ -1370,7 +1376,7 @@ class ObjectImportTracker: hdca_copied_from_sinks: Dict[ObjectKeyType, ObjectKeyType] jobs_by_key: Dict[ObjectKeyType, model.Job] requires_hid: List["HistoryItem"] - copy_hid_for: Dict["HistoryItem", "HistoryItem"] + copy_hid_for: List[Tuple["HistoryItem", "HistoryItem"]] def __init__(self) -> None: self.libraries_by_key = {} @@ -1388,7 +1394,7 @@ def __init__(self) -> None: self.implicit_collection_jobs_by_key: Dict[str, ImplicitCollectionJobs] = {} self.workflows_by_key: Dict[str, model.Workflow] = {} self.requires_hid = [] - self.copy_hid_for = {} + self.copy_hid_for = [] self.new_history: Optional[model.History] = None @@ -2301,6 +2307,14 @@ def add_implicit_conversion_dataset( include_files: bool, conversion: model.ImplicitlyConvertedDatasetAssociation, ) -> None: + parent_hda = conversion.parent_hda + if parent_hda and parent_hda not in self.included_datasets: + # We should always include the parent of an implicit conversion + # to avoid holes in the provenance. 
+ self.included_datasets[parent_hda] = (parent_hda, include_files) + grand_parent_association = parent_hda.implicitly_converted_parent_datasets + if grand_parent_association and (grand_parent_hda := grand_parent_association[0].parent_hda): + self.add_implicit_conversion_dataset(grand_parent_hda, include_files, grand_parent_association[0]) self.included_datasets[dataset] = (dataset, include_files) self.dataset_implicit_conversions[dataset] = conversion diff --git a/test/unit/data/model/test_model_store.py b/test/unit/data/model/test_model_store.py index a30410dd0207..5ea51452e9f9 100644 --- a/test/unit/data/model/test_model_store.py +++ b/test/unit/data/model/test_model_store.py @@ -122,32 +122,80 @@ def test_import_export_history_allow_discarded_data(): assert imported_job.output_datasets[0].dataset == datasets[1] -def test_import_export_history_with_implicit_conversion(): +def setup_history_with_implicit_conversion(): app = _mock_app() u, h, d1, d2, j = _setup_simple_cat_job(app) + intermediate_ext = "bam" + intermediate_implicit_hda = model.HistoryDatasetAssociation( + extension=intermediate_ext, create_dataset=True, flush=False, history=h + ) + intermediate_implicit_hda.hid = d2.hid convert_ext = "fasta" implicit_hda = model.HistoryDatasetAssociation(extension=convert_ext, create_dataset=True, flush=False, history=h) implicit_hda.hid = d2.hid # this adds and flushes the result... 
- d2.attach_implicitly_converted_dataset(app.model.context, implicit_hda, convert_ext) + intermediate_implicit_hda.attach_implicitly_converted_dataset(app.model.context, implicit_hda, convert_ext) + d2.attach_implicitly_converted_dataset(app.model.context, intermediate_implicit_hda, intermediate_ext) + + app.object_store.update_from_file(intermediate_implicit_hda.dataset, file_name=TEST_PATH_2_CONVERTED, create=True) app.object_store.update_from_file(implicit_hda.dataset, file_name=TEST_PATH_2_CONVERTED, create=True) - assert len(h.active_datasets) == 3 + assert len(h.active_datasets) == 4 + return app, h, implicit_hda + + +def test_import_export_history_with_implicit_conversion(): + app, h, _ = setup_history_with_implicit_conversion() imported_history = _import_export_history(app, h, export_files="copy", include_hidden=True) - assert len(imported_history.active_datasets) == 3 + assert len(imported_history.active_datasets) == 4 recovered_hda_2 = imported_history.active_datasets[1] assert recovered_hda_2.implicitly_converted_datasets - imported_conversion = recovered_hda_2.implicitly_converted_datasets[0] - assert imported_conversion.type == "fasta" - assert imported_conversion.dataset == imported_history.active_datasets[2] + intermediate_conversion = recovered_hda_2.implicitly_converted_datasets[0] + assert intermediate_conversion.type == "bam" + intermediate_hda = intermediate_conversion.dataset + assert intermediate_hda.implicitly_converted_datasets + final_conversion = intermediate_hda.implicitly_converted_datasets[0] + + assert final_conversion.type == "fasta" + assert final_conversion.dataset == imported_history.active_datasets[-1] # implicit conversions have the same HID... ensure this property is recovered... 
assert imported_history.active_datasets[2].hid == imported_history.active_datasets[1].hid +def test_import_export_history_with_implicit_conversion_parents_purged(): + app, h, implicit_hda = setup_history_with_implicit_conversion() + # Purge parents + parent = implicit_hda.implicitly_converted_parent_datasets[0].parent_hda + parent.dataset.purged = True + grandparent = parent.implicitly_converted_parent_datasets[0].parent_hda + grandparent.dataset.purged = True + app.model.context.commit() + imported_history = _import_export_history(app, h, export_files="copy", include_hidden=True) + + assert len(imported_history.active_datasets) == 2 + assert len(imported_history.datasets) == 4 + imported_implicit_hda = imported_history.active_datasets[1] + assert imported_implicit_hda.extension == "fasta" + + # implicit conversions have the same HID... ensure this property is recovered... + assert imported_implicit_hda.hid == implicit_hda.hid + assert imported_implicit_hda.implicitly_converted_parent_datasets + intermediate_implicit_conversion = imported_implicit_hda.implicitly_converted_parent_datasets[0] + intermediate_hda = intermediate_implicit_conversion.parent_hda + assert intermediate_hda.hid == implicit_hda.hid + assert intermediate_hda.extension == "bam" + assert intermediate_hda.implicitly_converted_datasets + assert intermediate_hda.implicitly_converted_parent_datasets + first_implicit_conversion = intermediate_hda.implicitly_converted_parent_datasets[0] + source_hda = first_implicit_conversion.parent_hda + assert source_hda.hid == implicit_hda.hid + assert source_hda.extension == "txt" + + def test_import_export_history_with_implicit_conversion_and_extra_files(): app = _mock_app()