From 7858f2656976e19fc5e724742c6a02a23a990fc0 Mon Sep 17 00:00:00 2001
From: Wolfgang Maier
Date: Wed, 6 Mar 2024 10:47:15 +0100
Subject: [PATCH 01/17] Improve usability of Directory datatype

---
 .../config/sample/datatypes_conf.xml.sample        |   9 +-
 .../converters/archive_to_directory.xml            |  33 +++++
 .../datatypes/converters/tar_to_directory.xml      |   2 +-
 lib/galaxy/datatypes/data.py                       | 137 ++++++++++++++++++
 4 files changed, 176 insertions(+), 5 deletions(-)
 create mode 100644 lib/galaxy/datatypes/converters/archive_to_directory.xml

diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample
index 0e026b5a7b0a..b62d930bb515 100644
--- a/lib/galaxy/config/sample/datatypes_conf.xml.sample
+++ b/lib/galaxy/config/sample/datatypes_conf.xml.sample
@@ -288,13 +288,14 @@
-
+
+
+
-
-
-
+
+

diff --git a/lib/galaxy/datatypes/converters/archive_to_directory.xml b/lib/galaxy/datatypes/converters/archive_to_directory.xml
new file mode 100644
index 000000000000..9ccdba28a30e
--- /dev/null
+++ b/lib/galaxy/datatypes/converters/archive_to_directory.xml
@@ -0,0 +1,33 @@
+
+
+        galaxy-util
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

diff --git a/lib/galaxy/datatypes/converters/tar_to_directory.xml b/lib/galaxy/datatypes/converters/tar_to_directory.xml
index 59354b39b5fd..0160746283da 100644
--- a/lib/galaxy/datatypes/converters/tar_to_directory.xml
+++ b/lib/galaxy/datatypes/converters/tar_to_directory.xml
@@ -1,7 +1,7 @@
-        galaxy-util
+        galaxy-util
     mkdir '$output1.files_path';

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index 0024adea6155..eb4d23c9926b 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -1212,6 +1212,143 @@ def regex_line_dataprovider(
 class Directory(Data):
     """Class representing a directory of files."""
 
+    # The behavior of this class is intended to be similar to that of
+    # composite types, but with arbitrary structure of extra_files.
+
+    # A prioritized list of files in the directory that, if present, can serve
+    # as a replacement for a composite type's primary dataset.
+    recognized_index_files = []
+
+    # Directories converted from archives, and possibly others, have a single
+    # root folder inside the directory. The root_folder attribute lets tools
+    # access that folder without first exploring the directory tree themselves.
+    # Will be set to the empty string for flat directories.
+    MetadataElement(
+        name="root_folder",
+        default=None,
+        desc="Name of the root folder of the directory",
+        readonly=True,
+        optional=False,
+        visible=False,
+    )
+
+    # Capture the actual index file obtained from the recognized_index_files
+    # list for a concrete directory.
+    MetadataElement(
+        name="index_file",
+        default=None,
+        desc="Name of the index file relative to the root folder",
+        readonly=True,
+        optional=False,
+        visible=False,
+    )
+
+    MetadataElement(
+        name="folder_size",
+        default=0,
+        desc="Total size of the folder in bytes",
+        readonly=True,
+        optional=False,
+        visible=False,
+    )
+
+    @classmethod
+    def _get_size(cls, root_path):
+        total_size = 0
+        for dirpath, dirnames, filenames in os.walk(root_path):
+            for f in filenames:
+                fp = os.path.join(dirpath, f)
+                # skip if it is a symbolic link
+                if not os.path.islink(fp):
+                    total_size += os.path.getsize(fp)
+        return total_size
+
+    def set_meta(self, dataset: DatasetProtocol, **kwd):
+        efp = dataset.extra_files_path
+        efp_items = os.listdir(efp)
+        root_folder_name = efp_items[0]
+        if len(efp_items) == 1 and os.path.isdir(os.path.join(efp, root_folder_name)):
+            dataset.metadata.root_folder = root_folder_name
+        else:
+            dataset.metadata.root_folder = ""
+        index_file = None
+        for f in self.recognized_index_files:
+            if os.path.isfile(os.path.join(efp, root_folder_name, f)):
+                index_file = f
+                break
+        dataset.metadata.index_file = index_file
+        dataset.metadata.folder_size = self._get_size(os.path.join(efp, dataset.metadata.root_folder))
+
+    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
+        if not dataset.dataset.purged:
+            dataset.peek = f"{dataset.metadata.root_folder}"
+            dataset.blurb = nice_size(dataset.metadata.folder_size)
+        else:
+            dataset.peek = "file does not exist"
+            dataset.blurb = "file purged from disk"
+
+    def _archive_main_file(
+        self, archive: ZipstreamWrapper, display_name: str, data_filename: str
+    ) -> Tuple[bool, str, str]:
+        """Overrides the method to do nothing.
+
+        No main file gets added to a directory archive.
+        """
+        error, msg, messagetype = False, "", ""
+        return error, msg, messagetype
+
+    def display_data(
+        self,
+        trans,
+        dataset: DatasetHasHidProtocol,
+        preview: bool = False,
+        filename: Optional[str] = None,
+        to_ext: Optional[str] = None,
+        **kwd,
+    ):
+        headers = kwd.get("headers", {})
+        # Prevent IE8 from sniffing content type since we're explicit about it. This prevents intentionally text/plain
+        # content from being rendered in the browser
+        headers["X-Content-Type-Options"] = "nosniff"
+        if to_ext:
+            # Download the directory structure as an archive
+            trans.log_event(f"Download directory for dataset id: {str(dataset.id)}")
+            return self._archive_composite_dataset(trans, dataset, headers, do_action=kwd.get("do_action", "zip"))
+        if preview:
+            root_folder = dataset.metadata.root_folder
+            index_file = dataset.metadata.index_file
+            if not filename or filename == 'index':
+                if root_folder is not None and index_file:
+                    # display the index file in lieu of the empty primary dataset
+                    file_path = os.path.join(dataset.extra_files_path, root_folder, index_file)
+                    self._clean_and_set_mime_type(trans, "text/plain", headers)  # type: ignore[arg-type]
+                    return self._yield_user_file_content(
+                        trans,
+                        dataset,
+                        file_path,
+                        headers
+                    ), headers
+                elif root_folder:
+                    # delegate to the parent method, which knows how to display
+                    # a directory
+                    filename = root_folder
+                else:
+                    # No meaningful display available, show an info message instead
+                    self._clean_and_set_mime_type(trans, "text/plain", headers) # type: ignore[arg-type]
+                    if root_folder is None:
+                        return util.smart_str("Cannot generate preview with incomplete metadata. Resetting metadata may help.")
+                    elif self.recognized_index_files:
+                        return util.smart_str("None of the known key files for the datatype is present. Is the datatype format set correctly?")
+                    else:
+                        return util.smart_str("No preview available for this dataset."), headers
+
+        return super().display_data(
+            trans,
+            dataset=dataset,
+            filename=filename,
+            **kwd,
+        )
+
 
 class GenericAsn1(Text):
     """Class for generic ASN.1 text format"""
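Taken together, patch 01 makes Directory behave like a composite datatype with free-form extra files: set_meta derives root_folder from the unpacked layout and records the first hit from recognized_index_files. The following standalone sketch (file names are hypothetical, not part of the series) replays that root-folder detection outside Galaxy:

```python
import os
import tempfile

# Build a throwaway extra_files_path layout like one produced by unpacking
# an archive with a single top-level folder (names here are hypothetical).
efp = tempfile.mkdtemp()
os.makedirs(os.path.join(efp, "mytool_output", "data"))
open(os.path.join(efp, "mytool_output", "index.html"), "w").close()

# The same decision set_meta() makes: exactly one entry that is a directory
# means the dataset has a root folder; anything else means a flat layout.
efp_items = os.listdir(efp)
if len(efp_items) == 1 and os.path.isdir(os.path.join(efp, efp_items[0])):
    root_folder = efp_items[0]
else:
    root_folder = ""
print(root_folder)  # -> "mytool_output"

# A sub-datatype would then opt into index-file detection, e.g.
#     recognized_index_files = ["index.html", "report.html"]
# and set_meta() records the first match as metadata.index_file.
```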
Resetting metadata may help.") + elif self.recognized_index_files: + return util.smart_str("None of the known key files for the datatype present. Is the datatype format set correctly?") + else: + return util.smart_str("No preview available for this dataset."), headers + + return super().display_data( + trans, + dataset=dataset, + filename=filename, + **kwd, + ) + class GenericAsn1(Text): """Class for generic ASN.1 text format""" From 7833391dc747a8032619a30cbef28a66ffae0b9b Mon Sep 17 00:00:00 2001 From: Wolfgang Maier Date: Fri, 8 Mar 2024 22:51:30 +0100 Subject: [PATCH 02/17] Use dataset's total_size property instead of calculating it again --- lib/galaxy/datatypes/data.py | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py index eb4d23c9926b..11c676f141ca 100644 --- a/lib/galaxy/datatypes/data.py +++ b/lib/galaxy/datatypes/data.py @@ -1243,26 +1243,6 @@ class Directory(Data): visible=False, ) - MetadataElement( - name="folder_size", - default=0, - desc="Total size of the folder in bytes", - readonly=True, - optional=False, - visible=False, - ) - - @classmethod - def _get_size(cls, root_path): - total_size = 0 - for dirpath, dirnames, filenames in os.walk(root_path): - for f in filenames: - fp = os.path.join(dirpath, f) - # skip if it is symbolic link - if not os.path.islink(fp): - total_size += os.path.getsize(fp) - return total_size - def set_meta(self, dataset: DatasetProtocol, **kwd): efp = dataset.extra_files_path efp_items = os.listdir(efp) @@ -1277,12 +1257,11 @@ def set_meta(self, dataset: DatasetProtocol, **kwd): index_file = f break dataset.metadata.index_file = index_file - dataset.metadata.folder_size = self._get_size(os.path.join(efp, dataset.metadata.root_folder)) def set_peek(self, dataset: DatasetProtocol, **kwd) -> None: if not dataset.dataset.purged: dataset.peek = f"{dataset.metadata.root_folder}" - dataset.blurb = nice_size(dataset.metadata.folder_size) + dataset.blurb = nice_size(dataset.dataset.total_size) else: dataset.peek = "file does not exist" dataset.blurb = "file purged from disk" From fcdc27d92ca1abdfc452598b69e374aaab83b88c Mon Sep 17 00:00:00 2001 From: Wolfgang Maier Date: Fri, 8 Mar 2024 23:21:14 +0100 Subject: [PATCH 03/17] Add auto-converters for tar.gz and tar.bz2 as suggested by @bernt-matthias --- lib/galaxy/config/sample/datatypes_conf.xml.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample index b62d930bb515..a8e798143f8b 100644 --- a/lib/galaxy/config/sample/datatypes_conf.xml.sample +++ b/lib/galaxy/config/sample/datatypes_conf.xml.sample @@ -292,7 +292,7 @@ - + From 2ff6d33d6922a92ea406fd2d547ccc49686ec6af Mon Sep 17 00:00:00 2001 From: Wolfgang Maier Date: Fri, 8 Mar 2024 23:39:40 +0100 Subject: [PATCH 04/17] Make new converter testable --- lib/galaxy/datatypes/converters/archive_to_directory.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/galaxy/datatypes/converters/archive_to_directory.xml b/lib/galaxy/datatypes/converters/archive_to_directory.xml index 9ccdba28a30e..d0c56c154c09 100644 --- a/lib/galaxy/datatypes/converters/archive_to_directory.xml +++ b/lib/galaxy/datatypes/converters/archive_to_directory.xml @@ -18,6 +18,9 @@ + + + @@ -25,6 +28,7 @@ + From 8e538f2fc774b6b008230b383538f7ffde1e49cc Mon Sep 17 00:00:00 2001 From: Wolfgang Maier Date: Fri, 8 Mar 2024 23:46:57 +0100 Subject: [PATCH 05/17] Add 
From fcdc27d92ca1abdfc452598b69e374aaab83b88c Mon Sep 17 00:00:00 2001
From: Wolfgang Maier
Date: Fri, 8 Mar 2024 23:21:14 +0100
Subject: [PATCH 03/17] Add auto-converters for tar.gz and tar.bz2 as
 suggested by @bernt-matthias

---
 lib/galaxy/config/sample/datatypes_conf.xml.sample | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample
index b62d930bb515..a8e798143f8b 100644
--- a/lib/galaxy/config/sample/datatypes_conf.xml.sample
+++ b/lib/galaxy/config/sample/datatypes_conf.xml.sample
@@ -292,7 +292,7 @@
-
+

From 2ff6d33d6922a92ea406fd2d547ccc49686ec6af Mon Sep 17 00:00:00 2001
From: Wolfgang Maier
Date: Fri, 8 Mar 2024 23:39:40 +0100
Subject: [PATCH 04/17] Make new converter testable

---
 lib/galaxy/datatypes/converters/archive_to_directory.xml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/galaxy/datatypes/converters/archive_to_directory.xml b/lib/galaxy/datatypes/converters/archive_to_directory.xml
index 9ccdba28a30e..d0c56c154c09 100644
--- a/lib/galaxy/datatypes/converters/archive_to_directory.xml
+++ b/lib/galaxy/datatypes/converters/archive_to_directory.xml
@@ -18,6 +18,9 @@
+
+
+
@@ -25,6 +28,7 @@
+

From 8e538f2fc774b6b008230b383538f7ffde1e49cc Mon Sep 17 00:00:00 2001
From: Wolfgang Maier
Date: Fri, 8 Mar 2024 23:46:57 +0100
Subject: [PATCH 05/17] Add file_ext and missing type annotation

---
 lib/galaxy/datatypes/data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index 11c676f141ca..7c9fa7f80194 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -1212,12 +1212,14 @@ def regex_line_dataprovider(
 class Directory(Data):
     """Class representing a directory of files."""
 
+    file_ext = "directory"
+
     # The behavior of this class is intended to be similar to that of
     # composite types, but with arbitrary structure of extra_files.
 
     # A prioritized list of files in the directory that, if present, can serve
     # as a replacement for a composite type's primary dataset.
-    recognized_index_files = []
+    recognized_index_files: List[str] = []

From 7484a6c82fb5b8414fc5698afe5f6c3f15108b11 Mon Sep 17 00:00:00 2001
From: Wolfgang Maier
Date: Fri, 8 Mar 2024 23:54:19 +0100
Subject: [PATCH 06/17] Fix linting

---
 lib/galaxy/datatypes/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index 7c9fa7f80194..eb282fcfaa46 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -1315,7 +1315,7 @@ def display_data(
                     filename = root_folder
                 else:
                     # No meaningful display available, show an info message instead
-                    self._clean_and_set_mime_type(trans, "text/plain", headers) # type: ignore[arg-type]
+                    self._clean_and_set_mime_type(trans, "text/plain", headers)  # type: ignore[arg-type]
                     if root_folder is None:
                         return util.smart_str("Cannot generate preview with incomplete metadata. Resetting metadata may help.")

From 566d526692e254bd010d8b8e688c5cf44c9d0902 Mon Sep 17 00:00:00 2001
From: Wolfgang Maier
Date: Sat, 9 Mar 2024 00:05:09 +0100
Subject: [PATCH 07/17] Fix code formatting

---
 lib/galaxy/datatypes/data.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index eb282fcfaa46..3212ddea409d 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -1298,17 +1298,12 @@ def display_data(
         if preview:
             root_folder = dataset.metadata.root_folder
             index_file = dataset.metadata.index_file
-            if not filename or filename == 'index':
+            if not filename or filename == "index":
                 if root_folder is not None and index_file:
                     # display the index file in lieu of the empty primary dataset
                     file_path = os.path.join(dataset.extra_files_path, root_folder, index_file)
                     self._clean_and_set_mime_type(trans, "text/plain", headers)  # type: ignore[arg-type]
-                    return self._yield_user_file_content(
-                        trans,
-                        dataset,
-                        file_path,
-                        headers
-                    ), headers
+                    return self._yield_user_file_content(trans, dataset, file_path, headers), headers
                 elif root_folder:
                     # delegate to the parent method, which knows how to display
@@ -1317,9 +1312,13 @@ def display_data(
                     filename = root_folder
                 else:
                     # No meaningful display available, show an info message instead
                     self._clean_and_set_mime_type(trans, "text/plain", headers)  # type: ignore[arg-type]
                     if root_folder is None:
-                        return util.smart_str("Cannot generate preview with incomplete metadata. Resetting metadata may help.")
+                        return util.smart_str(
+                            "Cannot generate preview with incomplete metadata. Resetting metadata may help."
+                        )
                     elif self.recognized_index_files:
-                        return util.smart_str("None of the known key files for the datatype is present. Is the datatype format set correctly?")
+                        return util.smart_str(
+                            "None of the known key files for the datatype is present. Is the datatype format set correctly?"
+                        )
                     else:
                         return util.smart_str("No preview available for this dataset."), headers
From 4f648855591b89a86b03266aa5fd65b1fa601f79 Mon Sep 17 00:00:00 2001
From: davelopez <46503462+davelopez@users.noreply.github.com>
Date: Wed, 9 Oct 2024 10:32:42 +0200
Subject: [PATCH 08/17] Drop index-related metadata from Directory datatype

This will go into a sub-datatype when needed.
---
 lib/galaxy/datatypes/data.py | 72 ------------------------------------
 1 file changed, 72 deletions(-)

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index 3212ddea409d..c074bbdc4b1a 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -1217,10 +1217,6 @@ class Directory(Data):
     # The behavior of this class is intended to be similar to that of
     # composite types, but with arbitrary structure of extra_files.
 
-    # A prioritized list of files in the directory that, if present, can serve
-    # as a replacement for a composite type's primary dataset.
-    recognized_index_files: List[str] = []
-
     # Directories converted from archives, and possibly others, have a single
     # root folder inside the directory. The root_folder attribute lets tools
@@ -1234,17 +1230,6 @@ class Directory(Data):
         visible=False,
     )
 
-    # Capture the actual index file obtained from the recognized_index_files
-    # list for a concrete directory.
-    MetadataElement(
-        name="index_file",
-        default=None,
-        desc="Name of the index file relative to the root folder",
-        readonly=True,
-        optional=False,
-        visible=False,
-    )
-
     def set_meta(self, dataset: DatasetProtocol, **kwd):
         efp = dataset.extra_files_path
         efp_items = os.listdir(efp)
@@ -1253,12 +1238,6 @@ def set_meta(self, dataset: DatasetProtocol, **kwd):
             dataset.metadata.root_folder = root_folder_name
         else:
             dataset.metadata.root_folder = ""
-        index_file = None
-        for f in self.recognized_index_files:
-            if os.path.isfile(os.path.join(efp, root_folder_name, f)):
-                index_file = f
-                break
-        dataset.metadata.index_file = index_file
 
     def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
         if not dataset.dataset.purged:
@@ -1278,57 +1257,6 @@ def _archive_main_file(
         error, msg, messagetype = False, "", ""
         return error, msg, messagetype
 
-    def display_data(
-        self,
-        trans,
-        dataset: DatasetHasHidProtocol,
-        preview: bool = False,
-        filename: Optional[str] = None,
-        to_ext: Optional[str] = None,
-        **kwd,
-    ):
-        headers = kwd.get("headers", {})
-        # Prevent IE8 from sniffing content type since we're explicit about it. This prevents intentionally text/plain
-        # content from being rendered in the browser
-        headers["X-Content-Type-Options"] = "nosniff"
-        if to_ext:
-            # Download the directory structure as an archive
-            trans.log_event(f"Download directory for dataset id: {str(dataset.id)}")
-            return self._archive_composite_dataset(trans, dataset, headers, do_action=kwd.get("do_action", "zip"))
-        if preview:
-            root_folder = dataset.metadata.root_folder
-            index_file = dataset.metadata.index_file
-            if not filename or filename == "index":
-                if root_folder is not None and index_file:
-                    # display the index file in lieu of the empty primary dataset
-                    file_path = os.path.join(dataset.extra_files_path, root_folder, index_file)
-                    self._clean_and_set_mime_type(trans, "text/plain", headers)  # type: ignore[arg-type]
-                    return self._yield_user_file_content(trans, dataset, file_path, headers), headers
-                elif root_folder:
-                    # delegate to the parent method, which knows how to display
-                    # a directory
-                    filename = root_folder
-                else:
-                    # No meaningful display available, show an info message instead
-                    self._clean_and_set_mime_type(trans, "text/plain", headers)  # type: ignore[arg-type]
-                    if root_folder is None:
-                        return util.smart_str(
-                            "Cannot generate preview with incomplete metadata. Resetting metadata may help."
-                        )
-                    elif self.recognized_index_files:
-                        return util.smart_str(
-                            "None of the known key files for the datatype is present. Is the datatype format set correctly?"
-                        )
-                    else:
-                        return util.smart_str("No preview available for this dataset."), headers
-
-        return super().display_data(
-            trans,
-            dataset=dataset,
-            filename=filename,
-            **kwd,
-        )
-
 
 class GenericAsn1(Text):
     """Class for generic ASN.1 text format"""
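As the commit message says, index handling is expected to return in a sub-datatype. A hypothetical subclass (class name and file list are assumptions, not code from this series) would reintroduce the hook roughly like this, together with set_meta/display_data logic like that removed above:

```python
from galaxy.datatypes.data import Directory


class ReportDirectory(Directory):
    """Hypothetical directory sub-datatype with a known entry point."""

    file_ext = "report_directory"
    # Checked in priority order; the first existing file would be recorded
    # as the preview entry point, as in the dropped set_meta() logic.
    recognized_index_files = ["index.html", "report.html", "README.txt"]
```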
From 5530e154c031bf1217f00f26d6a14cdf840b173a Mon Sep 17 00:00:00 2001
From: davelopez <46503462+davelopez@users.noreply.github.com>
Date: Thu, 10 Oct 2024 13:10:22 +0200
Subject: [PATCH 09/17] Drop root_folder metadata from Directory datatype

This should probably go in sub-classes that expect specific directory
structures.
---
 lib/galaxy/datatypes/data.py | 24 +-----------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index c074bbdc4b1a..255f145144ce 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -1217,31 +1217,9 @@ class Directory(Data):
     # The behavior of this class is intended to be similar to that of
     # composite types, but with arbitrary structure of extra_files.
 
-    # Directories converted from archives, and possibly others, have a single
-    # root folder inside the directory. The root_folder attribute lets tools
-    # access that folder without first exploring the directory tree themselves.
-    # Will be set to the empty string for flat directories.
-    MetadataElement(
-        name="root_folder",
-        default=None,
-        desc="Name of the root folder of the directory",
-        readonly=True,
-        optional=False,
-        visible=False,
-    )
-
-    def set_meta(self, dataset: DatasetProtocol, **kwd):
-        efp = dataset.extra_files_path
-        efp_items = os.listdir(efp)
-        root_folder_name = efp_items[0]
-        if len(efp_items) == 1 and os.path.isdir(os.path.join(efp, root_folder_name)):
-            dataset.metadata.root_folder = root_folder_name
-        else:
-            dataset.metadata.root_folder = ""
-
     def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
         if not dataset.dataset.purged:
-            dataset.peek = f"{dataset.metadata.root_folder}"
+            dataset.peek = f"{dataset.extra_files_path}"
             dataset.blurb = nice_size(dataset.dataset.total_size)
         else:
             dataset.peek = "file does not exist"

From 4a506fe4fc8425db53186cb181a91b0646256670 Mon Sep 17 00:00:00 2001
From: davelopez <46503462+davelopez@users.noreply.github.com>
Date: Thu, 10 Oct 2024 13:19:02 +0200
Subject: [PATCH 10/17] Do not set peek for directories

---
 lib/galaxy/datatypes/data.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index 255f145144ce..27ed78e3aa61 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -1219,7 +1219,6 @@ class Directory(Data):
 
     def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
         if not dataset.dataset.purged:
-            dataset.peek = f"{dataset.extra_files_path}"
             dataset.blurb = nice_size(dataset.dataset.total_size)
         else:
             dataset.peek = "file does not exist"
             dataset.blurb = "file purged from disk"

From 5ce45817f8051f462a745e46f8df05b251491060 Mon Sep 17 00:00:00 2001
From: davelopez <46503462+davelopez@users.noreply.github.com>
Date: Fri, 11 Oct 2024 10:42:07 +0200
Subject: [PATCH 11/17] Set output format to directory

---
 lib/galaxy/datatypes/converters/archive_to_directory.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/galaxy/datatypes/converters/archive_to_directory.xml b/lib/galaxy/datatypes/converters/archive_to_directory.xml
index d0c56c154c09..45d4a18f4562 100644
--- a/lib/galaxy/datatypes/converters/archive_to_directory.xml
+++ b/lib/galaxy/datatypes/converters/archive_to_directory.xml
@@ -23,7 +23,7 @@
-
+

From 673dd37a036ad9dd6ef0469afe8f269828915647 Mon Sep 17 00:00:00 2001
From: davelopez <46503462+davelopez@users.noreply.github.com>
Date: Fri, 11 Oct 2024 12:24:45 +0200
Subject: [PATCH 12/17] Copy metadata to galaxy.json in archive_to_directory
 converter

---
 lib/galaxy/datatypes/converters/archive_to_directory.xml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/galaxy/datatypes/converters/archive_to_directory.xml b/lib/galaxy/datatypes/converters/archive_to_directory.xml
index 45d4a18f4562..b346b4a49b62 100644
--- a/lib/galaxy/datatypes/converters/archive_to_directory.xml
+++ b/lib/galaxy/datatypes/converters/archive_to_directory.xml
@@ -4,6 +4,7 @@
         galaxy-util
         directory
-
-
+
+

From 75f9de3f1e759561557078635c8120148eccb9fe Mon Sep 17 00:00:00 2001
From: davelopez <46503462+davelopez@users.noreply.github.com>
Date: Fri, 11 Oct 2024 12:59:30 +0200
Subject: [PATCH 13/17] Revert back to use provided_metadata_file

---
 lib/galaxy/datatypes/converters/archive_to_directory.xml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/galaxy/datatypes/converters/archive_to_directory.xml b/lib/galaxy/datatypes/converters/archive_to_directory.xml
--- a/lib/galaxy/datatypes/converters/archive_to_directory.xml
+++ b/lib/galaxy/datatypes/converters/archive_to_directory.xml
@@ -4,7 +4,6 @@
         galaxy-util
         directory
-
+

From dfd2184b10482f79b153733d6d11598bb925b8dd Mon Sep 17 00:00:00 2001
From: davelopez <46503462+davelopez@users.noreply.github.com>
Date: Fri, 11 Oct 2024 16:33:25 +0200
Subject: [PATCH 14/17] Remove set_peek method from Directory datatype

---
 lib/galaxy/datatypes/data.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index 27ed78e3aa61..66a6d9caa18e 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -1214,16 +1214,6 @@ class Directory(Data):
 
     file_ext = "directory"
 
-    # The behavior of this class is intended to be similar to that of
-    # composite types, but with arbitrary structure of extra_files.
-
-    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
-        if not dataset.dataset.purged:
-            dataset.blurb = nice_size(dataset.dataset.total_size)
-        else:
-            dataset.peek = "file does not exist"
-            dataset.blurb = "file purged from disk"
-
     def _archive_main_file(
         self, archive: ZipstreamWrapper, display_name: str, data_filename: str
     ) -> Tuple[bool, str, str]:
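After the removals in patches 08-14 the datatype is intentionally minimal. The following recap (assembled from the diffs above, with local stubs standing in for Galaxy's own base classes so the snippet runs on its own) shows the class's net state at this point:

```python
from typing import Tuple


class Data:  # stub standing in for galaxy.datatypes.data.Data
    pass


class ZipstreamWrapper:  # stub standing in for Galaxy's zipstream wrapper
    pass


class Directory(Data):
    """Class representing a directory of files."""

    file_ext = "directory"

    def _archive_main_file(
        self, archive: ZipstreamWrapper, display_name: str, data_filename: str
    ) -> Tuple[bool, str, str]:
        # No main file gets added to a directory archive; only the
        # extra_files contents are streamed. Returns (error, msg, messagetype).
        return False, "", ""
```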
From ccd7b5254e58b27b9b2654147aae83d3f1cb5454 Mon Sep 17 00:00:00 2001
From: davelopez <46503462+davelopez@users.noreply.github.com>
Date: Tue, 15 Oct 2024 11:10:18 +0200
Subject: [PATCH 15/17] Add roundtrip test for downloading directories

Compressed (Upload) -> Directory (Unpack) -> Compressed (Download)
---
 lib/galaxy/datatypes/data.py             |  1 +
 lib/galaxy_test/api/test_tools_upload.py | 40 ++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index 66a6d9caa18e..e5fea2612ace 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -465,6 +465,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd):
         composite_extensions = trans.app.datatypes_registry.get_composite_extensions()
         composite_extensions.append("html")  # for archiving composite datatypes
         composite_extensions.append("data_manager_json")  # for downloading bundles if bundled.
+        composite_extensions.append("directory")  # for downloading directories.
 
         if data.extension in composite_extensions:
             return self._archive_composite_dataset(trans, data, headers, do_action=kwd.get("do_action", "zip"))
diff --git a/lib/galaxy_test/api/test_tools_upload.py b/lib/galaxy_test/api/test_tools_upload.py
index ed32bc92caf0..57ffece984dc 100644
--- a/lib/galaxy_test/api/test_tools_upload.py
+++ b/lib/galaxy_test/api/test_tools_upload.py
@@ -2,12 +2,14 @@
 import os
 import urllib.parse
 from base64 import b64encode
+from typing import cast
 
 import pytest
 from tusclient import client
 
 from galaxy.tool_util.verify.test_data import TestDataResolver
 from galaxy.util import UNKNOWN
+from galaxy.util.compression_utils import decompress_bytes_to_directory
 from galaxy.util.unittest_utils import (
     skip_if_github_down,
     skip_if_site_down,
 )
@@ -639,6 +641,44 @@ def test_upload_composite_from_bad_tar(self, history_id):
         details = self.dataset_populator.get_history_dataset_details(history_id, dataset=dataset, assert_ok=False)
         assert details["state"] == "error"
 
+    def test_upload_zip_directory_roundtrip(self, history_id):
+        testdir = TestDataResolver().get_filename("testdir1.zip")
+        with open(testdir, "rb") as fh:
+            details = self._upload_and_get_details(fh, api="fetch", history_id=history_id, ext="zip", assert_ok=True)
+        assert details["file_ext"] == "zip"
+
+        # Convert/unpack zip to directory.
+        payload = {
+            "src": "hda",
+            "id": details["id"],
+            "source_type": "zip",
+            "target_type": "directory",
+            "history_id": history_id,
+        }
+        create_response = self._post("tools/CONVERTER_archive_to_directory/convert", data=payload)
+        self.dataset_populator.wait_for_job(create_response.json()["jobs"][0]["id"], assert_ok=True)
+        create_response.raise_for_status()
+        directory_dataset = create_response.json()["outputs"][0]
+
+        # Download (compressed) directory and verify structure.
+        content = self.dataset_populator.get_history_dataset_content(
+            history_id, dataset=directory_dataset, to_ext="directory", type="bytes"
+        )
+        dir_path = decompress_bytes_to_directory(cast(bytes, content))
+        expected_directory_structure = {
+            "file1": "File",
+            "file2": "File",
+            "dir1": "Directory",
+            "dir1/file3": "File",
+        }
+        assert dir_path.endswith("testdir1")
+        for path, entry_class in expected_directory_structure.items():
+            path = os.path.join(dir_path, path)
+            if entry_class == "Directory":
+                assert os.path.isdir(path)
+            else:
+                assert os.path.isfile(path)
+
     def test_upload_dbkey(self):
         with self.dataset_populator.test_history() as history_id:
             payload = self.dataset_populator.upload_payload(history_id, "Test123", dbkey="hg19")
details["state"] == "error" - def test_upload_zip_directory_roundtrip(self, history_id): + def test_upload_tar_roundtrip(self, history_id): + testdir = TestDataResolver().get_filename("testdir.tar") + expected_size = os.path.getsize(testdir) + with open(testdir, "rb") as fh: + details = self._upload_and_get_details(fh, api="fetch", history_id=history_id, assert_ok=True) + assert details["file_ext"] == "tar" + assert details["file_size"] == expected_size + content = cast( + bytes, self.dataset_populator.get_history_dataset_content(history_id, dataset=details, type="bytes") + ) + # Make sure we got the expected content size. + assert len(content) == expected_size + + # Make sure we get the expected contents. + dir_path = decompress_bytes_to_directory(content) + assert dir_path.endswith("testdir") + for path, entry_class in EXPECTED_TAR_CONTENTS.items(): + path = os.path.join(dir_path, os.path.pardir, path) + if entry_class == "Directory": + assert os.path.isdir(path) + else: + assert os.path.isfile(path) + + # Make sure the hash of the content matches the hash of the original file. + expected_hash = md5_hash_file(testdir) + assert expected_hash is not None + self._assert_content_matches_hash(content, expected_hash) + + def _assert_content_matches_hash(self, content: bytes, expected_hash: str): + with tempfile.NamedTemporaryFile("wb") as temp: + temp.write(content) + actual_hash = md5_hash_file(temp.name) + assert actual_hash == expected_hash + + def test_upload_zip_roundtrip(self, history_id): testdir = TestDataResolver().get_filename("testdir1.zip") + expected_size = os.path.getsize(testdir) with open(testdir, "rb") as fh: - details = self._upload_and_get_details(fh, api="fetch", history_id=history_id, ext="zip", assert_ok=True) + details = self._upload_and_get_details(fh, api="fetch", history_id=history_id, assert_ok=True) assert details["file_ext"] == "zip" - - # Convert/unpack zip to directory. - payload = { - "src": "hda", - "id": details["id"], - "source_type": "zip", - "target_type": "directory", - "history_id": history_id, - } - create_response = self._post("tools/CONVERTER_archive_to_directory/convert", data=payload) - self.dataset_populator.wait_for_job(create_response.json()["jobs"][0]["id"], assert_ok=True) - create_response.raise_for_status() - directory_dataset = create_response.json()["outputs"][0] - - # Download (compressed) directory and verify structure. - content = self.dataset_populator.get_history_dataset_content( - history_id, dataset=directory_dataset, to_ext="directory", type="bytes" + assert details["file_size"] == expected_size + content = cast( + bytes, self.dataset_populator.get_history_dataset_content(history_id, dataset=details, type="bytes") ) - dir_path = decompress_bytes_to_directory(cast(bytes, content)) - expected_directory_structure = { + # Make sure we got the expected content size. + assert len(content) == expected_size + + # Make sure we get the expected contents. + dir_path = decompress_bytes_to_directory(content) + assert dir_path.endswith("testdir1") + EXPECTED_ZIP_CONTENTS = { "file1": "File", "file2": "File", - "dir1": "Directory", + "dir1/": "Directory", "dir1/file3": "File", } - assert dir_path.endswith("testdir1") - for path, entry_class in expected_directory_structure.items(): + for path, entry_class in EXPECTED_ZIP_CONTENTS.items(): path = os.path.join(dir_path, path) if entry_class == "Directory": assert os.path.isdir(path) else: assert os.path.isfile(path) + # Make sure the hash of the content matches the hash of the original file. 
From 14eb0c3b6fe2eef0a3ad6b952d265e01582daedd Mon Sep 17 00:00:00 2001
From: davelopez <46503462+davelopez@users.noreply.github.com>
Date: Wed, 30 Oct 2024 18:12:28 +0100
Subject: [PATCH 17/17] Flush temporary file before testing hashes

---
 lib/galaxy_test/api/test_tools_upload.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/galaxy_test/api/test_tools_upload.py b/lib/galaxy_test/api/test_tools_upload.py
index 952f2066bde7..15d0198f9c31 100644
--- a/lib/galaxy_test/api/test_tools_upload.py
+++ b/lib/galaxy_test/api/test_tools_upload.py
@@ -675,6 +675,7 @@ def test_upload_tar_roundtrip(self, history_id):
     def _assert_content_matches_hash(self, content: bytes, expected_hash: str):
         with tempfile.NamedTemporaryFile("wb") as temp:
             temp.write(content)
+            temp.flush()
             actual_hash = md5_hash_file(temp.name)
             assert actual_hash == expected_hash
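A closing note on why patch 17's one-line fix matters: NamedTemporaryFile buffers writes in user space, so hashing the file by name before flush() can see a truncated file. A self-contained demonstration (plain hashlib stands in for Galaxy's md5_hash_file, which reads the file by path the same way):

```python
import hashlib
import tempfile

payload = b"x" * 1_000_000  # large enough that part may sit in the write buffer


def md5_of(path: str) -> str:
    """Hash a file by path, like md5_hash_file() does in the tests."""
    with open(path, "rb") as fh:
        return hashlib.md5(fh.read()).hexdigest()


with tempfile.NamedTemporaryFile("wb") as temp:
    temp.write(payload)
    before = md5_of(temp.name)  # may hash a truncated file
    temp.flush()                # the fix from PATCH 17
    after = md5_of(temp.name)   # now sees all bytes

print(before == hashlib.md5(payload).hexdigest())  # often False
print(after == hashlib.md5(payload).hexdigest())   # True
```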