From 99e1fc89d43c4756b6aa60b13c266640fdbe8af5 Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 23 Apr 2024 14:35:07 +0200 Subject: [PATCH 01/14] Add support for Ogg, Webm, Mpeg, Mpga, M4a, Mov, Avi, Wmv, and Wma media types --- lib/galaxy/datatypes/media.py | 90 +++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index 5be3dea9fbb0..893a9e03457d 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -287,3 +287,93 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N dataset.metadata.nchannels = fd.getnchannels() except wave.Error: pass + + +class Ogg(Audio): + file_ext = "ogg" + + def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "ogg" in metadata["format_name"].split(",") + return False + + +class Webm(Video): + file_ext = "webm" + + def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "webm" in metadata["format_name"].split(",") + return False + + +class Mpeg(Video): + file_ext = "mpeg" + + def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mpeg" in metadata["format_name"].split(",") + return False + + +class Mpga(Audio): + file_ext = "mpga" + + def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mp3" in metadata["format_name"].split(",") + return False + + +class M4a(Audio): + file_ext = "m4a" + + def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "m4a" in metadata["format_name"].split(",") + return False + + +class Mov(Video): + file_ext = "mov" + + def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mov" in metadata["format_name"].split(",") + return False + + +class Avi(Video): + file_ext = "avi" + + def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "avi" in metadata["format_name"].split(",") + return False + + +class Wmv(Video): + file_ext = "wmv" + + def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "asf" in metadata["format_name"].split(",") + return False + + +class Wma(Audio): + file_ext = "wma" + + def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "asf" in metadata["format_name"].split(",") + return False From 541939f1ecb88d39f8b62f682ed7b8542e7cdd80 Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 23 Apr 2024 15:13:58 +0200 Subject: [PATCH 02/14] Add support for Ogg, Webm, Mpeg, Mpga, M4a, Mov, Avi, Wmv, and Wma media types --- .../config/sample/datatypes_conf.xml.sample | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample index 96be00d60aa7..7f94405408eb 100644 --- a/lib/galaxy/config/sample/datatypes_conf.xml.sample +++ b/lib/galaxy/config/sample/datatypes_conf.xml.sample @@ -931,10 +931,19 @@ + + + + + + + + + @@ -1246,6 +1255,15 @@ + + + + + + + + + From 498e095d452e99cf030c9ebc681aa4d5d6c2f544 Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 23 Apr 2024 18:10:15 +0200 Subject: [PATCH 03/14] Update media datatypes --- .../config/sample/datatypes_conf.xml.sample | 17 +++---- lib/galaxy/datatypes/media.py | 51 ++++++++++--------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample index 7f94405408eb..8dd9ed0e3487 100644 --- a/lib/galaxy/config/sample/datatypes_conf.xml.sample +++ b/lib/galaxy/config/sample/datatypes_conf.xml.sample @@ -931,19 +931,17 @@ - - - - + + - - - - - + + + + + @@ -1258,7 +1256,6 @@ - diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index 893a9e03457d..a4950eeeebda 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -17,6 +17,10 @@ from galaxy.datatypes.protocols import DatasetProtocol from galaxy.util import which +import magic + +mime = magic.Magic(mime=True) + @lru_cache(maxsize=128) def _ffprobe(path): @@ -183,7 +187,8 @@ class Mkv(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "matroska" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "matroska" in metadata["format_name"].split(",") and mime_type == "video/x-matroska" return False @@ -200,7 +205,8 @@ class Mp4(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "mp4" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "mp4" in metadata["format_name"].split(",") and mime_type == "video/mp4" return False @@ -210,7 +216,8 @@ class Flv(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "flv" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "flv" in metadata["format_name"].split(",") and mime_type == "video/x-flv" return False @@ -220,7 +227,8 @@ class Mpg(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "mpegvideo" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "mpegvideo" in metadata["format_name"].split(",") and mime_type == "video/mpeg" return False @@ -239,7 +247,8 @@ class Mp3(Audio): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "mp3" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "mp3" in metadata["format_name"].split(",") and mime_type == "audio/mpeg" return False @@ -295,7 +304,8 @@ class Ogg(Audio): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "ogg" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "ogg" in metadata["format_name"].split(",") and mime_type == "audio/ogg" return False @@ -305,7 +315,8 @@ class Webm(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "webm" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "webm" in metadata["format_name"].split(",") and mime_type == "video/webm" return False @@ -315,17 +326,8 @@ class Mpeg(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "mpeg" in metadata["format_name"].split(",") - return False - - -class Mpga(Audio): - file_ext = "mpga" - - def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - return "mp3" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "mpeg" in metadata["format_name"].split(",") and mime_type == "video/mpeg" return False @@ -335,7 +337,8 @@ class M4a(Audio): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "m4a" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "m4a" in metadata["format_name"].split(",") and mime_type == "audio/x-m4a" return False @@ -345,7 +348,8 @@ class Mov(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "mov" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "mov" in metadata["format_name"].split(",") and mime_type == "video/quicktime" return False @@ -355,7 +359,8 @@ class Avi(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "avi" in metadata["format_name"].split(",") + mime_type = mime.from_file(filename) + return "avi" in metadata["format_name"].split(",") and mime_type == "video/x-msvideo" return False @@ -365,7 +370,7 @@ class Wmv(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "asf" in metadata["format_name"].split(",") + return "asf" in metadata["format_name"].split(",") and metadata["nb_streams"] > 1 return False @@ -375,5 +380,5 @@ class Wma(Audio): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "asf" in metadata["format_name"].split(",") + return "asf" in metadata["format_name"].split(",") and metadata["nb_streams"] == 1 return False From f420c9788be82f57c2380a6bae1c7a62ff173e3b Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 23 Apr 2024 18:40:50 +0200 Subject: [PATCH 04/14] Refactor media datatype sniffing logic in media.py --- lib/galaxy/datatypes/media.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index a4950eeeebda..f14354a5bfd6 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -370,7 +370,8 @@ class Wmv(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "asf" in metadata["format_name"].split(",") and metadata["nb_streams"] > 1 + is_video = "video" in [stream["codec_type"] for stream in streams] + return "asf" in metadata["format_name"].split(",") and is_video return False @@ -380,5 +381,6 @@ class Wma(Audio): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - return "asf" in metadata["format_name"].split(",") and metadata["nb_streams"] == 1 + is_audio = "video" not in [stream["codec_type"] for stream in streams] + return "asf" in metadata["format_name"].split(",") and metadata["nb_streams"] == is_audio return False From 1cfc63ee54bc9f22dd9a3f2228d8732c895667df Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 23 Apr 2024 18:46:00 +0200 Subject: [PATCH 05/14] Refactor media datatype sniffing logic in media.py --- lib/galaxy/datatypes/media.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index f14354a5bfd6..e408c616b730 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -382,5 +382,5 @@ def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) is_audio = "video" not in [stream["codec_type"] for stream in streams] - return "asf" in metadata["format_name"].split(",") and metadata["nb_streams"] == is_audio + return "asf" in metadata["format_name"].split(",") and is_audio return False From e64cb60fbfb692c8370a26b05c21fb7bbd6b0c87 Mon Sep 17 00:00:00 2001 From: Arash Date: Thu, 25 Apr 2024 14:13:44 +0200 Subject: [PATCH 06/14] Update media datatypes and using magic numbers as main idinifier of file formats --- .../config/sample/datatypes_conf.xml.sample | 3 - lib/galaxy/datatypes/media.py | 168 +++++++++++------- 2 files changed, 105 insertions(+), 66 deletions(-) diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample index 8dd9ed0e3487..976a428bef1c 100644 --- a/lib/galaxy/config/sample/datatypes_conf.xml.sample +++ b/lib/galaxy/config/sample/datatypes_conf.xml.sample @@ -936,7 +936,6 @@ - @@ -1252,11 +1251,9 @@ - - diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index e408c616b730..94c0bfd0fadc 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -17,10 +17,6 @@ from galaxy.datatypes.protocols import DatasetProtocol from galaxy.util import which -import magic - -mime = magic.Magic(mime=True) - @lru_cache(maxsize=128) def _ffprobe(path): @@ -36,6 +32,92 @@ def ffprobe(path): return data["format"], data["streams"] +magic_number = { + "mp4": { + "offset": 4, + "hex": [ + "66 74 79 70 69", + "66 74 79 70 6D", + "66 74 79 70 4D", + ], + }, + "flv": {"offset": 0, "hex": ["46 4C 56 01"]}, + "mkv": {"offset": 0, "hex": ["1A 45 DF A3"]}, + "webm": {"offset": 0, "hex": ["1A 45 DF A3"]}, + "mov": {"offset": 4, "hex": ["66 74 79 70 71", "6D 6F 6F 76"]}, + "wav": {"offset": 8, "hex": ["57 41 56 45"]}, + "mp3": { + "offset": 0, + "hex": [ + "49 44 33", + "FF E0", + "FF E1", + "FF E2", + "FF E3", + "FF E4", + "FF E5", + "FF E6", + "FF E7", + "FF E8", + "FF E9", + "FF EA", + "FF EB", + "FF EC", + "FF ED", + "FF EE", + "FF EF", + "FF F0", + "FF F1", + "FF F2", + "FF F3", + "FF F4", + "FF F5", + "FF F6", + "FF F7", + "FF F8", + "FF F9", + "FF FA", + "FF FB", + "FF FC", + "FF FD", + "FF FE", + "FF FF", + ], + }, + "ogg": {"offset": 0, "hex": ["4F 67 67"]}, + "wma": {"offset": 0, "hex": ["30 26 B2 75"]}, + "wmv": {"offset": 0, "hex": ["30 26 B2 75"]}, + "avi": {"offset": 8, "hex": ["41 56 49"]}, + "mpeg": { + "offset": 0, + "hex": [ + "00 00 01 B0", + "00 00 01 B1", + "00 00 01 B3", + "00 00 01 B4", + "00 00 01 B5", + "00 00 01 B6", + "00 00 01 B7", + "00 00 01 B8", + "00 00 01 B9", + "00 00 01 BA", + "00 00 01 BB", + "00 00 01 BC", + "00 00 01 BD", + "00 00 01 BE", + "00 00 01 BF", + ], + }, +} + + +def file_format(filename: str, ff: str): + with open(filename, "rb") as f: + f.seek(magic_number[ff]["offset"]) + head = f.read(8) + return any(head.startswith(bytes.fromhex(hex_code)) for hex_code in magic_number[ff]["hex"]) + + class Audio(Binary): MetadataElement( name="duration", @@ -187,8 +269,10 @@ class Mkv(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "matroska" in metadata["format_name"].split(",") and mime_type == "video/x-matroska" + vp_check = any( + stream["codec_name"] in ["vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" + ) + return file_format(filename, "mkv") and not vp_check return False @@ -203,33 +287,21 @@ class Mp4(Video): file_ext = "mp4" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "mp4" in metadata["format_name"].split(",") and mime_type == "video/mp4" - return False + return file_format(filename, "mp4") class Flv(Video): file_ext = "flv" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "flv" in metadata["format_name"].split(",") and mime_type == "video/x-flv" - return False + return file_format(filename, "flv") class Mpg(Video): file_ext = "mpg" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "mpegvideo" in metadata["format_name"].split(",") and mime_type == "video/mpeg" - return False + return file_format(filename, "mpg") class Mp3(Audio): @@ -245,11 +317,7 @@ class Mp3(Audio): file_ext = "mp3" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "mp3" in metadata["format_name"].split(",") and mime_type == "audio/mpeg" - return False + return file_format(filename, "mp3") class Wav(Audio): @@ -283,8 +351,7 @@ def get_mime(self) -> str: return "audio/wav" def sniff(self, filename: str) -> bool: - with wave.open(filename, "rb"): - return True + return file_format(filename, "wav") def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None: """Set the metadata for this dataset from the file contents.""" @@ -302,11 +369,7 @@ class Ogg(Audio): file_ext = "ogg" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "ogg" in metadata["format_name"].split(",") and mime_type == "audio/ogg" - return False + return file_format(filename, "ogg") class Webm(Video): @@ -315,8 +378,10 @@ class Webm(Video): def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "webm" in metadata["format_name"].split(",") and mime_type == "video/webm" + vp_check = any( + stream["codec_name"] in ["vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" + ) + return file_format(filename, "webm") and vp_check return False @@ -324,44 +389,21 @@ class Mpeg(Video): file_ext = "mpeg" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "mpeg" in metadata["format_name"].split(",") and mime_type == "video/mpeg" - return False - - -class M4a(Audio): - file_ext = "m4a" - - def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "m4a" in metadata["format_name"].split(",") and mime_type == "audio/x-m4a" - return False + return file_format(filename, "mpeg") class Mov(Video): file_ext = "mov" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "mov" in metadata["format_name"].split(",") and mime_type == "video/quicktime" - return False + return file_format(filename, "mov") class Avi(Video): file_ext = "avi" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - mime_type = mime.from_file(filename) - return "avi" in metadata["format_name"].split(",") and mime_type == "video/x-msvideo" - return False + return file_format(filename, "avi") class Wmv(Video): @@ -371,7 +413,7 @@ def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) is_video = "video" in [stream["codec_type"] for stream in streams] - return "asf" in metadata["format_name"].split(",") and is_video + return file_format(filename, "wmv") and is_video return False @@ -382,5 +424,5 @@ def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) is_audio = "video" not in [stream["codec_type"] for stream in streams] - return "asf" in metadata["format_name"].split(",") and is_audio + return file_format(filename, "wma") and is_audio return False From de6f00cf578d00b54dee4c5206d1e2488eaf34bd Mon Sep 17 00:00:00 2001 From: Arash Date: Thu, 25 Apr 2024 14:26:48 +0200 Subject: [PATCH 07/14] Refactor media datatype sniffing logic in media.py --- lib/galaxy/datatypes/media.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index 94c0bfd0fadc..008e589f0112 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -297,13 +297,6 @@ def sniff(self, filename: str) -> bool: return file_format(filename, "flv") -class Mpg(Video): - file_ext = "mpg" - - def sniff(self, filename: str) -> bool: - return file_format(filename, "mpg") - - class Mp3(Audio): """ Class that reads MP3 audio file. From e50f829af78bcad443ab71f6831d652f44fe028c Mon Sep 17 00:00:00 2001 From: Arash Date: Thu, 25 Apr 2024 14:34:02 +0200 Subject: [PATCH 08/14] change the order of file check, change names to meaningful names --- .../config/sample/datatypes_conf.xml.sample | 6 ++-- lib/galaxy/datatypes/media.py | 30 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample index 976a428bef1c..f43cb4226be8 100644 --- a/lib/galaxy/config/sample/datatypes_conf.xml.sample +++ b/lib/galaxy/config/sample/datatypes_conf.xml.sample @@ -1052,7 +1052,7 @@ - + @@ -1247,17 +1247,17 @@ - - + + diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index 008e589f0112..94faa318a963 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -111,11 +111,11 @@ def ffprobe(path): } -def file_format(filename: str, ff: str): +def _get_file_format_from_magic_number(filename: str, file_ext: str): with open(filename, "rb") as f: - f.seek(magic_number[ff]["offset"]) + f.seek(magic_number[file_ext]["offset"]) head = f.read(8) - return any(head.startswith(bytes.fromhex(hex_code)) for hex_code in magic_number[ff]["hex"]) + return any(head.startswith(bytes.fromhex(hex_code)) for hex_code in magic_number[file_ext]["hex"]) class Audio(Binary): @@ -272,7 +272,7 @@ def sniff(self, filename: str) -> bool: vp_check = any( stream["codec_name"] in ["vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" ) - return file_format(filename, "mkv") and not vp_check + return _get_file_format_from_magic_number(filename, "mkv") and not vp_check return False @@ -287,14 +287,14 @@ class Mp4(Video): file_ext = "mp4" def sniff(self, filename: str) -> bool: - return file_format(filename, "mp4") + return _get_file_format_from_magic_number(filename, "mp4") class Flv(Video): file_ext = "flv" def sniff(self, filename: str) -> bool: - return file_format(filename, "flv") + return _get_file_format_from_magic_number(filename, "flv") class Mp3(Audio): @@ -310,7 +310,7 @@ class Mp3(Audio): file_ext = "mp3" def sniff(self, filename: str) -> bool: - return file_format(filename, "mp3") + return _get_file_format_from_magic_number(filename, "mp3") class Wav(Audio): @@ -344,7 +344,7 @@ def get_mime(self) -> str: return "audio/wav" def sniff(self, filename: str) -> bool: - return file_format(filename, "wav") + return _get_file_format_from_magic_number(filename, "wav") def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None: """Set the metadata for this dataset from the file contents.""" @@ -362,7 +362,7 @@ class Ogg(Audio): file_ext = "ogg" def sniff(self, filename: str) -> bool: - return file_format(filename, "ogg") + return _get_file_format_from_magic_number(filename, "ogg") class Webm(Video): @@ -374,7 +374,7 @@ def sniff(self, filename: str) -> bool: vp_check = any( stream["codec_name"] in ["vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" ) - return file_format(filename, "webm") and vp_check + return _get_file_format_from_magic_number(filename, "webm") and vp_check return False @@ -382,21 +382,21 @@ class Mpeg(Video): file_ext = "mpeg" def sniff(self, filename: str) -> bool: - return file_format(filename, "mpeg") + return _get_file_format_from_magic_number(filename, "mpeg") class Mov(Video): file_ext = "mov" def sniff(self, filename: str) -> bool: - return file_format(filename, "mov") + return _get_file_format_from_magic_number(filename, "mov") class Avi(Video): file_ext = "avi" def sniff(self, filename: str) -> bool: - return file_format(filename, "avi") + return _get_file_format_from_magic_number(filename, "avi") class Wmv(Video): @@ -406,7 +406,7 @@ def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) is_video = "video" in [stream["codec_type"] for stream in streams] - return file_format(filename, "wmv") and is_video + return _get_file_format_from_magic_number(filename, "wmv") and is_video return False @@ -417,5 +417,5 @@ def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) is_audio = "video" not in [stream["codec_type"] for stream in streams] - return file_format(filename, "wma") and is_audio + return _get_file_format_from_magic_number(filename, "wma") and is_audio return False From 371e22752dac3d1242b1fd9f109c21d0552950e1 Mon Sep 17 00:00:00 2001 From: Arash Date: Thu, 25 Apr 2024 14:34:52 +0200 Subject: [PATCH 09/14] Fix typo in datatype sniffing logic for H5 files --- lib/galaxy/config/sample/datatypes_conf.xml.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample index f43cb4226be8..c6ac8cdd499f 100644 --- a/lib/galaxy/config/sample/datatypes_conf.xml.sample +++ b/lib/galaxy/config/sample/datatypes_conf.xml.sample @@ -1052,7 +1052,7 @@ - + From 230b73477464cabb5f755b53ff09473a868b482c Mon Sep 17 00:00:00 2001 From: Arash Date: Thu, 25 Apr 2024 17:35:37 +0200 Subject: [PATCH 10/14] Refactor media datatype sniffing logic --- .../config/sample/datatypes_conf.xml.sample | 4 +- lib/galaxy/datatypes/media.py | 56 +++++++++++++------ 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample index c6ac8cdd499f..9f654de4485a 100644 --- a/lib/galaxy/config/sample/datatypes_conf.xml.sample +++ b/lib/galaxy/config/sample/datatypes_conf.xml.sample @@ -937,7 +937,7 @@ - + @@ -1251,7 +1251,7 @@ - + diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index 94faa318a963..9f325a18aeb4 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -7,6 +7,7 @@ from typing import ( List, Tuple, + cast, ) from galaxy.datatypes.binary import Binary @@ -88,7 +89,7 @@ def ffprobe(path): "wma": {"offset": 0, "hex": ["30 26 B2 75"]}, "wmv": {"offset": 0, "hex": ["30 26 B2 75"]}, "avi": {"offset": 8, "hex": ["41 56 49"]}, - "mpeg": { + "mpg": { "offset": 0, "hex": [ "00 00 01 B0", @@ -113,9 +114,11 @@ def ffprobe(path): def _get_file_format_from_magic_number(filename: str, file_ext: str): with open(filename, "rb") as f: - f.seek(magic_number[file_ext]["offset"]) + f.seek(cast(int, magic_number[file_ext]["offset"])) head = f.read(8) - return any(head.startswith(bytes.fromhex(hex_code)) for hex_code in magic_number[file_ext]["hex"]) + return any( + head.startswith(bytes.fromhex(hex_code)) for hex_code in cast(List[str], magic_number[file_ext]["hex"]) + ) class Audio(Binary): @@ -270,10 +273,10 @@ def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) vp_check = any( - stream["codec_name"] in ["vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" + stream["codec_name"] in ["av1", "vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" ) - return _get_file_format_from_magic_number(filename, "mkv") and not vp_check - return False + return "matroska" in metadata["format_name"].split(",") and not vp_check + return _get_file_format_from_magic_number(filename, "mkv") class Mp4(Video): @@ -287,6 +290,9 @@ class Mp4(Video): file_ext = "mp4" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mp4" in metadata["format_name"].split(",") and _get_file_format_from_magic_number(filename, "mp4") return _get_file_format_from_magic_number(filename, "mp4") @@ -294,6 +300,9 @@ class Flv(Video): file_ext = "flv" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "flv" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "flv") @@ -310,6 +319,9 @@ class Mp3(Audio): file_ext = "mp3" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mp3" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "mp3") @@ -362,6 +374,9 @@ class Ogg(Audio): file_ext = "ogg" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "ogg" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "ogg") @@ -372,23 +387,29 @@ def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) vp_check = any( - stream["codec_name"] in ["vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" + stream["codec_name"] in ["av1", "vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" ) - return _get_file_format_from_magic_number(filename, "webm") and vp_check - return False + return "webm" in metadata["format_name"].split(",") and vp_check + return _get_file_format_from_magic_number(filename, "webm") -class Mpeg(Video): - file_ext = "mpeg" +class Mpg(Video): + file_ext = "mpg" def sniff(self, filename: str) -> bool: - return _get_file_format_from_magic_number(filename, "mpeg") + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mpegvideo" in metadata["format_name"].split(",") + return _get_file_format_from_magic_number(filename, "mpg") class Mov(Video): file_ext = "mov" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mov" in metadata["format_name"].split(",") and _get_file_format_from_magic_number(filename, "mov") return _get_file_format_from_magic_number(filename, "mov") @@ -396,6 +417,9 @@ class Avi(Video): file_ext = "avi" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "avi" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "avi") @@ -406,8 +430,8 @@ def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) is_video = "video" in [stream["codec_type"] for stream in streams] - return _get_file_format_from_magic_number(filename, "wmv") and is_video - return False + return "asf" in metadata["format_name"].split(",") and is_video + return _get_file_format_from_magic_number(filename, "wmv") class Wma(Audio): @@ -417,5 +441,5 @@ def sniff(self, filename: str) -> bool: if which("ffprobe"): metadata, streams = ffprobe(filename) is_audio = "video" not in [stream["codec_type"] for stream in streams] - return _get_file_format_from_magic_number(filename, "wma") and is_audio - return False + return "asf" in metadata["format_name"].split(",") and is_audio + return _get_file_format_from_magic_number(filename, "wma") From d7673ab04bb5b627297e849580e149584d633d7a Mon Sep 17 00:00:00 2001 From: Arash Date: Thu, 25 Apr 2024 18:35:45 +0200 Subject: [PATCH 11/14] Sorting imports --- lib/galaxy/datatypes/media.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index 9f325a18aeb4..fa0c4e0365cc 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -5,9 +5,9 @@ import wave from functools import lru_cache from typing import ( + cast, List, Tuple, - cast, ) from galaxy.datatypes.binary import Binary From a7e9775970f552e24695346449dd7377cf3b9dc9 Mon Sep 17 00:00:00 2001 From: Arash Date: Mon, 29 Apr 2024 17:08:37 +0200 Subject: [PATCH 12/14] Refactor media datatype sniffing logic in media.py --- lib/galaxy/datatypes/media.py | 69 ++++++++++++++--------------------- 1 file changed, 28 insertions(+), 41 deletions(-) diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index fa0c4e0365cc..12a75d6e52fb 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -36,17 +36,13 @@ def ffprobe(path): magic_number = { "mp4": { "offset": 4, - "hex": [ - "66 74 79 70 69", - "66 74 79 70 6D", - "66 74 79 70 4D", - ], + "string": ["ftypisom", "ftypmp42", "ftypMSNV"], }, - "flv": {"offset": 0, "hex": ["46 4C 56 01"]}, + "flv": {"offset": 0, "string": ["FLV"]}, "mkv": {"offset": 0, "hex": ["1A 45 DF A3"]}, "webm": {"offset": 0, "hex": ["1A 45 DF A3"]}, - "mov": {"offset": 4, "hex": ["66 74 79 70 71", "6D 6F 6F 76"]}, - "wav": {"offset": 8, "hex": ["57 41 56 45"]}, + "mov": {"offset": 4, "string": ["ftypqt", "moov"]}, + "wav": {"offset": 8, "string": ["WAVE"]}, "mp3": { "offset": 0, "hex": [ @@ -85,10 +81,10 @@ def ffprobe(path): "FF FF", ], }, - "ogg": {"offset": 0, "hex": ["4F 67 67"]}, + "ogg": {"offset": 0, "string": ["OggS"]}, "wma": {"offset": 0, "hex": ["30 26 B2 75"]}, "wmv": {"offset": 0, "hex": ["30 26 B2 75"]}, - "avi": {"offset": 8, "hex": ["41 56 49"]}, + "avi": {"offset": 8, "string": ["AVI"]}, "mpg": { "offset": 0, "hex": [ @@ -116,9 +112,21 @@ def _get_file_format_from_magic_number(filename: str, file_ext: str): with open(filename, "rb") as f: f.seek(cast(int, magic_number[file_ext]["offset"])) head = f.read(8) - return any( - head.startswith(bytes.fromhex(hex_code)) for hex_code in cast(List[str], magic_number[file_ext]["hex"]) - ) + if "string" in magic_number[file_ext]: + string_check = any( + [ + head.startswith(string_code.encode("iso-8859-1")) + for string_code in cast(List[str], magic_number[file_ext]["string"]) + ] + ) + if "hex" in magic_number[file_ext]: + hex_check = any( + [ + head.startswith(bytes.fromhex(hex_code)) + for hex_code in cast(List[str], magic_number[file_ext]["hex"]) + ] + ) + return string_check or hex_check class Audio(Binary): @@ -290,9 +298,6 @@ class Mp4(Video): file_ext = "mp4" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - return "mp4" in metadata["format_name"].split(",") and _get_file_format_from_magic_number(filename, "mp4") return _get_file_format_from_magic_number(filename, "mp4") @@ -300,12 +305,16 @@ class Flv(Video): file_ext = "flv" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - return "flv" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "flv") +class Mpg(Video): + file_ext = "mpg" + + def sniff(self, filename: str) -> bool: + return _get_file_format_from_magic_number(filename, "mpg") + + class Mp3(Audio): """ Class that reads MP3 audio file. @@ -319,9 +328,6 @@ class Mp3(Audio): file_ext = "mp3" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - return "mp3" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "mp3") @@ -374,9 +380,6 @@ class Ogg(Audio): file_ext = "ogg" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - return "ogg" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "ogg") @@ -393,23 +396,10 @@ def sniff(self, filename: str) -> bool: return _get_file_format_from_magic_number(filename, "webm") -class Mpg(Video): - file_ext = "mpg" - - def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - return "mpegvideo" in metadata["format_name"].split(",") - return _get_file_format_from_magic_number(filename, "mpg") - - class Mov(Video): file_ext = "mov" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - return "mov" in metadata["format_name"].split(",") and _get_file_format_from_magic_number(filename, "mov") return _get_file_format_from_magic_number(filename, "mov") @@ -417,9 +407,6 @@ class Avi(Video): file_ext = "avi" def sniff(self, filename: str) -> bool: - if which("ffprobe"): - metadata, streams = ffprobe(filename) - return "avi" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "avi") From 85c1db5102dc795f56c4115b931617deef6f1da8 Mon Sep 17 00:00:00 2001 From: Arash Date: Thu, 2 May 2024 10:41:28 +0200 Subject: [PATCH 13/14] using magic bit as fallback --- lib/galaxy/datatypes/media.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index 12a75d6e52fb..1ebde97e91eb 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -298,6 +298,9 @@ class Mp4(Video): file_ext = "mp4" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mp4" in metadata["format_name"].split(",") and _get_file_format_from_magic_number(filename, "mp4") return _get_file_format_from_magic_number(filename, "mp4") @@ -305,6 +308,9 @@ class Flv(Video): file_ext = "flv" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "flv" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "flv") @@ -312,6 +318,9 @@ class Mpg(Video): file_ext = "mpg" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mpegvideo" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "mpg") @@ -328,6 +337,9 @@ class Mp3(Audio): file_ext = "mp3" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mp3" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "mp3") @@ -380,6 +392,9 @@ class Ogg(Audio): file_ext = "ogg" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "ogg" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "ogg") @@ -400,6 +415,9 @@ class Mov(Video): file_ext = "mov" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "mov" in metadata["format_name"].split(",") and _get_file_format_from_magic_number(filename, "mov") return _get_file_format_from_magic_number(filename, "mov") @@ -407,6 +425,9 @@ class Avi(Video): file_ext = "avi" def sniff(self, filename: str) -> bool: + if which("ffprobe"): + metadata, streams = ffprobe(filename) + return "avi" in metadata["format_name"].split(",") return _get_file_format_from_magic_number(filename, "avi") From ef45b494671223cceb077a08e81f6a7b7eae711a Mon Sep 17 00:00:00 2001 From: Arash Date: Fri, 3 May 2024 15:11:41 +0200 Subject: [PATCH 14/14] remove unnecessary list --- lib/galaxy/datatypes/media.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/lib/galaxy/datatypes/media.py b/lib/galaxy/datatypes/media.py index 1ebde97e91eb..9da37c423913 100644 --- a/lib/galaxy/datatypes/media.py +++ b/lib/galaxy/datatypes/media.py @@ -114,17 +114,12 @@ def _get_file_format_from_magic_number(filename: str, file_ext: str): head = f.read(8) if "string" in magic_number[file_ext]: string_check = any( - [ - head.startswith(string_code.encode("iso-8859-1")) - for string_code in cast(List[str], magic_number[file_ext]["string"]) - ] + head.startswith(string_code.encode("iso-8859-1")) + for string_code in cast(List[str], magic_number[file_ext]["string"]) ) if "hex" in magic_number[file_ext]: hex_check = any( - [ - head.startswith(bytes.fromhex(hex_code)) - for hex_code in cast(List[str], magic_number[file_ext]["hex"]) - ] + head.startswith(bytes.fromhex(hex_code)) for hex_code in cast(List[str], magic_number[file_ext]["hex"]) ) return string_check or hex_check