-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for additional media types #18054
Merged
Merged
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
99e1fc8
Add support for Ogg, Webm, Mpeg, Mpga, M4a, Mov, Avi, Wmv, and Wma me…
arash77 541939f
Add support for Ogg, Webm, Mpeg, Mpga, M4a, Mov, Avi, Wmv, and Wma me…
arash77 498e095
Update media datatypes
arash77 f420c97
Refactor media datatype sniffing logic in media.py
arash77 1cfc63e
Refactor media datatype sniffing logic in media.py
arash77 e64cb60
Update media datatypes and using magic numbers as main identifier of f…
arash77 de6f00c
Refactor media datatype sniffing logic in media.py
arash77 e50f829
change the order of file check, change names to meaningful names
arash77 371e227
Fix typo in datatype sniffing logic for H5 files
arash77 230b734
Refactor media datatype sniffing logic
arash77 d7673ab
Sorting imports
arash77 a7e9775
Refactor media datatype sniffing logic in media.py
arash77 85c1db5
using magic bit as fallback
arash77 ef45b49
remove unnecessary list
arash77 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
import wave | ||
from functools import lru_cache | ||
from typing import ( | ||
cast, | ||
List, | ||
Tuple, | ||
) | ||
|
@@ -32,6 +33,97 @@ def ffprobe(path): | |
return data["format"], data["streams"] | ||
|
||
|
||
# Signature table used for magic-number sniffing, keyed by file extension.
# Each entry gives the byte "offset" at which the signature is expected and
# the accepted signatures, either as text ("string") or as space-separated
# hex byte pairs ("hex").
magic_number = {
    "mp4": {"offset": 4, "string": ["ftypisom", "ftypmp42", "ftypMSNV"]},
    "flv": {"offset": 0, "string": ["FLV"]},
    # mkv and webm are registered with the same signature bytes.
    "mkv": {"offset": 0, "hex": ["1A 45 DF A3"]},
    "webm": {"offset": 0, "hex": ["1A 45 DF A3"]},
    "mov": {"offset": 4, "string": ["ftypqt", "moov"]},
    "wav": {"offset": 8, "string": ["WAVE"]},
    "mp3": {
        "offset": 0,
        # 49 44 33 is ASCII "ID3"; the rest is every two-byte prefix from
        # FF E0 through FF FF, in ascending order.
        "hex": ["49 44 33", *(f"FF {second:02X}" for second in range(0xE0, 0x100))],
    },
    "ogg": {"offset": 0, "string": ["OggS"]},
    # wma and wmv are registered with the same signature bytes.
    "wma": {"offset": 0, "hex": ["30 26 B2 75"]},
    "wmv": {"offset": 0, "hex": ["30 26 B2 75"]},
    "avi": {"offset": 8, "string": ["AVI"]},
    "mpg": {
        "offset": 0,
        # 00 00 01 B0 through 00 00 01 BF, with B2 deliberately left out.
        "hex": [f"00 00 01 {code:02X}" for code in range(0xB0, 0xC0) if code != 0xB2],
    },
}
|
||
|
||
def _get_file_format_from_magic_number(filename: str, file_ext: str) -> bool:
    """Report whether ``filename`` carries a signature registered for ``file_ext``.

    ``magic_number[file_ext]`` supplies the byte ``"offset"`` to seek to and the
    accepted signatures: a ``"string"`` list (encoded as ISO-8859-1), a
    ``"hex"`` list (parsed with ``bytes.fromhex``), or both. Eight bytes are
    read at the offset and matched by prefix against every signature.

    ``filename`` is the path of the file to inspect and ``file_ext`` is the key
    to look up in ``magic_number``; an unknown key raises ``KeyError``.
    Returns ``True`` if any registered signature matches, ``False`` otherwise.
    """
    signature = magic_number[file_ext]
    with open(filename, "rb") as f:
        f.seek(cast(int, signature["offset"]))
        head = f.read(8)

    # An entry may define only one of "string" / "hex". Treat the missing list
    # as empty so the result is always defined; assigning each check under its
    # own ``if`` left a local unbound and raised UnboundLocalError for hex-only
    # entries and for string-only entries that did not match.
    string_prefixes = tuple(
        string_code.encode("iso-8859-1") for string_code in cast(List[str], signature.get("string", []))
    )
    hex_prefixes = tuple(bytes.fromhex(hex_code) for hex_code in cast(List[str], signature.get("hex", [])))

    # bytes.startswith accepts a tuple of prefixes; an empty tuple yields False.
    return head.startswith(string_prefixes + hex_prefixes)
|
||
|
||
class Audio(Binary): | ||
MetadataElement( | ||
name="duration", | ||
|
@@ -183,8 +275,11 @@ class Mkv(Video): | |
def sniff(self, filename: str) -> bool: | ||
if which("ffprobe"): | ||
metadata, streams = ffprobe(filename) | ||
return "matroska" in metadata["format_name"].split(",") | ||
return False | ||
vp_check = any( | ||
stream["codec_name"] in ["av1", "vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" | ||
) | ||
return "matroska" in metadata["format_name"].split(",") and not vp_check | ||
return _get_file_format_from_magic_number(filename, "mkv") | ||
|
||
|
||
class Mp4(Video): | ||
|
@@ -200,8 +295,8 @@ class Mp4(Video): | |
def sniff(self, filename: str) -> bool: | ||
if which("ffprobe"): | ||
metadata, streams = ffprobe(filename) | ||
return "mp4" in metadata["format_name"].split(",") | ||
return False | ||
return "mp4" in metadata["format_name"].split(",") and _get_file_format_from_magic_number(filename, "mp4") | ||
return _get_file_format_from_magic_number(filename, "mp4") | ||
|
||
|
||
class Flv(Video): | ||
|
@@ -211,7 +306,7 @@ def sniff(self, filename: str) -> bool: | |
if which("ffprobe"): | ||
metadata, streams = ffprobe(filename) | ||
return "flv" in metadata["format_name"].split(",") | ||
return False | ||
return _get_file_format_from_magic_number(filename, "flv") | ||
|
||
|
||
class Mpg(Video): | ||
hexylena marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
@@ -221,7 +316,7 @@ def sniff(self, filename: str) -> bool: | |
if which("ffprobe"): | ||
metadata, streams = ffprobe(filename) | ||
return "mpegvideo" in metadata["format_name"].split(",") | ||
return False | ||
return _get_file_format_from_magic_number(filename, "mpg") | ||
|
||
|
||
class Mp3(Audio): | ||
|
@@ -240,7 +335,7 @@ def sniff(self, filename: str) -> bool: | |
if which("ffprobe"): | ||
metadata, streams = ffprobe(filename) | ||
return "mp3" in metadata["format_name"].split(",") | ||
return False | ||
return _get_file_format_from_magic_number(filename, "mp3") | ||
Comment on lines
-243
to
+338
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. but yes cases like this, instead of |
||
|
||
|
||
class Wav(Audio): | ||
|
@@ -274,8 +369,7 @@ def get_mime(self) -> str: | |
return "audio/wav" | ||
|
||
def sniff(self, filename: str) -> bool: | ||
with wave.open(filename, "rb"): | ||
return True | ||
return _get_file_format_from_magic_number(filename, "wav") | ||
|
||
def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None: | ||
"""Set the metadata for this dataset from the file contents.""" | ||
|
@@ -287,3 +381,68 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N | |
dataset.metadata.nchannels = fd.getnchannels() | ||
except wave.Error: | ||
pass | ||
|
||
|
||
class Ogg(Audio):
    """Audio datatype registered under the ``ogg`` extension."""

    file_ext = "ogg"

    def sniff(self, filename: str) -> bool:
        """Return whether ``filename`` looks like an Ogg file.

        Without ``ffprobe`` on the path the decision falls back to the
        magic-number check; otherwise ``ogg`` must appear among the
        comma-separated format names that ffprobe reports.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "ogg")
        container, _streams = ffprobe(filename)
        format_names = container["format_name"].split(",")
        return "ogg" in format_names
|
||
|
||
class Webm(Video):
    """Video datatype registered under the ``webm`` extension."""

    file_ext = "webm"

    def sniff(self, filename: str) -> bool:
        """Return whether ``filename`` looks like a WebM file.

        Without ``ffprobe`` on the path the decision falls back to the
        magic-number check. With it, ffprobe must list ``webm`` among the
        format names and at least one video stream must use the av1, vp8 or
        vp9 codec.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "webm")
        container, stream_list = ffprobe(filename)
        # The codec scan runs before the format test, and stops at the first hit.
        has_web_codec = any(
            entry["codec_name"] in ["av1", "vp8", "vp9"] for entry in stream_list if entry["codec_type"] == "video"
        )
        format_names = container["format_name"].split(",")
        return "webm" in format_names and has_web_codec
|
||
|
||
class Mov(Video):
    """Video datatype registered under the ``mov`` extension."""

    file_ext = "mov"

    def sniff(self, filename: str) -> bool:
        """Return whether ``filename`` looks like a MOV file.

        When ``ffprobe`` is on the path it must list ``mov`` among the format
        names; if it does not, the file is rejected. In every other case the
        magic-number check gives the final answer.
        """
        if which("ffprobe"):
            container, _streams = ffprobe(filename)
            if "mov" not in container["format_name"].split(","):
                return False
        return _get_file_format_from_magic_number(filename, "mov")
|
||
|
||
class Avi(Video):
    """Video datatype registered under the ``avi`` extension."""

    file_ext = "avi"

    def sniff(self, filename: str) -> bool:
        """Return whether ``filename`` looks like an AVI file.

        Without ``ffprobe`` on the path the decision falls back to the
        magic-number check; otherwise ``avi`` must appear among the
        comma-separated format names that ffprobe reports.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "avi")
        container, _streams = ffprobe(filename)
        format_names = container["format_name"].split(",")
        return "avi" in format_names
|
||
|
||
class Wmv(Video):
    """Video datatype registered under the ``wmv`` extension."""

    file_ext = "wmv"

    def sniff(self, filename: str) -> bool:
        """Return whether ``filename`` looks like a WMV file.

        Without ``ffprobe`` on the path the decision falls back to the
        magic-number check. With it, ffprobe must list ``asf`` among the
        format names and at least one stream must have codec type ``video``.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "wmv")
        container, stream_list = ffprobe(filename)
        # Collect every stream's codec type up front, before the format test.
        codec_types = [entry["codec_type"] for entry in stream_list]
        format_names = container["format_name"].split(",")
        return "asf" in format_names and "video" in codec_types
|
||
|
||
class Wma(Audio):
    """Audio datatype registered under the ``wma`` extension."""

    file_ext = "wma"

    def sniff(self, filename: str) -> bool:
        """Return whether ``filename`` looks like a WMA file.

        Without ``ffprobe`` on the path the decision falls back to the
        magic-number check. With it, ffprobe must list ``asf`` among the
        format names and no stream may have codec type ``video``.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "wma")
        container, stream_list = ffprobe(filename)
        # Collect every stream's codec type up front, before the format test.
        codec_types = [entry["codec_type"] for entry in stream_list]
        format_names = container["format_name"].split(",")
        return "asf" in format_names and "video" not in codec_types
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
are these all seen in practice? Wikipedia's list suggests it should be a much reduced set
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was using this source:
https://www.garykessler.net/library/file_sigs.html