-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for additional media types #18054
Changes from 11 commits
99e1fc8
541939f
498e095
f420c97
1cfc63e
e64cb60
de6f00c
e50f829
371e227
230b734
d7673ab
a7e9775
85c1db5
ef45b49
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -5,6 +5,7 @@ | |||||
import wave | ||||||
from functools import lru_cache | ||||||
from typing import ( | ||||||
cast, | ||||||
List, | ||||||
Tuple, | ||||||
) | ||||||
|
@@ -32,6 +33,94 @@ def ffprobe(path): | |||||
return data["format"], data["streams"] | ||||||
|
||||||
|
||||||
magic_number = { | ||||||
"mp4": { | ||||||
"offset": 4, | ||||||
"hex": [ | ||||||
"66 74 79 70 69", | ||||||
"66 74 79 70 6D", | ||||||
"66 74 79 70 4D", | ||||||
], | ||||||
}, | ||||||
"flv": {"offset": 0, "hex": ["46 4C 56 01"]}, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, by Wikipedia's list, the magic should just be `46 4C 56`. Since many of these are human-readable strings (like `FLV`), they could be written as byte strings:
Suggested change
and then check the correct number of bytes based off of the length of the string or so, to not have to specify a 4th byte if it isn't relevant would reduce some of these long hex lists. |
||||||
"mkv": {"offset": 0, "hex": ["1A 45 DF A3"]}, | ||||||
"webm": {"offset": 0, "hex": ["1A 45 DF A3"]}, | ||||||
"mov": {"offset": 4, "hex": ["66 74 79 70 71", "6D 6F 6F 76"]}, | ||||||
"wav": {"offset": 8, "hex": ["57 41 56 45"]}, | ||||||
"mp3": { | ||||||
"offset": 0, | ||||||
"hex": [ | ||||||
"49 44 33", | ||||||
"FF E0", | ||||||
"FF E1", | ||||||
"FF E2", | ||||||
"FF E3", | ||||||
"FF E4", | ||||||
"FF E5", | ||||||
"FF E6", | ||||||
"FF E7", | ||||||
"FF E8", | ||||||
"FF E9", | ||||||
"FF EA", | ||||||
"FF EB", | ||||||
"FF EC", | ||||||
"FF ED", | ||||||
"FF EE", | ||||||
"FF EF", | ||||||
"FF F0", | ||||||
"FF F1", | ||||||
"FF F2", | ||||||
"FF F3", | ||||||
"FF F4", | ||||||
"FF F5", | ||||||
"FF F6", | ||||||
"FF F7", | ||||||
"FF F8", | ||||||
"FF F9", | ||||||
"FF FA", | ||||||
"FF FB", | ||||||
"FF FC", | ||||||
"FF FD", | ||||||
"FF FE", | ||||||
"FF FF", | ||||||
], | ||||||
}, | ||||||
"ogg": {"offset": 0, "hex": ["4F 67 67"]}, | ||||||
"wma": {"offset": 0, "hex": ["30 26 B2 75"]}, | ||||||
"wmv": {"offset": 0, "hex": ["30 26 B2 75"]}, | ||||||
"avi": {"offset": 8, "hex": ["41 56 49"]}, | ||||||
"mpg": { | ||||||
"offset": 0, | ||||||
"hex": [ | ||||||
"00 00 01 B0", | ||||||
"00 00 01 B1", | ||||||
"00 00 01 B3", | ||||||
"00 00 01 B4", | ||||||
"00 00 01 B5", | ||||||
"00 00 01 B6", | ||||||
"00 00 01 B7", | ||||||
"00 00 01 B8", | ||||||
"00 00 01 B9", | ||||||
"00 00 01 BA", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are these all seen in practice? Wikipedia's list suggests it should be a much reduced set
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was using this source:
|
||||||
"00 00 01 BB", | ||||||
"00 00 01 BC", | ||||||
"00 00 01 BD", | ||||||
"00 00 01 BE", | ||||||
"00 00 01 BF", | ||||||
], | ||||||
}, | ||||||
} | ||||||
|
||||||
|
||||||
def _get_file_format_from_magic_number(filename: str, file_ext: str) -> bool:
    """Check whether ``filename`` carries one of the magic numbers of ``file_ext``.

    The candidate signatures, and the byte offset at which they are expected,
    are looked up in the module-level ``magic_number`` table.

    :param filename: path of the file to inspect
    :param file_ext: key into ``magic_number`` (e.g. ``"mp4"``)
    :returns: True if the bytes found at the configured offset start with any
        of the registered signatures, False otherwise
    :raises KeyError: if ``file_ext`` has no entry in ``magic_number``
    """
    entry = magic_number[file_ext]
    signatures = tuple(bytes.fromhex(hex_code) for hex_code in cast(List[str], entry["hex"]))
    # Read as many bytes as the longest signature needs: with a fixed-size
    # read, a signature longer than the buffer could never match.
    wanted = max(map(len, signatures), default=0)
    with open(filename, "rb") as f:
        f.seek(cast(int, entry["offset"]))
        head = f.read(wanted)
    # bytes.startswith accepts a tuple of candidate prefixes.
    return head.startswith(signatures)
|
||||||
|
||||||
class Audio(Binary): | ||||||
MetadataElement( | ||||||
name="duration", | ||||||
|
@@ -183,8 +272,11 @@ class Mkv(Video): | |||||
def sniff(self, filename: str) -> bool: | ||||||
if which("ffprobe"): | ||||||
metadata, streams = ffprobe(filename) | ||||||
return "matroska" in metadata["format_name"].split(",") | ||||||
return False | ||||||
vp_check = any( | ||||||
stream["codec_name"] in ["av1", "vp8", "vp9"] for stream in streams if stream["codec_type"] == "video" | ||||||
) | ||||||
return "matroska" in metadata["format_name"].split(",") and not vp_check | ||||||
return _get_file_format_from_magic_number(filename, "mkv") | ||||||
|
||||||
|
||||||
class Mp4(Video): | ||||||
|
@@ -200,8 +292,8 @@ class Mp4(Video): | |||||
def sniff(self, filename: str) -> bool: | ||||||
if which("ffprobe"): | ||||||
metadata, streams = ffprobe(filename) | ||||||
return "mp4" in metadata["format_name"].split(",") | ||||||
return False | ||||||
return "mp4" in metadata["format_name"].split(",") and _get_file_format_from_magic_number(filename, "mp4") | ||||||
return _get_file_format_from_magic_number(filename, "mp4") | ||||||
|
||||||
|
||||||
class Flv(Video): | ||||||
|
@@ -211,17 +303,7 @@ def sniff(self, filename: str) -> bool: | |||||
if which("ffprobe"): | ||||||
metadata, streams = ffprobe(filename) | ||||||
return "flv" in metadata["format_name"].split(",") | ||||||
return False | ||||||
|
||||||
|
||||||
class Mpg(Video): | ||||||
hexylena marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
file_ext = "mpg" | ||||||
|
||||||
def sniff(self, filename: str) -> bool: | ||||||
if which("ffprobe"): | ||||||
metadata, streams = ffprobe(filename) | ||||||
return "mpegvideo" in metadata["format_name"].split(",") | ||||||
return False | ||||||
return _get_file_format_from_magic_number(filename, "flv") | ||||||
|
||||||
|
||||||
class Mp3(Audio): | ||||||
|
@@ -240,7 +322,7 @@ def sniff(self, filename: str) -> bool: | |||||
if which("ffprobe"): | ||||||
metadata, streams = ffprobe(filename) | ||||||
return "mp3" in metadata["format_name"].split(",") | ||||||
return False | ||||||
return _get_file_format_from_magic_number(filename, "mp3") | ||||||
Comment on lines
-243
to
+338
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. but yes cases like this, instead of |
||||||
|
||||||
|
||||||
class Wav(Audio): | ||||||
|
@@ -274,8 +356,7 @@ def get_mime(self) -> str: | |||||
return "audio/wav" | ||||||
|
||||||
def sniff(self, filename: str) -> bool: | ||||||
with wave.open(filename, "rb"): | ||||||
return True | ||||||
return _get_file_format_from_magic_number(filename, "wav") | ||||||
|
||||||
def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None: | ||||||
"""Set the metadata for this dataset from the file contents.""" | ||||||
|
@@ -287,3 +368,78 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N | |||||
dataset.metadata.nchannels = fd.getnchannels() | ||||||
except wave.Error: | ||||||
pass | ||||||
|
||||||
|
||||||
class Ogg(Audio):
    """Audio datatype registered under the ``ogg`` extension."""

    file_ext = "ogg"

    def sniff(self, filename: str) -> bool:
        """Return True when ``filename`` is recognised as an Ogg file.

        When ``which("ffprobe")`` finds the tool, the decision is based on the
        format names ffprobe reports; otherwise the magic-number check is used.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "ogg")
        container, _streams = ffprobe(filename)
        format_names = container["format_name"].split(",")
        return "ogg" in format_names
|
||||||
|
||||||
class Webm(Video):
    """Video datatype registered under the ``webm`` extension."""

    file_ext = "webm"

    def sniff(self, filename: str) -> bool:
        """Return True when ``filename`` is recognised as a WebM file.

        With ffprobe available, the file must report the ``webm`` format name
        and contain at least one video stream whose codec is av1, vp8 or vp9.
        Without ffprobe, the magic-number check is used instead.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "webm")
        container, stream_list = ffprobe(filename)
        # Evaluate the codec test first, as the stream data is inspected
        # before the container's format names.
        has_web_codec = False
        for stream in stream_list:
            if stream["codec_type"] == "video" and stream["codec_name"] in ["av1", "vp8", "vp9"]:
                has_web_codec = True
                break
        format_names = container["format_name"].split(",")
        return "webm" in format_names and has_web_codec
|
||||||
|
||||||
class Mpg(Video):
    """Video datatype registered under the ``mpg`` extension."""

    file_ext = "mpg"

    def sniff(self, filename: str) -> bool:
        """Return True when ``filename`` is recognised as an MPEG video file.

        With ffprobe available, ``mpegvideo`` must be among the reported
        format names; otherwise the magic-number check is used.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "mpg")
        container, _streams = ffprobe(filename)
        format_names = container["format_name"].split(",")
        return "mpegvideo" in format_names
|
||||||
|
||||||
class Mov(Video):
    """Video datatype registered under the ``mov`` extension."""

    file_ext = "mov"

    def sniff(self, filename: str) -> bool:
        """Return True when ``filename`` is recognised as a MOV file.

        With ffprobe available, the file must both report the ``mov`` format
        name and pass the magic-number check. Without ffprobe, only the
        magic-number check is applied.
        """
        if which("ffprobe"):
            container, _streams = ffprobe(filename)
            format_names = container["format_name"].split(",")
            if "mov" not in format_names:
                return False
        return _get_file_format_from_magic_number(filename, "mov")
|
||||||
|
||||||
class Avi(Video):
    """Video datatype registered under the ``avi`` extension."""

    file_ext = "avi"

    def sniff(self, filename: str) -> bool:
        """Return True when ``filename`` is recognised as an AVI file.

        With ffprobe available, ``avi`` must be among the reported format
        names; otherwise the magic-number check is used.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "avi")
        container, _streams = ffprobe(filename)
        format_names = container["format_name"].split(",")
        return "avi" in format_names
|
||||||
|
||||||
class Wmv(Video):
    """Video datatype registered under the ``wmv`` extension."""

    file_ext = "wmv"

    def sniff(self, filename: str) -> bool:
        """Return True when ``filename`` is recognised as a WMV file.

        With ffprobe available, the file must report the ``asf`` format name
        and contain at least one stream of type ``video``. Without ffprobe,
        the magic-number check is used instead.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "wmv")
        container, stream_list = ffprobe(filename)
        codec_types = [stream["codec_type"] for stream in stream_list]
        has_video = "video" in codec_types
        format_names = container["format_name"].split(",")
        return "asf" in format_names and has_video
|
||||||
|
||||||
class Wma(Audio):
    """Audio datatype registered under the ``wma`` extension."""

    file_ext = "wma"

    def sniff(self, filename: str) -> bool:
        """Return True when ``filename`` is recognised as a WMA file.

        With ffprobe available, the file must report the ``asf`` format name
        and contain no stream of type ``video``. Without ffprobe, the
        magic-number check is used instead.
        """
        if not which("ffprobe"):
            return _get_file_format_from_magic_number(filename, "wma")
        container, stream_list = ffprobe(filename)
        codec_types = [stream["codec_type"] for stream in stream_list]
        audio_only = "video" not in codec_types
        format_names = container["format_name"].split(",")
        return "asf" in format_names and audio_only
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://en.wikipedia.org/wiki/List_of_file_signatures
6d is heic, not sure if you want to lump this in with mp4? (I assume you based this on files you had/have seen). Also might want to consider extending these magic numbers to include the whole string to avoid future false positives
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wanted to grab m4v file format.
https://www.garykessler.net/library/file_sigs.html
But you are right, it will be mixed up with HEIC, so I will change it to
66 74 79 70 6D 70 34 32
. Also, I will consider extending the magic numbers.