Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Dataset.example_media and cache Datacards #84

Merged
merged 14 commits into from
May 8, 2024
284 changes: 176 additions & 108 deletions audbcards/core/datacard.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import audiofile
import audplot

from audbcards.core.config import config
from audbcards.core.dataset import Dataset
from audbcards.core.utils import set_plot_margins

Expand Down Expand Up @@ -41,14 +42,19 @@ class Datacard(object):
a call to :meth:`audbcards.Datacard.player`
will store an example audio file
under
``<sphinx_build_dir>/<path>/<db-name>/<media-file-in-db>``
``<sphinx_build_dir>/<path>/<dataset-name>/``
sphinx_src_dir: source dir of sphinx.
If not ``None``
and ``example`` is ``True``,
a call to :meth:`audbcards.Datacard.player`
will store a wavplot of the example audio file
will store a waveform plot of the example audio file
under
``<sphinx_src_dir>/<path>/<db-name>/<db-name>.png``
``<sphinx_src_dir>/<path>/<dataset-name>/``
cache_root: cache folder.
If ``None``,
the environmental variable ``AUDBCARDS_CACHE_ROOT``,
or :attr:`audbcards.config.CACHE_ROOT`
is used

"""

Expand All @@ -60,6 +66,7 @@ def __init__(
example: bool = True,
sphinx_build_dir: str = None,
sphinx_src_dir: str = None,
cache_root: str = None,
):
self.dataset = dataset
"""Dataset object."""
Expand All @@ -76,6 +83,11 @@ def __init__(
self.sphinx_src_dir = sphinx_src_dir
"""Sphinx source dir."""

if cache_root is None:
cache_root = os.environ.get("AUDBCARDS_CACHE_ROOT") or config.CACHE_ROOT
self.cache_root = audeer.mkdir(cache_root)
r"""Cache root folder."""

self.rst_preamble = ""
"""RST code added at top of data card."""

Expand All @@ -84,47 +96,6 @@ def content(self):
"""Property Accessor for rendered jinja2 content."""
return self._render_template()

@property
def example_media(self) -> typing.Optional[str]:
r"""Select example media file.

This select a media file
based on the median duration
of all files
between 0.5 s and 300 s
and downloads it to the cache.

"""
# Pick a meaningful duration for the example audio file
min_dur = 0.5
max_dur = 300 # 5 min
durations = self.dataset.file_durations
selected_durations = [d for d in durations if d >= min_dur and d <= max_dur]
if len(selected_durations) == 0:
return None
selected_duration = np.median(selected_durations)

# Get index for duration closest to selected duration
# see https://stackoverflow.com/a/9706105
# durations.index(selected_duration)
# is an alternative but fails due to rounding errors
index = min(
range(len(durations)),
key=lambda n: abs(durations[n] - selected_duration),
)
# Download of example data might fail
try:
media = self.dataset.deps.media[index]
audb.load_media(
self.dataset.name,
media,
version=self.dataset.version,
verbose=False,
)
except: # noqa: E722
media = None
return media

@property
def file_duration_distribution(self) -> str:
r"""Minimum and maximum of files durations, and plotted distribution.
Expand All @@ -133,17 +104,32 @@ def file_duration_distribution(self) -> str:
containing the mininimum and maximum values
of files durations.

If :attr:`audbcards.Datacard.sphinx_src_dir` is set
If :attr:`audbcards.Datacard.sphinx_src_dir` is not ``None``
(e.g. when used in the sphinx extension),
an inline image is stored
in the sphinx source folder
under ``<dataset-name>/<dataset-name>-file-durations.png``
and displayed
an image is stored in the file
``<dataset-name>-<dataset-version>-file-duration-distribution.png``,
which is cached in
``<cache-root>/<dataset-name>/<dataset-version>/``
and copied to the sphinx source folder
into
``<sphinx-src-dir>/<path><dataset-name>/``.
The image is displayed inline
between the minimum and maximum values.
If all duration values are the same,
no distribution plot is created.

"""
file_name = (
f"{self.dataset.name}-{self.dataset.version}-file-duration-distribution.png"
)
# Cache is organized as `<cache_root>/<name>/<version>/`
cache_file = audeer.path(
self.cache_root,
self.dataset.name,
self.dataset.version,
file_name,
)

min_ = 0
max_ = 0
unit = "s"
Expand All @@ -161,88 +147,174 @@ def file_duration_distribution(self) -> str:

# Save distribution plot
if self.sphinx_src_dir is not None:
self._plot_distribution(durations)
name = "file-durations"
# Plot distribution to cache,
# if not found there already.
if not os.path.exists(cache_file):
audeer.mkdir(os.path.dirname(cache_file))
self._plot_distribution(durations)
plt.savefig(cache_file, transparent=True)
plt.close()

image_file = audeer.path(
self.sphinx_src_dir,
self.path,
self.dataset.name,
f"{self.dataset.name}-{name}.png",
file_name,
)
audeer.mkdir(os.path.dirname(image_file))
plt.savefig(image_file, transparent=True)
plt.close()
shutil.copyfile(cache_file, image_file)
distribution_str = self._inline_image(
f"{min_:.1f} {unit}",
f"./{self.dataset.name}/{self.dataset.name}-{name}.png",
f"./{self.dataset.name}/{file_name}",
f"{max_:.1f} {unit}",
)

return distribution_str

def player(
self,
file: str = None,
) -> str:
def player(self) -> str:
r"""Create an audio player showing the waveform.

Args:
file: input audio file to be used in the player.
If ``None``,
:attr:`audbcards.Datacard.example_media`
is used
If :attr:`audbcards.Datacard.sphinx_build_dir`
or :attr:`audbcards.Datacard.sphinx_src_dir`
is not ``None``,
an example media file is cached in the folder
``<dataset-name>-<dataset-version>-player-media/``
inside
``<cache-root>/<dataset-name>/<dataset-version>/``,
using the same sub-folder structure
as the media file has inside its dataset.
If :attr:`audbcards.Datacard.sphinx_build_dir`
is not ``None``,
the media sub-folder structure
is also copied
to the sphinx build dir into
``<sphinx-build-dir>/<path>/<dataset-name>/``,
and an audio element referencing this file
is added to the returned RST string.

If :attr:`audbcards.Datacard.sphinx_src_dir` is not ``None``,
a plot of the waveform of the media file
is cached under
``<dataset-name>-<dataset-version>-player-waveform.png``
inside
``<cache-root>/<dataset-name>/<dataset-version>/``.
It is also copied to the sphinx source folder into
``<sphinx-src-dir>/<path>/<dataset-name>/``,
and referenced at the beginning of the returned RST string.

If :attr:`audbcards.Datacard.sphinx_build_dir`
and :attr:`audbcards.Datacard.sphinx_src_dir`
are ``None``,
an empty string is returned.

"""
if file is None:
file = self.example_media
Returns:
String containing RST code to include the player

# use audb cache instead of dataset.cache_root
media_src_dir = (
f"{audb.default_cache_root()}/"
f"{audb.flavor_path(self.dataset.name, self.dataset.version)}"
"""
# Cache is organized as `<cache_root>/<name>/<version>/`
cache_folder = audeer.path(
self.cache_root,
self.dataset.name,
self.dataset.version,
)

# Move file to build folder
def load_media_to_cache() -> str:
r"""Load media file with audb and copy to audbcards cache.

Load example media file to cache,
if not existent.

Returns:
full path to media file in cache

"""
cache_example_media = audeer.path(
cache_folder,
f"{self.dataset.name}-{self.dataset.version}-player-media",
self.dataset.example_media,
)
if not os.path.exists(cache_example_media):
ChristianGeng marked this conversation as resolved.
Show resolved Hide resolved
media = audb.load_media(
self.dataset.name,
self.dataset.example_media,
version=self.dataset.version,
verbose=False,
)[0]
audeer.mkdir(os.path.dirname(cache_example_media))
shutil.copy(media, cache_example_media)
return cache_example_media

def plot_waveform_to_cache(cache_example_media: str) -> str:
r"""Plot waveform of example media to cache.

Args:
cache_example_media: full path to media file in cache

Returns:
full path to waveform file in cache

"""
cache_waveform_file = audeer.path(
cache_folder,
f"{self.dataset.name}-{self.dataset.version}-player-waveform.png",
)
if not os.path.exists(cache_waveform_file):
signal, sampling_rate = audiofile.read(
ChristianGeng marked this conversation as resolved.
Show resolved Hide resolved
cache_example_media,
always_2d=True,
)
audeer.mkdir(os.path.dirname(cache_waveform_file))
plt.figure(figsize=[3, 0.5])
ax = plt.subplot(111)
audplot.waveform(signal[0, :], ax=ax)
set_plot_margins()
plt.savefig(cache_waveform_file)
plt.close()
return cache_waveform_file

# String holding the RST code to include the player
player_str = ""

# Add plot of waveform to Sphinx source folder (e.g. docs/)
if self.sphinx_src_dir is not None:
cache_example_media = load_media_to_cache()
cache_waveform_file = plot_waveform_to_cache(cache_example_media)
plot_dst_dir = audeer.path(
self.sphinx_src_dir,
self.path,
self.dataset.name,
)
audeer.mkdir(plot_dst_dir)
shutil.copy(
cache_waveform_file,
os.path.join(plot_dst_dir, os.path.basename(cache_waveform_file)),
)
waveform_src = (
f"./{self.dataset.name}/{os.path.basename(cache_waveform_file)}"
)
player_str += f".. image:: {waveform_src}\n\n"

# Copy media file to Sphinx build folder (e.g. build/)
if self.sphinx_build_dir is not None:
cache_example_media = load_media_to_cache()
media_dst_dir = audeer.path(
self.sphinx_build_dir,
self.path,
self.dataset.name,
)
audeer.mkdir(os.path.join(media_dst_dir, os.path.dirname(file)))
audeer.mkdir(media_dst_dir, os.path.dirname(self.dataset.example_media))
shutil.copy(
os.path.join(media_src_dir, file),
os.path.join(media_dst_dir, file),
cache_example_media,
os.path.join(media_dst_dir, self.dataset.example_media),
)

# Add plot of waveform
if self.sphinx_src_dir is not None:
signal, sampling_rate = audiofile.read(
os.path.join(media_src_dir, file),
always_2d=True,
)
image_file = audeer.path(
self.sphinx_src_dir,
self.path,
self.dataset.name,
f"{self.dataset.name}.png",
player_src = f"./{self.dataset.name}/{self.dataset.example_media}"
player_str += (
".. raw:: html\n"
"\n"
f' <p><audio controls src="{player_src}"></audio></p>'
)
audeer.mkdir(os.path.dirname(image_file))
plt.figure(figsize=[3, 0.5])
ax = plt.subplot(111)
audplot.waveform(signal[0, :], ax=ax)
set_plot_margins()
plt.savefig(image_file)
plt.close()

player_src = f"./{self.dataset.name}/{file}"
player_str = (
f".. image:: ./{self.dataset.name}/{self.dataset.name}.png\n"
"\n"
".. raw:: html\n"
"\n"
f' <p><audio controls src="{player_src}"></audio></p>'
)

return player_str

def save(self, file: str = None):
Expand Down Expand Up @@ -365,14 +437,10 @@ def _expand_dataset(
"""
# Add path of datacard folder
dataset["path"] = self.path
# Add audio player for example file
dataset["example"] = None
if self.example:
example = self.example_media
if example is not None:
player = self.player(example)
if self.dataset.example_media is not None:
player = self.player()
dataset["player"] = player
dataset["example"] = example
dataset["file_duration_distribution"] = self.file_duration_distribution
return dataset

Expand Down
Loading
Loading