Skip to content

Commit

Permalink
Add dynamic rate control mode for audio decode.
Browse files Browse the repository at this point in the history
  • Loading branch information
Themaister committed Oct 15, 2023
1 parent 7ac1a10 commit f9b2b84
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 28 deletions.
170 changes: 142 additions & 28 deletions video/ffmpeg_decode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#ifdef HAVE_GRANITE_AUDIO
#include "audio_mixer.hpp"
#include "dsp/dsp.hpp"
#include "dsp/sinc_resampler.hpp"
#endif

extern "C"
Expand Down Expand Up @@ -81,16 +82,20 @@ static void free_av_objects(CodecStream &stream)
#ifdef HAVE_GRANITE_AUDIO
struct AVFrameRingStream final : Audio::MixerStream, Util::ThreadSafeIntrusivePtrEnabled<AVFrameRingStream>
{
AVFrameRingStream(float sample_rate, unsigned num_channels, double timebase);
AVFrameRingStream(float sample_rate, unsigned num_channels, double timebase, bool support_resample);
~AVFrameRingStream() override;

float sample_rate;
unsigned num_channels;
double timebase;
double inv_sample_rate_ns;

void set_rate_factor(float factor);
float get_rate_factor() const noexcept;

bool setup(float mixer_output_rate, unsigned mixer_channels, size_t max_num_frames) override;
size_t accumulate_samples(float * const *channels, const float *gain, size_t num_frames) noexcept override;
size_t accumulate_samples_inner(float * const *channels, const float *gain, size_t num_frames) noexcept;
unsigned get_num_channels() const override;
float get_sample_rate() const override;
void dispose() override;
Expand All @@ -105,12 +110,18 @@ struct AVFrameRingStream final : Audio::MixerStream, Util::ThreadSafeIntrusivePt
std::atomic_uint32_t read_count;
std::atomic_uint32_t read_frames_count;
std::atomic_uint32_t write_frames_count;
std::atomic_uint32_t rate_factor_u32;
std::atomic_bool complete;
int packet_frames = 0;
bool running_state = false;
unsigned get_num_buffered_audio_frames();
unsigned get_num_buffered_av_frames();

enum { MaxChannels = 8 };
std::unique_ptr<Audio::DSP::SincResampler> resamplers[MaxChannels];
std::vector<float> tmp_resampler_buffer[MaxChannels];
float *tmp_resampler_ptrs[MaxChannels] = {};

struct
{
double pts = -1.0;
Expand All @@ -124,8 +135,11 @@ struct AVFrameRingStream final : Audio::MixerStream, Util::ThreadSafeIntrusivePt
void mark_complete();
};

AVFrameRingStream::AVFrameRingStream(float sample_rate_, unsigned num_channels_, double timebase_)
: sample_rate(sample_rate_), num_channels(num_channels_), timebase(timebase_), inv_sample_rate_ns(1e9 / sample_rate)
AVFrameRingStream::AVFrameRingStream(float sample_rate_, unsigned num_channels_, double timebase_, bool support_resample)
: sample_rate(sample_rate_)
, num_channels(num_channels_)
, timebase(timebase_)
, inv_sample_rate_ns(1e9 / sample_rate)
{
for (auto &f : frames)
f = av_frame_alloc();
Expand All @@ -135,6 +149,27 @@ AVFrameRingStream::AVFrameRingStream(float sample_rate_, unsigned num_channels_,
write_frames_count = 0;
pts_index = 0;
complete = false;
set_rate_factor(1.0f);

if (support_resample)
for (unsigned i = 0; i < num_channels; i++)
resamplers[i] = std::make_unique<Audio::DSP::SincResampler>(sample_rate, sample_rate, Audio::DSP::SincResampler::Quality::High);
}

// Publishes a new playback rate factor.
// The reciprocal (1 / factor) is what actually gets stored: its raw float bits are copied
// into the 32-bit atomic rate_factor_u32, which get_rate_factor() reads back.
// The store uses relaxed ordering, so it does not order any other memory accesses.
void AVFrameRingStream::set_rate_factor(float factor)
{
	static_assert(sizeof(float) == sizeof(uint32_t), "float bits must fit exactly in a uint32_t.");

	const float inverse = 1.0f / factor;

	// Bit-copy rather than cast, the atomic carries the float's representation verbatim.
	uint32_t bits;
	memcpy(&bits, &inverse, sizeof(bits));
	rate_factor_u32.store(bits, std::memory_order_relaxed);
}

// Reads back the value last published by set_rate_factor().
// Note that this is the stored reciprocal (1 / factor), not the factor that was passed in.
// The load uses relaxed ordering.
float AVFrameRingStream::get_rate_factor() const noexcept
{
	const uint32_t bits = rate_factor_u32.load(std::memory_order_relaxed);

	// Reinterpret the raw bits as a float via memcpy.
	float value;
	memcpy(&value, &bits, sizeof(value));
	return value;
}

void AVFrameRingStream::mark_uncorked_audio_pts()
Expand All @@ -146,10 +181,22 @@ void AVFrameRingStream::mark_uncorked_audio_pts()
progress[index].sampled_ns = Util::get_current_time_nsecs();
}

bool AVFrameRingStream::setup(float, unsigned mixer_channels, size_t)
bool AVFrameRingStream::setup(float, unsigned mixer_channels, size_t num_frames)
{
// TODO: Could promote mono to stereo.
return mixer_channels == num_channels;
if (mixer_channels != num_channels)
return false;

for (unsigned i = 0; i < MaxChannels; i++)
{
if (resamplers[i])
{
tmp_resampler_buffer[i].resize(num_frames * 2); // Maximum ratio distortion is 1.5x.
tmp_resampler_ptrs[i] = tmp_resampler_buffer[i].data();
}
}

return true;
}

void AVFrameRingStream::dispose()
Expand All @@ -168,6 +215,41 @@ unsigned AVFrameRingStream::get_num_channels() const
}

// MixerStream::accumulate_samples override.
//
// Without resamplers, the request is forwarded unchanged to accumulate_samples_inner().
// With resamplers, the input the resampler may need for num_frames output frames is first
// pulled into the per-channel scratch buffers, then resampled and accumulated into channels.
//
// channels: one output pointer per channel, accumulated into.
// gain: per-channel gain, passed through to accumulate_samples_inner().
// num_frames: number of output frames requested.
// Returns the inner call's result on the non-resampling path. On the resampling path returns
// num_frames, or 0 once the stream is marked complete and the inner call produced nothing.
size_t AVFrameRingStream::accumulate_samples(float *const *channels, const float *gain, size_t num_frames) noexcept
{
	if (!resamplers[0])
	{
		// The third argument is a frame count, so forward num_frames (not num_channels).
		return accumulate_samples_inner(channels, gain, num_frames);
	}

	float ratio = get_rate_factor();

	// Safeguard when we're starting to hit underruns.
	if (get_num_buffered_av_frames() <= 1 && ratio > 1.0f)
		ratio = 1.0f;

	for (unsigned i = 0; i < num_channels; i++)
		resamplers[i]->set_sample_rate_ratio(ratio);

	// All channels share the same ratio, so channel 0 is queried for the input requirement.
	size_t required = resamplers[0]->get_maximum_input_for_output_frames(num_frames);
	for (unsigned i = 0; i < num_channels; i++)
	{
		// Scratch buffers are sized in setup(); the request must fit.
		assert(required <= tmp_resampler_buffer[i].size());
		// Should have a no-accumulation variant, but eeeeeeh.
		// We need to clear out to zero anyway for underruns, etc.
		memset(tmp_resampler_ptrs[i], 0, required * sizeof(float));
	}

	size_t accum = accumulate_samples_inner(tmp_resampler_ptrs, gain, required);

	for (unsigned i = 0; i < num_channels; i++)
	{
		// NOTE(review): the ratio was already set above; this second call is kept as-is,
		// confirm whether it is required by SincResampler or merely redundant.
		resamplers[i]->set_sample_rate_ratio(ratio);
		resamplers[i]->process_and_accumulate_output_frames(channels[i], tmp_resampler_ptrs[i], num_frames);
	}

	// Report 0 only when the stream is complete and nothing more was read.
	return (complete.load(std::memory_order_relaxed) && accum == 0) ? 0 : num_frames;
}

size_t AVFrameRingStream::accumulate_samples_inner(float *const *channels, const float *gain, size_t num_frames) noexcept
{
// Hold back playback until we have buffered enough to avoid instant underrun.
uint32_t written_count = write_count.load(std::memory_order_acquire);
Expand Down Expand Up @@ -387,6 +469,7 @@ struct VideoDecoder::Impl
double get_audio_buffering_duration();
double get_last_video_buffering_pts();
double get_estimated_audio_playback_timestamp_raw();
void latch_audio_presentation_target(double pts);

bool acquire_video_frame(VideoFrame &frame);
int try_acquire_video_frame(VideoFrame &frame);
Expand Down Expand Up @@ -807,7 +890,8 @@ void VideoDecoder::Impl::begin_audio_stream()
#else
audio.av_ctx->ch_layout.nb_channels,
#endif
av_q2d(audio.av_stream->time_base));
av_q2d(audio.av_stream->time_base),
opts.realtime);

stream->add_reference();
stream_id = mixer->add_mixer_stream(stream, !is_paused);
Expand Down Expand Up @@ -1775,7 +1859,16 @@ double VideoDecoder::Impl::get_estimated_audio_playback_timestamp_raw()

double pts = stream->progress[pts_buffer_index].pts;
if (pts < 0.0)
{
pts = 0.0;
}
else if (!is_paused)
{
// Crude estimate based on last reported PTS, offset by time since reported.
int64_t sampled_ns = stream->progress[pts_buffer_index].sampled_ns;
int64_t d = std::max<int64_t>(Util::get_current_time_nsecs(), sampled_ns) - sampled_ns;
pts += 1e-9 * double(d);
}

return pts;
}
Expand All @@ -1798,6 +1891,37 @@ double VideoDecoder::Impl::get_audio_buffering_duration()
#endif
}

// Steers the audio playback rate towards a presentation target.
//
// Compares pts against the estimated raw audio playback timestamp and updates the audio
// stream's rate factor: a fixed catch-up or slow-down rate when the difference exceeds the
// threshold, otherwise a small correction proportional to the difference.
// Does nothing when there is no audio stream, or when built without HAVE_GRANITE_AUDIO.
//
// pts: target timestamp; presumably in seconds like the audio estimate - TODO confirm with callers.
void VideoDecoder::Impl::latch_audio_presentation_target(double pts)
{
#ifdef HAVE_GRANITE_AUDIO
	if (!stream)
		return;

	// Differences beyond this magnitude switch from proportional to fixed-rate correction.
	constexpr float ResyncThreshold = 0.1f;
	constexpr float CatchUpRate = 1.2f;
	constexpr float SlowDownRate = 0.995f;
	constexpr float ProportionalGain = 0.02f;

	double raw_pts = get_estimated_audio_playback_timestamp_raw();
	auto delta = float(pts - raw_pts);

	LOGI("Delta: %.4f\n", delta);

	if (delta > ResyncThreshold)
	{
		// Need to catch up.
		stream->set_rate_factor(CatchUpRate);
	}
	else if (delta < -ResyncThreshold)
	{
		// Shouldn't really happen in real-time mode ... Slow down a little to let us catch up.
		stream->set_rate_factor(SlowDownRate);
	}
	else
	{
		// Max rate distortion: 0.2% (3.45 cents).
		// This is inaudible in practice. Practical distortion will be much lower.
		// And should be less than 1 cent on average.
		stream->set_rate_factor(1.0f + delta * ProportionalGain);
	}
#else
	// No audio stream to steer in this configuration; avoid an unused-parameter warning.
	(void)pts;
#endif
}

double VideoDecoder::Impl::get_last_video_buffering_pts()
{
double last_pts = -1.0;
Expand All @@ -1814,36 +1938,21 @@ double VideoDecoder::Impl::get_estimated_audio_playback_timestamp(double elapsed
#ifdef HAVE_GRANITE_AUDIO
if (stream)
{
uint32_t pts_buffer_index = (stream->pts_index.load(std::memory_order_acquire) - 1) %
AVFrameRingStream::Frames;
// Unsmoothed PTS.
auto pts = get_estimated_audio_playback_timestamp_raw();

double pts = stream->progress[pts_buffer_index].pts;
if (pts < 0.0)
{
pts = 0.0;
smooth_elapsed = 0.0;
smooth_pts = 0.0;
}
else if (!is_paused)
{
// Crude estimate based on last reported PTS, offset by time since reported.
int64_t sampled_ns = stream->progress[pts_buffer_index].sampled_ns;
int64_t d = std::max<int64_t>(Util::get_current_time_nsecs(), sampled_ns) - sampled_ns;
pts += 1e-9 * double(d);
}

// Smooth out the reported PTS.
// The reported PTS should be tied to the host timer,
// but we need to gradually adjust the timer based on the reported audio PTS to be accurate.

if (smooth_elapsed == 0.0)
if (pts == 0.0)
{
// Latch the PTS.
smooth_elapsed = elapsed_time;
smooth_pts = pts;
}
else
{
// Smooth out the reported PTS.
// The reported PTS should be tied to the host timer,
// but we need to gradually adjust the timer based on the reported audio PTS to be accurate over time.

// This is the value we should get in principle if everything is steady.
smooth_pts += elapsed_time - smooth_elapsed;
smooth_elapsed = elapsed_time;
Expand Down Expand Up @@ -2131,4 +2240,9 @@ void VideoDecoder::release_video_frame(unsigned index, Vulkan::Semaphore sem)
{
impl->release_video_frame(index, std::move(sem));
}

// Public entry point: forwards the presentation target to the private implementation,
// which adjusts the audio playback rate factor based on it.
void VideoDecoder::latch_audio_presentation_target(double pts)
{
impl->latch_audio_presentation_target(pts);
}
}
4 changes: 4 additions & 0 deletions video/ffmpeg_decode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ class VideoDecoder

void release_video_frame(unsigned index, Vulkan::Semaphore sem);

// Should be called after every acquire in realtime mode.
// Lets audio buffer speed up or slow down appropriately to try to match video.
void latch_audio_presentation_target(double pts);

private:
struct Impl;
std::unique_ptr<Impl> impl;
Expand Down

0 comments on commit f9b2b84

Please sign in to comment.