Skip to content

Commit

Permalink
Convert audio files (mp3/flac/ogg) to 16khz wav
Browse files Browse the repository at this point in the history
It's no longer necessary to run sox or ffmpeg beforehand when using the
whisperfile command. If your audio file isn't in the preferred format,
it'll be converted for you automatically using the embedded audio tools.
  • Loading branch information
jart committed Aug 19, 2024
1 parent 2043660 commit dc99002
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 17 deletions.
5 changes: 4 additions & 1 deletion whisper.cpp/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,12 @@ o/$(MODE)/whisper.cpp/main: \
o/$(MODE)/whisper.cpp/main.1.asc.zip.o \
o/$(MODE)/whisper.cpp/whisper.cpp.a \
o/$(MODE)/llama.cpp/llama.cpp.a \
o/$(MODE)/stb/stb.a \

o/$(MODE)/whisper.cpp/miniaudio.o: private COPTS += -O3

$(WHISPER_CPP_OBJS): whisper.cpp/BUILD.mk

.PHONY: o/$(MODE)/whisper.cpp
o/$(MODE)/whisper.cpp: \
o/$(MODE)/whisper.cpp/main
o/$(MODE)/whisper.cpp/main \
1 change: 1 addition & 0 deletions whisper.cpp/README.llamafile
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ ORIGIN
LOCAL MODIFICATIONS

- Integrate with llamafile file loader
- Automatically convert MP3/FLAC/OGG to WAV
142 changes: 130 additions & 12 deletions whisper.cpp/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#define _USE_MATH_DEFINES // for M_PI

#include "llamafile/log.h"
#include "llamafile/llamafile.h"
#include "common.h"

// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
// #define DR_WAV_IMPLEMENTATION // [jart] comment out
#include "dr_wav.h"

#if defined(_MSC_VER)
Expand All @@ -18,6 +20,23 @@
#include <io.h>
#endif

#include <cosmo.h>
#include <stdlib.h>
#include <unistd.h>

#include "stb/stb_vorbis.h"
#include "miniaudio.h"

#define MA_DATA_CONVERTER_STACK_BUFFER_SIZE 4096

static std::string delete_me;

// atexit() handler: removes the temporary file whose path is recorded in
// delete_me. Does nothing when no path has been recorded.
static void on_exit(void) {
    if (delete_me.empty())
        return;
    unlink(delete_me.c_str());
}

bool is_wav_buffer(const std::string buf) {
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
Expand All @@ -33,10 +52,103 @@ bool is_wav_buffer(const std::string buf) {
return true;
}

bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
// Streams every PCM frame from pDecoder into pEncoder.
//
// Frames are pulled in chunks that fit a fixed-size stack buffer of
// MA_DATA_CONVERTER_STACK_BUFFER_SIZE bytes and are written to the
// encoder as they arrive.
//
// Returns MA_SUCCESS once the decoder has been drained, otherwise the
// first error reported by either the decoder or the encoder.
static ma_result perform_audio_conversion(ma_decoder* pDecoder, ma_encoder* pEncoder) {
    ma_uint8 pRawData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];

    // the output format is fixed for the decoder's lifetime, so the
    // chunk size only needs to be computed once
    const ma_uint64 bytesPerFrame =
        ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels);
    if (bytesPerFrame == 0) {
        return MA_INVALID_ARGS; // would otherwise divide by zero
    }
    const ma_uint64 framesPerChunk = sizeof(pRawData) / bytesPerFrame;
    if (framesPerChunk == 0) {
        return MA_INVALID_ARGS; // a single frame doesn't fit in the buffer
    }

    for (;;) {
        ma_uint64 framesRead = 0;
        const ma_result readResult =
            ma_decoder_read_pcm_frames(pDecoder, pRawData, framesPerChunk, &framesRead);

        // miniaudio reports a drained stream as MA_AT_END; that is the
        // normal way for the loop to finish, not a conversion failure
        if (readResult != MA_SUCCESS && readResult != MA_AT_END) {
            return readResult;
        }

        // flush whatever was decoded, even on the final short read
        if (framesRead > 0) {
            const ma_result writeResult =
                ma_encoder_write_pcm_frames(pEncoder, pRawData, framesRead, nullptr);
            if (writeResult != MA_SUCCESS) {
                return writeResult; // e.g. disk full; don't report success
            }
        }

        if (readResult == MA_AT_END || framesRead < framesPerChunk) {
            return MA_SUCCESS;
        }
    }
}

// Converts an audio file to a signed 16-bit wav at COMMON_SAMPLE_RATE.
//
// The input is decoded with miniaudio and re-encoded into a randomly
// named "whisperfile.<number>.wav" file in the temp directory.
//
// fname:  path of the audio file to convert
// stereo: when true the output has two channels, otherwise one
//
// Returns the path of the newly written wav file, or an empty string on
// failure (a diagnostic is printed to stderr). On success the path is
// recorded in delete_me so the on_exit handler removes it when the
// process exits.
//
// NOTE(review): delete_me holds a single path, so only the most recently
// converted file is removed at exit.
static std::string convert_audio_file(const std::string & fname, bool stereo) {

    // create temporary filename
    std::string newpath;
    newpath = __get_tmpdir();
    newpath += "/whisperfile.";
    newpath += std::to_string(_rand64());
    newpath += ".wav";

    // create decoder
    ma_decoder_config decoderConfig =
        ma_decoder_config_init(ma_format_s16, stereo ? 2 : 1, COMMON_SAMPLE_RATE);
    decoderConfig.resampling.algorithm = ma_resample_algorithm_linear;
    decoderConfig.resampling.linear.lpfOrder = 8;

    // open input file
    ma_decoder decoder;
    ma_result rc = ma_decoder_init_file(fname.c_str(), &decoderConfig, &decoder);
    if (rc != MA_SUCCESS) {
        fprintf(stderr, "%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n",
                fname.c_str(), ma_result_description(rc));
        return "";
    }

    // create encoder
    ma_encoder encoder;
    ma_encoder_config encoderConfig = ma_encoder_config_init(
        ma_encoding_format_wav,
        decoder.outputFormat,
        decoder.outputChannels,
        decoder.outputSampleRate);
    rc = ma_encoder_init_file(newpath.c_str(), &encoderConfig, &encoder);
    if (rc != MA_SUCCESS) {
        ma_decoder_uninit(&decoder);
        unlink(newpath.c_str()); // the output may have been created before the failure
        fprintf(stderr, "%s: failed to open output file: %s\n",
                newpath.c_str(), ma_result_description(rc));
        return "";
    }

    // perform the conversion
    rc = perform_audio_conversion(&decoder, &encoder);
    ma_encoder_uninit(&encoder);
    ma_decoder_uninit(&decoder);
    if (rc != MA_SUCCESS) {
        unlink(newpath.c_str()); // don't leave a partial wav behind in the temp directory
        fprintf(stderr, "%s: failed to convert audio file: %s\n",
                fname.c_str(), ma_result_description(rc));
        return "";
    }

    // remember the new path for cleanup; the handler only needs to be
    // registered once because it always reads the current delete_me, and
    // the number of atexit() registrations is limited
    delete_me = newpath;
    static bool cleanup_registered = false;
    if (!cleanup_registered) {
        atexit(on_exit);
        cleanup_registered = true;
    }
    return newpath;
}

// Retries the enclosing function with a converted copy of the input.
//
// Must be expanded inside a function that returns bool and has `fname`
// (std::string), `stereo` and `did_conversion` (bool) in scope, plus a
// `TryAgain` label to jump back to. The first expansion that runs
// converts `fname` with convert_audio_file(), points `fname` at the
// result and jumps to TryAgain. If a conversion was already done, or the
// conversion itself fails, the enclosing function returns false.
#define TRY_CONVERSION \
    do { \
        if (did_conversion) { \
            fprintf(stderr, "error: failed to open audio file\n"); \
            return false; \
        } \
        const std::string converted_path = convert_audio_file(fname, stereo); \
        if (converted_path.empty()) \
            return false; \
        did_conversion = true; \
        fname = converted_path; \
        goto TryAgain; \
    } while (0)

bool read_wav(const std::string & fname_, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin
std::string fname = fname_;
bool did_conversion = false;

TryAgain:
if (fname == "-") {
{
#ifdef _WIN32
Expand Down Expand Up @@ -68,32 +180,38 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
}
}
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
tinylogf("%s: converting to wav...\n", fname.c_str());
TRY_CONVERSION;
}

if (stereo && wav.channels < 2) {
fprintf(stderr, "%s: audio file must be stereo for diarization\n", fname.c_str());
drwav_uninit(&wav);
return false;
}

if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
tinylogf("%s: audio file has %d channels\n", fname.c_str(), wav.channels);
drwav_uninit(&wav);
return false;
TRY_CONVERSION;
}

if (stereo && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
tinylogf("%s: audio file has %d channels (we want diarization)\n", fname.c_str(), wav.channels);
drwav_uninit(&wav);
return false;
TRY_CONVERSION;
}

if (wav.sampleRate != COMMON_SAMPLE_RATE) {
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
tinylogf("%s: audio file has %d sample rate\n", fname.c_str(), wav.sampleRate);
drwav_uninit(&wav);
return false;
TRY_CONVERSION;
}

if (wav.bitsPerSample != 16) {
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
tinylogf("%s: audio file has %d bits per sample\n", fname.c_str(), wav.bitsPerSample);
drwav_uninit(&wav);
return false;
TRY_CONVERSION;
}

const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
Expand Down Expand Up @@ -171,7 +289,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
energy_last /= n_samples_last;

if (verbose) {
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
tinylogf("%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
}

if (energy_last > vad_thold*energy_all) {
Expand Down
2 changes: 2 additions & 0 deletions whisper.cpp/dr_wav.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
5 changes: 4 additions & 1 deletion whisper.cpp/main.1
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ Puts program in HTTP server mode.
Path of Whisper model weights. See
https://huggingface.co/ggerganov/whisper.cpp
.It Fl f Ar FNAME , Fl Fl file Ar FNAME
Path of WAV file to transcribe.
Path of audio file to transcribe. The preferred audio format is a 16kHz
16-bit signed linear WAV file, which can be stereo or mono. It's also
permissible to pass an MP3, FLAC, or OGG file, in which case it'll be
converted to a .wav file in your temp directory before transcribing.
.It Fl tr , Fl Fl translate
Translate audio into English text.
.It Fl ot Ar N , Fl Fl offset-t Ar N
Expand Down
11 changes: 8 additions & 3 deletions whisper.cpp/main.1.asc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@
face.co/ggerganov/whisper.cpp
-f FNAME, --file FNAME
Path of WAV file to transcribe.
Path of audio file to transcribe. The preferred audio format is
a 16khz 16-bit signed linear WAV file, which can be stereo or
mono. It's also permissible to pass an MP3, FLAC, or OGG file,
in which case it'll be converted to .wav file in your temp di‐
rectory before transcribing.
-tr, --translate
Translate audio into English text.
Expand All @@ -47,8 +51,9 @@
-pc, --print-colors
Enables CLI printing of ANSI color codes.
Transcribed text will appear in the terminal on a spectrum of
color ranging from green to red. Green means the model
Transcribed text will appear in the terminal on a spectrum of
color ranging from green to red. Green represents confidence
whereas red represents uncertainty.
-t N, --threads N
Overrides number of threads to use.
Expand Down
1 change: 1 addition & 0 deletions whisper.cpp/miniaudio.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "stb/stb_vorbis.h"

#define MA_NO_DEVICE_IO
#define MA_NO_RUNTIME_LINKING
#define MINIAUDIO_IMPLEMENTATION
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#include "miniaudio.h"

0 comments on commit dc99002

Please sign in to comment.