Convert audio files (mp3/flac/ogg) to 16khz wav

It's no longer necessary to run sox or ffmpeg beforehand when using the whisperfile command. If your audio file isn't in the preferred format, it'll be converted for you automatically using the embedded audio tools.
Mozilla-Ocho · Aug 19, 2024 · dc99002 · dc99002
1 parent 2043660
commit dc99002
Show file tree

Hide file tree

Showing 7 changed files with 150 additions and 17 deletions.
diff --git a/whisper.cpp/BUILD.mk b/whisper.cpp/BUILD.mk
@@ -34,9 +34,12 @@ o/$(MODE)/whisper.cpp/main:				\
 		o/$(MODE)/whisper.cpp/main.1.asc.zip.o	\
 		o/$(MODE)/whisper.cpp/whisper.cpp.a	\
 		o/$(MODE)/llama.cpp/llama.cpp.a		\
+		o/$(MODE)/stb/stb.a			\
+
+o/$(MODE)/whisper.cpp/miniaudio.o: private COPTS += -O3
 
 $(WHISPER_CPP_OBJS): whisper.cpp/BUILD.mk
 
 .PHONY: o/$(MODE)/whisper.cpp
 o/$(MODE)/whisper.cpp:					\
-		o/$(MODE)/whisper.cpp/main
+		o/$(MODE)/whisper.cpp/main		\
diff --git a/whisper.cpp/README.llamafile b/whisper.cpp/README.llamafile
@@ -15,3 +15,4 @@ ORIGIN
 LOCAL MODIFICATIONS
 
   - Integrate with llamafile file loader
+  - Automatically convert MP3/FLAC/OGG to WAV
diff --git a/whisper.cpp/common.cpp b/whisper.cpp/common.cpp
@@ -2,11 +2,13 @@
 // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 #define _USE_MATH_DEFINES // for M_PI
 
+#include "llamafile/log.h"
+#include "llamafile/llamafile.h"
 #include "common.h"
 
 // third-party utilities
 // use your favorite implementations
-#define DR_WAV_IMPLEMENTATION
+// #define DR_WAV_IMPLEMENTATION // [jart] comment out
 #include "dr_wav.h"
 
 #if defined(_MSC_VER)
@@ -18,6 +20,23 @@
 #include <io.h>
 #endif
 
+#include <cosmo.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "stb/stb_vorbis.h"
+#include "miniaudio.h"
+
+#define MA_DATA_CONVERTER_STACK_BUFFER_SIZE 4096
+
+static std::string delete_me;
+
+// exit hook: removes the temporary file named by delete_me, if there is one
+static void on_exit(void) {
+    if (delete_me.empty())
+        return;
+    unlink(delete_me.c_str());
+}
+
 bool is_wav_buffer(const std::string buf) {
     // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
     // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
@@ -33,10 +52,103 @@ bool is_wav_buffer(const std::string buf) {
     return true;
 }
 
-bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
+// Streams PCM frames from pDecoder into pEncoder, one stack buffer at a time,
+// until the decoder has nothing left to give.
+//
+// Returns MA_SUCCESS once the decoder is drained, otherwise the first error
+// reported by either the decoder or the encoder. Neither object is
+// uninitialized here; that's left to the caller.
+static ma_result perform_audio_conversion(ma_decoder* pDecoder, ma_encoder* pEncoder) {
+    ma_uint8 pRawData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
+
+    // nothing in the loop modifies the decoder's output format or channel
+    // count, so the number of frames that fit in the buffer is computed once
+    const ma_uint64 framesPerChunk =
+            sizeof(pRawData) / ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels);
+
+    for (;;) {
+        ma_uint64 framesRead = 0;
+        ma_result rc = ma_decoder_read_pcm_frames(pDecoder, pRawData, framesPerChunk, &framesRead);
+
+        // MA_AT_END is how the decoder reports a read that produced no frames
+        // because the stream is exhausted (e.g. the audio length is an exact
+        // multiple of the chunk size); that's normal termination, not a
+        // conversion failure
+        const bool drained = rc == MA_AT_END;
+        if (rc != MA_SUCCESS && !drained) {
+            return rc;
+        }
+
+        if (framesRead > 0) {
+            // propagate write errors (e.g. disk full) rather than silently
+            // producing a truncated wav file
+            rc = ma_encoder_write_pcm_frames(pEncoder, pRawData, framesRead, NULL);
+            if (rc != MA_SUCCESS) {
+                return rc;
+            }
+        }
+
+        // a short read means there's nothing left to decode
+        if (drained || framesRead < framesPerChunk) {
+            return MA_SUCCESS;
+        }
+    }
+}
+
+// converts audio file to signed 16-bit 16000hz wav
+//
+// The input `fname` is decoded with miniaudio, resampled to
+// COMMON_SAMPLE_RATE with one channel (or two when `stereo` is set), and
+// written to a randomly named "whisperfile.*.wav" in the temp directory.
+//
+// Returns the path of the new wav file, or an empty string on failure (a
+// diagnostic is printed to stderr in that case). On success the path is
+// stored in `delete_me` so that on_exit() unlinks it when the process exits;
+// only the most recently converted path is remembered there.
+static std::string convert_audio_file(const std::string & fname, bool stereo) {
+
+    // create temporary filename
+    std::string newpath;
+    newpath = __get_tmpdir();
+    newpath += "/whisperfile.";
+    newpath += std::to_string(_rand64());
+    newpath += ".wav";
+
+    // create decoder
+    ma_decoder_config decoderConfig =
+            ma_decoder_config_init(ma_format_s16, stereo ? 2 : 1, COMMON_SAMPLE_RATE);
+    decoderConfig.resampling.algorithm = ma_resample_algorithm_linear;
+    decoderConfig.resampling.linear.lpfOrder = 8;
+
+    // open input file
+    ma_decoder decoder;
+    ma_result rc = ma_decoder_init_file(fname.c_str(), &decoderConfig, &decoder);
+    if (rc != MA_SUCCESS) {
+        fprintf(stderr, "%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n",
+                fname.c_str(), ma_result_description(rc));
+        return "";
+    }
+
+    // create encoder
+    ma_encoder encoder;
+    ma_encoder_config encoderConfig = ma_encoder_config_init(
+        ma_encoding_format_wav,
+        decoder.outputFormat,
+        decoder.outputChannels,
+        decoder.outputSampleRate);
+    rc = ma_encoder_init_file(newpath.c_str(), &encoderConfig, &encoder);
+    if (rc != MA_SUCCESS) {
+        ma_decoder_uninit(&decoder);
+        fprintf(stderr, "%s: failed to open output file: %s\n",
+                newpath.c_str(), ma_result_description(rc));
+        return "";
+    }
+
+    // perform the conversion
+    rc = perform_audio_conversion(&decoder, &encoder);
+    ma_encoder_uninit(&encoder);
+    ma_decoder_uninit(&decoder);
+    if (rc != MA_SUCCESS) {
+        fprintf(stderr, "%s: failed to convert audio file: %s\n",
+                fname.c_str(), ma_result_description(rc));
+        // the encoder already created the output file; don't leave a
+        // partial wav behind in the temp directory
+        unlink(newpath.c_str());
+        return "";
+    }
+
+    // remember the new file so it gets removed at exit. the hook is only
+    // registered once, since it always reads the current delete_me and the
+    // number of atexit() registrations a process may make is limited
+    delete_me = newpath;
+    static bool cleanup_registered = false;
+    if (!cleanup_registered) {
+        cleanup_registered = atexit(on_exit) == 0;
+    }
+
+    // return new path
+    return newpath;
+}
+
+// Fallback for when the input can't be consumed as-is.
+//
+// Must expand in a scope that provides `fname`, `stereo`, `did_conversion`,
+// and a `TryAgain` label. On first use it converts `fname` with
+// convert_audio_file(), points `fname` at the converted file, sets
+// `did_conversion`, and jumps back to `TryAgain`. If a conversion was
+// already performed, or the conversion itself fails, it makes the enclosing
+// function return false instead, so at most one conversion is attempted.
+#define TRY_CONVERSION                                                  \
+    do {                                                                \
+        if (did_conversion) {                                           \
+            fprintf(stderr, "error: failed to open audio file\n");      \
+            return false;                                               \
+        }                                                               \
+        std::string fname2 = convert_audio_file(fname, stereo);         \
+        if (fname2.empty()) {                                           \
+            return false;                                               \
+        }                                                               \
+        fname = fname2;                                                 \
+        did_conversion = true;                                          \
+        goto TryAgain;                                                  \
+    } while (0)
+
+bool read_wav(const std::string & fname_, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
     drwav wav;
     std::vector<uint8_t> wav_data; // used for pipe input from stdin
+    std::string fname = fname_;
+    bool did_conversion = false;
 
+TryAgain:
     if (fname == "-") {
         {
             #ifdef _WIN32
@@ -68,32 +180,38 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
         }
     }
     else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
-        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
+        tinylogf("%s: converting to wav...\n", fname.c_str());
+        TRY_CONVERSION;
+    }
+
+    if (stereo && wav.channels < 2) {
+        fprintf(stderr, "%s: audio file must be stereo for diarization\n", fname.c_str());
+        drwav_uninit(&wav);
         return false;
     }
 
     if (wav.channels != 1 && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
+        tinylogf("%s: audio file has %d channels\n", fname.c_str(), wav.channels);
         drwav_uninit(&wav);
-        return false;
+        TRY_CONVERSION;
     }
 
     if (stereo && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
+        tinylogf("%s: audio file has %d channels (we want diarization)\n", fname.c_str(), wav.channels);
         drwav_uninit(&wav);
-        return false;
+        TRY_CONVERSION;
     }
 
     if (wav.sampleRate != COMMON_SAMPLE_RATE) {
-        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
+        tinylogf("%s: audio file has %d sample rate\n", fname.c_str(), wav.sampleRate);
         drwav_uninit(&wav);
-        return false;
+        TRY_CONVERSION;
     }
 
     if (wav.bitsPerSample != 16) {
-        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
+        tinylogf("%s: audio file has %d bits per sample\n", fname.c_str(), wav.bitsPerSample);
         drwav_uninit(&wav);
-        return false;
+        TRY_CONVERSION;
     }
 
     const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
@@ -171,7 +289,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
     energy_last /= n_samples_last;
 
     if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
+        tinylogf("%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
     }
 
     if (energy_last > vad_thold*energy_all) {

diff --git a/whisper.cpp/dr_wav.c b/whisper.cpp/dr_wav.c
@@ -0,0 +1,2 @@
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
diff --git a/whisper.cpp/main.1 b/whisper.cpp/main.1
@@ -32,7 +32,10 @@ Puts program in HTTP server mode.
 Path of Whisper model weights. See
 https://huggingface.co/ggerganov/whisper.cpp
 .It Fl f Ar FNAME , Fl Fl file Ar FNAME
-Path of WAV file to transcribe.
+Path of audio file to transcribe. The preferred audio format is a 16khz
+16-bit signed linear WAV file, which can be stereo or mono. It's also
+permissible to pass an MP3, FLAC, or OGG file, in which case it'll be
+converted to a .wav file in your temp directory before transcribing.
 .It Fl tr , Fl Fl translate
 Translate audio into English text.
 .It Fl ot Ar N , Fl Fl offset-t Ar N

diff --git a/whisper.cpp/main.1.asc b/whisper.cpp/main.1.asc
@@ -30,7 +30,11 @@
                face.co/ggerganov/whisper.cpp
 
        [1m-f [4m[22mFNAME[24m, [1m--file [4m[22mFNAME[0m
-               Path of WAV file to transcribe.
+               Path of audio file to transcribe. The preferred audio format is
+               a  16khz  16-bit signed linear WAV file, which can be stereo or
+               mono. It's also permissible to pass an MP3, FLAC, or OGG  file,
+               in  which case it'll be converted to .wav file in your temp di‐
+               rectory before transcribing.
 
        [1m-tr[22m, [1m--translate[0m
                Translate audio into English text.
@@ -47,8 +51,9 @@
        [1m-pc[22m, [1m--print-colors[0m
                Enables CLI printing of ANSI color codes.
 
-               Transcribed  text  will appear in the terminal on a spectrum of
-               color ranging from green to red. Green means the model
+               Transcribed text will appear in the terminal on a  spectrum  of
+               color  ranging  from  green to red. Green represents confidence
+               whereas red represents uncertainty.
 
        [1m-t [4m[22mN[24m, [1m--threads [4m[22mN[0m
                Overrides number of threads to use.

diff --git a/whisper.cpp/miniaudio.c b/whisper.cpp/miniaudio.c
@@ -1,6 +1,7 @@
 #include "stb/stb_vorbis.h"
 
 #define MA_NO_DEVICE_IO
+#define MA_NO_RUNTIME_LINKING
 #define MINIAUDIO_IMPLEMENTATION
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #include "miniaudio.h"