From 3becc50ef98a4c45ea15a3a852184abcf6540faf Mon Sep 17 00:00:00 2001
From: srhinos <6531393+srhinos@users.noreply.github.com>
Date: Wed, 5 Jun 2024 10:50:24 -0400
Subject: [PATCH] Upsample ElevenLabs 44.1 kHz PCM output to 48 kHz

---
 .../synthesizer/eleven_labs_synthesizer.py      |  4 +++-
 .../eleven_labs_websocket_synthesizer.py        | 17 ++++++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py
index 389a4cc48..350aaee86 100644
--- a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py
+++ b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py
@@ -38,6 +38,7 @@ def __init__(
         self.optimize_streaming_latency = synthesizer_config.optimize_streaming_latency
         self.words_per_minute = 150
         self.upsample = False
+        self.sample_rate = self.synthesizer_config.sampling_rate
 
         if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
             match self.synthesizer_config.sampling_rate:
@@ -52,6 +53,7 @@ def __init__(
                 case SamplingRate.RATE_48000:
                     self.output_format = "pcm_44100"
                     self.upsample = SamplingRate.RATE_48000.value
+                    self.sample_rate = SamplingRate.RATE_44100.value
                 case _:
                     raise ValueError(
                         f"Unsupported sampling rate: {self.synthesizer_config.sampling_rate}. Elevenlabs only supports 16000, 22050, 24000, and 44100 Hz."
@@ -148,7 +150,7 @@ async def get_chunks(
                 if self.upsample:
                     chunk = self._resample_chunk(
                         chunk,
-                        self.synthesizer_config.sampling_rate,
+                        self.sample_rate,
                         self.upsample,
                     )
                 chunk_queue.put_nowait(chunk)
diff --git a/vocode/streaming/synthesizer/eleven_labs_websocket_synthesizer.py b/vocode/streaming/synthesizer/eleven_labs_websocket_synthesizer.py
index 0f0679713..22c061e34 100644
--- a/vocode/streaming/synthesizer/eleven_labs_websocket_synthesizer.py
+++ b/vocode/streaming/synthesizer/eleven_labs_websocket_synthesizer.py
@@ -108,6 +108,8 @@ def __init__(
             "writer": None,
         }
         self.end_of_turn = False
+        self.upsample = False
+        self.sample_rate = self.synthesizer_config.sampling_rate
 
         # While this looks useless, we need to assign the response of `asyncio.gather`
         # to *something* or we risk garbage collection of the running coroutines spawned
@@ -124,6 +126,10 @@ def __init__(
                     self.output_format = "pcm_24000"
                 case SamplingRate.RATE_44100:
                     self.output_format = "pcm_44100"
+                case SamplingRate.RATE_48000:
+                    self.output_format = "pcm_44100"
+                    self.upsample = SamplingRate.RATE_48000.value
+                    self.sample_rate = SamplingRate.RATE_44100.value
                 case _:
                     raise ValueError(
                         f"Unsupported sampling rate: {self.synthesizer_config.sampling_rate}. Elevenlabs only supports 16000, 22050, 24000, and 44100 Hz."
@@ -212,12 +218,21 @@ async def listen() -> None:
                     message = await ws.recv()
                     if "audio" not in message:
                         continue
-                    response = ElevenLabsWebsocketResponse.parse_raw(message)
+                    response = ElevenLabsWebsocketResponse.model_validate_json(message)
                     if response.audio:
                         decoded = base64.b64decode(response.audio)
                         seconds = len(decoded) / (
                             self.sample_width * self.synthesizer_config.sampling_rate
                         )
+
+                        if self.upsample:
+                            decoded = self._resample_chunk(
+                                decoded,
+                                self.sample_rate,
+                                self.upsample,
+                            )
+                            seconds = len(decoded) / (self.sample_width * self.sample_rate)
+
                         if response.alignment:
                             utterance_chunk = "".join(response.alignment.chars) + " "
                             self.current_turn_utterances_by_chunk.append((utterance_chunk, seconds))