Merge pull request #332 from harmony-one/v11-voices

V11 voices
harmony-one · Oct 5, 2023 · ba563a2
2 parents 8348cf7 + 188856c
commit ba563a2
Show file tree

Hide file tree

Showing 10 changed files with 6,393 additions and 55 deletions.
diff --git a/src/config.ts b/src/config.ts
@@ -162,5 +162,6 @@ export default {
     index: process.env.ES_INDEX
   },
   deepL: { apikey: process.env.DEEPL_API_KEY ?? '' },
-  gc: { credentials: process.env.GC_CREDENTIALS ?? '' }
+  gc: { credentials: process.env.GC_CREDENTIALS ?? '' },
+  elevenlabs: { apiKey: process.env.ELEVENLABS_API_KEY ?? '' }
 }
diff --git a/src/elevenlabs/elevenlabsClient.ts b/src/elevenlabs/elevenlabsClient.ts
@@ -1,5 +1,37 @@
 import axios, { type AxiosInstance } from 'axios'
 
+interface Voice {
+  'voice_id': string
+  'name': string
+  'samples': null
+  'category': 'premade'
+  'fine_tuning': {
+    'language': null
+    'is_allowed_to_fine_tune': boolean
+    'fine_tuning_requested': boolean
+    'finetuning_state': 'not_started'
+    'verification_attempts': null
+    'verification_failures': []
+    'verification_attempts_count': 0
+    'slice_ids': null
+    'manual_verification': null
+    'manual_verification_requested': false
+  }
+  'labels': {
+    'accent': 'american' | string
+    'description': 'strong' | string
+    'age': 'young' | string
+    'gender': 'female' | string
+    'use case': 'narration' | string
+  }
+  'description': null
+  'preview_url': string
+  'available_for_tiers': []
+  'settings': null
+  'sharing': null
+  'high_quality_base_model_ids': []
+}
+
 export class ElevenlabsClient {
   private readonly _token: string
   private readonly _httpClient: AxiosInstance
@@ -17,13 +49,20 @@ export class ElevenlabsClient {
   }
 
   public async textToSpeech ({ text, voiceId }: { text: string, voiceId: string }): Promise<string | Uint8Array | null | undefined> {
-    return await this._httpClient.post(`/v1/text-to-speech/${voiceId}`, {
-      text: 'string',
-      model_id: 'eleven_monolingual_v1',
+    const response = await this._httpClient.post(`/v1/text-to-speech/${voiceId}`, {
+      text,
+      model_id: 'eleven_multilingual_v2',
       voice_settings: {
         stability: 0.5,
         similarity_boost: 0.5
       }
-    })
+    }, { responseType: 'arraybuffer' })
+
+    return Buffer.from(response.data, 'binary')
+  }
+
+  public async voiceList (): Promise<Voice[]> {
+    const response = await this._httpClient.get<{ voices: Voice[] }>('/v1/voices')
+    return response.data.voices
   }
 }
diff --git a/src/elevenlabs/sandbox.ts b/src/elevenlabs/sandbox.ts
@@ -0,0 +1,19 @@
+import { ElevenlabsClient } from './elevenlabsClient'
+import config from '../config'
+
+function labelsToString (labels: Record<string, string>): string {
+  return Object.entries(labels).reduce((acc, item) => {
+    return acc + item.join(': ') + '; '
+  }, '')
+}
+
+async function main (): Promise<void> {
+  const client = new ElevenlabsClient(config.elevenlabs.apiKey)
+  const voiceList = await client.voiceList()
+
+  for (const voice of voiceList) {
+    console.log(voice.voice_id, voice.name, '\t', labelsToString(voice.labels))
+  }
+}
+
+main().then(() => { console.log('### finish') }).catch(console.log)
diff --git a/src/elevenlabs/test.ts b/src/elevenlabs/test.ts
diff --git a/src/google-cloud/gcTextToSpeechClient.ts b/src/google-cloud/gcTextToSpeechClient.ts
@@ -1,12 +1,13 @@
 import GcTextToSpeech, { type TextToSpeechClient } from '@google-cloud/text-to-speech'
 import config from '../config'
 import type { CredentialBody } from 'google-auth-library/build/src/auth/credentials'
+import type { google } from '@google-cloud/text-to-speech/build/protos/protos'
 
 export interface TextToSpeechParams {
   text: string
   languageCode: string
-  ssmlGender?: 'MALE' | 'FEMALE'
-  voiceName?: string
+  ssmlGender?: google.cloud.texttospeech.v1.SsmlVoiceGender | keyof typeof google.cloud.texttospeech.v1.SsmlVoiceGender | null
+  voiceName?: string | null
 }
 
 class GcTextToSpeechClient {
@@ -35,6 +36,11 @@ class GcTextToSpeechClient {
 
     return response.audioContent
   }
+
+  async listVoices (): Promise<google.cloud.texttospeech.v1.IVoice[] | null | undefined> {
+    const response = await this._client.listVoices()
+    return response[0].voices
+  }
 }
 
 const credentials = JSON.parse(Buffer.from(config.gc.credentials, 'base64').toString('utf-8'))