Merge pull request #296 from harmony-one/voice-translate

Voice-translate Added raw demo (voice to synthetic voice)
harmony-one · Sep 27, 2023 · 95ede4a · 95ede4a
2 parents 99c8202 + 421c810
commit 95ede4a
Show file tree

Hide file tree

Showing 9 changed files with 613 additions and 18 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -74,9 +74,11 @@
   },
   "dependencies": {
     "@elastic/elasticsearch": "^8.9.0",
+    "@google-cloud/text-to-speech": "^5.0.1",
     "@grammyjs/auto-chat-action": "^0.1.1",
     "@grammyjs/auto-retry": "^1.1.1",
     "@grammyjs/conversations": "^1.1.2",
+    "@grammyjs/files": "^1.0.4",
     "@grammyjs/menu": "^1.2.1",
     "@grammyjs/ratelimiter": "^1.2.0",
     "@grammyjs/runner": "^2.0.3",
@@ -93,7 +95,7 @@
     "express-async-handler": "^1.2.0",
     "form-data": "^4.0.0",
     "gpt-tokenizer": "^2.1.1",
-    "grammy": "^1.17.1",
+    "grammy": "^1.18.3",
     "jsqr": "^1.4.0",
     "litllm": "^3.0.0",
     "lokijs": "^1.5.12",

diff --git a/src/bot.ts b/src/bot.ts
@@ -49,6 +49,9 @@ import * as Sentry from '@sentry/node'
 import * as Events from 'events'
 import { ProfilingIntegration } from '@sentry/profiling-node'
 import { ES } from './es'
+import { hydrateFiles } from '@grammyjs/files'
+import { VoiceTranslateBot } from './modules/voice-translate'
+import { TextToSpeechBot } from './modules/text-to-speech'
 
 Events.EventEmitter.defaultMaxListeners = 30
 
@@ -61,6 +64,7 @@ const logger = pino({
 })
 
 export const bot = new Bot<BotContext>(config.telegramBotAuthToken)
+bot.api.config.use(hydrateFiles(bot.token))
 bot.api.config.use(autoRetry())
 
 bot.use(
@@ -215,6 +219,8 @@ const translateBot = new TranslateBot()
 const llmsBot = new LlmsBot(payments)
 const documentBot = new DocumentHandler()
 const telegramPayments = new TelegramPayments(payments)
+const voiceTranslateBot = new VoiceTranslateBot(payments)
+const textToSpeechBot = new TextToSpeechBot(payments)
 
 bot.on('message:new_chat_members:me', async (ctx) => {
   try {
@@ -324,9 +330,11 @@ const writeCommandLog = async (
 const PayableBots: Record<string, PayableBotConfig> = {
   qrCodeBot: { bot: qrCodeBot },
   sdImagesBot: { bot: sdImagesBot },
+  voiceTranslate: { bot: voiceTranslateBot },
   voiceMemo: { bot: voiceMemo },
   documentBot: { bot: documentBot },
   translateBot: { bot: translateBot },
+  textToSpeech: { bot: textToSpeechBot },
   openAiBot: {
     enabled: (ctx: OnMessageContext) => ctx.session.openAi.imageGen.isEnabled,
     bot: openAiBot
@@ -662,6 +670,6 @@ async function bootstrap (): Promise<void> {
 }
 
 bootstrap().catch((error) => {
-  console.error(`bot bootstrap error ${error}`)
+  logger.error(`bot bootstrap error ${error}`)
   process.exit(1)
 })
diff --git a/src/config.ts b/src/config.ts
@@ -142,6 +142,7 @@ export default {
       'https://api.thegraph.com/subgraphs/name/nick8319/uniswap-v3-harmony'
   },
   walletConnect: { projectId: process.env.WALLET_CONNECT_PROJECT_ID ?? '' },
+  voiceTranslate: { isEnabled: Boolean(parseInt(process.env.BOT_VOICE_TRANSLATE_ENABLE ?? '0')) },
   db: { url: process.env.DATABASE_URL ?? '' },
   credits: {
     maxChats: 3,
@@ -159,5 +160,6 @@ export default {
     password: process.env.ES_PASSWORD ?? '',
     index: process.env.ES_INDEX
   },
-  deepL: { apikey: process.env.DEEPL_API_KEY ?? '' }
+  deepL: { apikey: process.env.DEEPL_API_KEY ?? '' },
+  gc: { credentials: process.env.GC_CREDENTIALS ?? '' }
 }
diff --git a/src/google-cloud/gcTextToSpeechClient.ts b/src/google-cloud/gcTextToSpeechClient.ts
@@ -0,0 +1,26 @@
+import GcTextToSpeech, { type TextToSpeechClient } from '@google-cloud/text-to-speech'
+import config from '../config'
+import type { CredentialBody } from 'google-auth-library/build/src/auth/credentials'
+
+/**
+ * Thin wrapper around the Google Cloud Text-to-Speech client that turns a piece
+ * of text into OGG/Opus audio spoken by an en-US male voice.
+ */
+class GcTextToSpeechClient {
+  private readonly _client: TextToSpeechClient
+
+  /**
+   * @param credentials - credentials object handed straight to the Google client
+   */
+  constructor (credentials: CredentialBody) {
+    this._client = new GcTextToSpeech.TextToSpeechClient({ credentials })
+  }
+
+  /**
+   * Synthesizes speech for `text`.
+   *
+   * @param text - plain text to speak; it is treated as literal text, not as SSML markup
+   * @returns the `audioContent` field of the synthesis response (may be absent)
+   * @throws whatever `synthesizeSpeech` rejects with
+   */
+  async textToSpeech (text: string): Promise<string | Uint8Array | null | undefined> {
+    // The text is embedded in an SSML document, so XML-reserved characters have to
+    // be escaped; otherwise input such as "Tom & Jerry" or "a < b" yields malformed SSML.
+    const ssml = `<speak>${GcTextToSpeechClient.escapeSsml(text)}</speak>`
+
+    const [response] = await this._client.synthesizeSpeech({
+      input: { ssml },
+      voice: { languageCode: 'en-US', ssmlGender: 'MALE' },
+      audioConfig: { audioEncoding: 'OGG_OPUS' }
+    })
+
+    return response.audioContent
+  }
+
+  /** Replaces the five XML-reserved characters with their entity references. */
+  private static escapeSsml (raw: string): string {
+    // `&` must be handled first so the entities produced below are not re-escaped.
+    return raw
+      .replace(/&/g, '&amp;')
+      .replace(/</g, '&lt;')
+      .replace(/>/g, '&gt;')
+      .replace(/"/g, '&quot;')
+      .replace(/'/g, '&apos;')
+  }
+}
+
+/**
+ * Decodes the base64-encoded JSON credentials stored in `config.gc.credentials`.
+ *
+ * The config value defaults to an empty string when GC_CREDENTIALS is not set.
+ * `JSON.parse('')` throws, which would abort the import of this module (and of
+ * everything importing it), so an empty value yields an empty credentials object.
+ * NOTE(review): with empty credentials the Google client is expected to reject
+ * at request time rather than at startup — confirm against the auth library.
+ *
+ * @throws SyntaxError when a non-empty value does not decode to valid JSON
+ */
+function loadGcCredentials (): CredentialBody {
+  const encoded = config.gc.credentials.trim()
+
+  if (encoded === '') {
+    return {}
+  }
+
+  return JSON.parse(Buffer.from(encoded, 'base64').toString('utf-8'))
+}
+
+// The exported identifier is kept as-is (including its spelling) because other modules import it.
+export const gcTextToSpeedClient = new GcTextToSpeechClient(loadGcCredentials())
diff --git a/src/modules/open-ai/api/openAi.ts b/src/modules/open-ai/api/openAi.ts
@@ -17,6 +17,7 @@ import {
   type DalleGPTModel,
   DalleGPTModels
 } from '../types'
+import type fs from 'fs'
 
 const openai = new OpenAI({ apiKey: config.openAiKey })
 
@@ -245,3 +246,12 @@ export function getGrammy429Error (): GrammyError {
     { parameters: { retry_after: 33 } }
   )
 }
+
+/**
+ * Sends an audio stream to the OpenAI transcription endpoint using the
+ * `whisper-1` model.
+ *
+ * @param readStream - readable stream of the audio file to transcribe
+ * @returns the `text` field of the transcription response
+ */
+export async function speechToText (readStream: fs.ReadStream): Promise<string> {
+  const { text } = await openai.audio.transcriptions.create({
+    model: 'whisper-1',
+    file: readStream
+  })
+
+  return text
+}
diff --git a/src/modules/text-to-speech/index.ts b/src/modules/text-to-speech/index.ts
@@ -0,0 +1,72 @@
+import pino from 'pino'
+import { InputFile } from 'grammy'
+import type { Logger } from 'pino'
+import type { BotPayments } from '../payment'
+import type { OnMessageContext, PayableBot } from '../types'
+import { gcTextToSpeedClient } from '../../google-cloud/gcTextToSpeechClient'
+
+/**
+ * Payable bot module for the `/voice` command: converts the command text (or the
+ * text of the replied-to message) into a voice message using the shared
+ * Google Cloud text-to-speech client.
+ */
+export class TextToSpeechBot implements PayableBot {
+  private readonly payments: BotPayments
+
+  private readonly logger: Logger
+
+  constructor (payments: BotPayments) {
+    this.payments = payments
+    this.logger = pino({
+      name: 'TextToSpeech',
+      transport: {
+        target: 'pino-pretty',
+        options: { colorize: true }
+      }
+    })
+  }
+
+  /** Returns true only for messages that carry the `/voice` command. */
+  public isSupportedEvent (ctx: OnMessageContext): boolean {
+    return ctx.hasCommand('voice')
+  }
+
+  /**
+   * Estimates the price as 0.005 per character of the text that would be spoken.
+   * The unit of the returned number is defined by the caller and is not visible here.
+   */
+  public getEstimatedPrice (ctx: OnMessageContext): number {
+    const str = this.getTextFromMessage(ctx)
+    return str.length * 0.005
+  }
+
+  /**
+   * Picks the text to synthesize: the command argument when it is non-empty,
+   * otherwise the text of the message being replied to, otherwise `''`.
+   */
+  public getTextFromMessage (ctx: OnMessageContext): string {
+    if (ctx.match?.toString()) {
+      return ctx.match.toString()
+    }
+
+    return ctx.message.reply_to_message?.text ?? ''
+  }
+
+  /** Entry point used by the dispatcher; ignores anything that is not `/voice`. */
+  public async onEvent (ctx: OnMessageContext): Promise<void> {
+    if (ctx.hasCommand('voice')) {
+      const text = this.getTextFromMessage(ctx)
+      await this.onTextToSpeech(ctx, text)
+    }
+  }
+
+  /**
+   * Synthesizes `message` and replies with it as a voice message, showing a
+   * temporary "Generating..." message while the synthesis is running.
+   *
+   * @param ctx - the message context the reply is sent to
+   * @param message - text to speak; an empty string only triggers a usage hint
+   * @throws Error when the context has no chat id, and rethrows synthesis failures
+   */
+  public async onTextToSpeech (ctx: OnMessageContext, message: string): Promise<void> {
+    if (!message) {
+      await ctx.reply('/voice command should contain text.')
+      return
+    }
+
+    if (!ctx.chat?.id) {
+      throw new Error('Internal error')
+    }
+
+    const chatId = ctx.chat.id
+    const failureText = 'An error occurred during the process of generating the message.'
+    const progressMessage = await ctx.reply('Generating...')
+
+    let voiceResult: string | Uint8Array | null | undefined
+
+    try {
+      voiceResult = await gcTextToSpeedClient.textToSpeech(message)
+    } catch (error) {
+      // Without this the "Generating..." message would stay in the chat forever
+      // when the synthesis request rejects. Updating it is best-effort so that the
+      // original failure is the one that reaches the caller.
+      try {
+        await ctx.api.editMessageText(chatId, progressMessage.message_id, failureText)
+      } catch (editError) {
+        this.logger.warn(`failed to update progress message: ${String(editError)}`)
+      }
+      throw error
+    }
+
+    if (!voiceResult) {
+      await ctx.api.editMessageText(chatId, progressMessage.message_id, failureText)
+      return
+    }
+
+    const inputFile = new InputFile(voiceResult)
+
+    await ctx.api.deleteMessage(chatId, progressMessage.message_id)
+    await ctx.replyWithVoice(inputFile)
+  }
+}
diff --git a/src/modules/types.ts b/src/modules/types.ts
@@ -11,6 +11,7 @@ import {
 import { type AutoChatActionFlavor } from '@grammyjs/auto-chat-action'
 import { type ParseMode } from 'grammy/types'
 import { type InlineKeyboardMarkup } from 'grammy/out/types'
+import type { FileFlavor } from '@grammyjs/files'
 
 export interface ImageGenSessionData {
   numImages: number
@@ -99,10 +100,10 @@ export interface BotSessionData {
   analytics: Analytics
 }
 
-export type BotContext = Context &
+export type BotContext = FileFlavor<Context &
 SessionFlavor<BotSessionData> &
 ConversationFlavor &
-AutoChatActionFlavor
+AutoChatActionFlavor>
 
 export type CustomContext<Q extends FilterQuery> = Filter<BotContext, Q>
 export type OnMessageContext = CustomContext<'message'>

diff --git a/src/modules/voice-translate/index.ts b/src/modules/voice-translate/index.ts
@@ -0,0 +1,87 @@
+import fs from 'fs'
+import pino from 'pino'
+import { InputFile } from 'grammy'
+import type { Logger } from 'pino'
+import { gcTextToSpeedClient } from '../../google-cloud/gcTextToSpeechClient'
+import type { BotPayments } from '../payment'
+import { speechToText } from '../open-ai/api/openAi'
+import type { OnMessageContext, PayableBot } from '../types'
+import config from '../../config'
+import { translator } from '../translate/deeplClient'
+
+/**
+ * Payable bot module that reacts to voice/audio messages: the attachment is
+ * transcribed, the transcript is translated to en-US, and the translation is
+ * sent back as a synthesized voice message.
+ */
+export class VoiceTranslateBot implements PayableBot {
+  private readonly payments: BotPayments
+
+  private readonly logger: Logger
+
+  constructor (payments: BotPayments) {
+    this.payments = payments
+    this.logger = pino({
+      name: 'VoiceTranslate',
+      transport: {
+        target: 'pino-pretty',
+        options: { colorize: true }
+      }
+    })
+  }
+
+  /**
+   * Returns true for messages with a voice or audio attachment, provided the
+   * feature is switched on in the configuration.
+   */
+  public isSupportedEvent (ctx: OnMessageContext): boolean {
+    const { voice, audio } = ctx.update.message
+
+    if (!config.voiceTranslate.isEnabled) {
+      return false
+    }
+
+    return (!!voice || !!audio)
+  }
+
+  /**
+   * Estimates the price as 0.005 per second of the attachment's duration
+   * (0 when no duration is available).
+   */
+  public getEstimatedPrice (ctx: OnMessageContext): number {
+    const { voice, audio } = ctx.update.message
+    const seconds = (voice?.duration ?? audio?.duration) ?? 0
+    return seconds * 0.005
+  }
+
+  /**
+   * Runs the transcribe -> translate -> synthesize pipeline for a voice/audio
+   * message and replies with the resulting voice message.
+   *
+   * @throws Error when the context has no chat id; failures of the download,
+   *   transcription, translation or synthesis steps propagate to the caller
+   */
+  public async onEvent (ctx: OnMessageContext): Promise<void> {
+    const { voice, audio } = ctx.update.message
+
+    if (!voice && !audio) {
+      return
+    }
+
+    // Validate before sending anything so a failure here leaves no stray message behind.
+    if (!ctx.chat?.id) {
+      throw Error('chat id is undefined')
+    }
+
+    const chatId = ctx.chat.id
+    const progressMessage = await ctx.reply('Generating...')
+
+    try {
+      const resultText = await this.transcribeAttachment(ctx)
+
+      const translateResult = await translator.translateText(resultText, null, 'en-US')
+
+      const voiceResult = await gcTextToSpeedClient.textToSpeech(translateResult.text)
+
+      if (!voiceResult) {
+        await ctx.reply('voice generation error')
+        return
+      }
+
+      await ctx.replyWithVoice(new InputFile(voiceResult))
+    } finally {
+      // Runs on success, on the error reply above and on exceptions, so the
+      // "Generating..." message never outlives the request.
+      await this.removeProgressMessage(ctx, chatId, progressMessage.message_id)
+    }
+  }
+
+  /**
+   * Downloads the message attachment to a local file, transcribes it and always
+   * removes the local file afterwards, even when a step fails.
+   */
+  private async transcribeAttachment (ctx: OnMessageContext): Promise<string> {
+    const file = await ctx.getFile()
+    const downloadedPath = await file.download()
+
+    // The downloaded file is renamed to carry the attachment's extension before upload.
+    // NOTE(review): presumably the transcription API infers the audio format from the file name — confirm.
+    const filename = `${downloadedPath}.${this.getExtension(file.file_path)}`
+    let currentPath = downloadedPath
+
+    try {
+      await fs.promises.rename(downloadedPath, filename)
+      currentPath = filename
+
+      return await speechToText(fs.createReadStream(filename))
+    } finally {
+      // `force` keeps cleanup from throwing when the file is already gone.
+      await fs.promises.rm(currentPath, { force: true })
+    }
+  }
+
+  /**
+   * Extracts the extension from the last segment of `filePath`; falls back to
+   * 'ogg' when the path is missing or its last segment has no usable extension.
+   */
+  private getExtension (filePath: string | undefined): string {
+    const fallback = 'ogg'
+
+    if (!filePath) {
+      return fallback
+    }
+
+    // Only the last path segment is inspected so a dot in a directory name is ignored.
+    const baseName = filePath.split('/').pop() ?? ''
+    const dotIndex = baseName.lastIndexOf('.')
+    const hasExtension = dotIndex >= 0 && dotIndex < baseName.length - 1
+
+    return hasExtension ? baseName.slice(dotIndex + 1) : fallback
+  }
+
+  /** Best-effort removal of the progress message; a failure is logged, not thrown. */
+  private async removeProgressMessage (ctx: OnMessageContext, chatId: number, messageId: number): Promise<void> {
+    try {
+      await ctx.api.deleteMessage(chatId, messageId)
+    } catch (error) {
+      this.logger.warn(`failed to delete progress message: ${String(error)}`)
+    }
+  }
+}