Skip to content

Commit

Permalink
Merge pull request #296 from harmony-one/voice-translate
Browse files Browse the repository at this point in the history
Voice-translate Added raw demo (voice to synthetic voice)
  • Loading branch information
theofandrich authored Sep 27, 2023
2 parents 99c8202 + 421c810 commit 95ede4a
Show file tree
Hide file tree
Showing 9 changed files with 613 additions and 18 deletions.
413 changes: 400 additions & 13 deletions package-lock.json

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,11 @@
},
"dependencies": {
"@elastic/elasticsearch": "^8.9.0",
"@google-cloud/text-to-speech": "^5.0.1",
"@grammyjs/auto-chat-action": "^0.1.1",
"@grammyjs/auto-retry": "^1.1.1",
"@grammyjs/conversations": "^1.1.2",
"@grammyjs/files": "^1.0.4",
"@grammyjs/menu": "^1.2.1",
"@grammyjs/ratelimiter": "^1.2.0",
"@grammyjs/runner": "^2.0.3",
Expand All @@ -93,7 +95,7 @@
"express-async-handler": "^1.2.0",
"form-data": "^4.0.0",
"gpt-tokenizer": "^2.1.1",
"grammy": "^1.17.1",
"grammy": "^1.18.3",
"jsqr": "^1.4.0",
"litllm": "^3.0.0",
"lokijs": "^1.5.12",
Expand Down
10 changes: 9 additions & 1 deletion src/bot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ import * as Sentry from '@sentry/node'
import * as Events from 'events'
import { ProfilingIntegration } from '@sentry/profiling-node'
import { ES } from './es'
import { hydrateFiles } from '@grammyjs/files'
import { VoiceTranslateBot } from './modules/voice-translate'
import { TextToSpeechBot } from './modules/text-to-speech'

Events.EventEmitter.defaultMaxListeners = 30

Expand All @@ -61,6 +64,7 @@ const logger = pino({
})

export const bot = new Bot<BotContext>(config.telegramBotAuthToken)
bot.api.config.use(hydrateFiles(bot.token))
bot.api.config.use(autoRetry())

bot.use(
Expand Down Expand Up @@ -215,6 +219,8 @@ const translateBot = new TranslateBot()
const llmsBot = new LlmsBot(payments)
const documentBot = new DocumentHandler()
const telegramPayments = new TelegramPayments(payments)
const voiceTranslateBot = new VoiceTranslateBot(payments)
const textToSpeechBot = new TextToSpeechBot(payments)

bot.on('message:new_chat_members:me', async (ctx) => {
try {
Expand Down Expand Up @@ -324,9 +330,11 @@ const writeCommandLog = async (
const PayableBots: Record<string, PayableBotConfig> = {
qrCodeBot: { bot: qrCodeBot },
sdImagesBot: { bot: sdImagesBot },
voiceTranslate: { bot: voiceTranslateBot },
voiceMemo: { bot: voiceMemo },
documentBot: { bot: documentBot },
translateBot: { bot: translateBot },
textToSpeech: { bot: textToSpeechBot },
openAiBot: {
enabled: (ctx: OnMessageContext) => ctx.session.openAi.imageGen.isEnabled,
bot: openAiBot
Expand Down Expand Up @@ -662,6 +670,6 @@ async function bootstrap (): Promise<void> {
}

bootstrap().catch((error) => {
console.error(`bot bootstrap error ${error}`)
logger.error(`bot bootstrap error ${error}`)
process.exit(1)
})
4 changes: 3 additions & 1 deletion src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ export default {
'https://api.thegraph.com/subgraphs/name/nick8319/uniswap-v3-harmony'
},
walletConnect: { projectId: process.env.WALLET_CONNECT_PROJECT_ID ?? '' },
voiceTranslate: { isEnabled: Boolean(parseInt(process.env.BOT_VOICE_TRANSLATE_ENABLE ?? '0')) },
db: { url: process.env.DATABASE_URL ?? '' },
credits: {
maxChats: 3,
Expand All @@ -159,5 +160,6 @@ export default {
password: process.env.ES_PASSWORD ?? '',
index: process.env.ES_INDEX
},
deepL: { apikey: process.env.DEEPL_API_KEY ?? '' }
deepL: { apikey: process.env.DEEPL_API_KEY ?? '' },
gc: { credentials: process.env.GC_CREDENTIALS ?? '' }
}
26 changes: 26 additions & 0 deletions src/google-cloud/gcTextToSpeechClient.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import GcTextToSpeech, { type TextToSpeechClient } from '@google-cloud/text-to-speech'
import config from '../config'
import type { CredentialBody } from 'google-auth-library/build/src/auth/credentials'

/**
 * Escapes the XML-reserved characters (`&`, `<`, `>`, `"`, `'`) so that
 * arbitrary text can be embedded inside an SSML `<speak>` element without
 * being interpreted as markup or producing a malformed document.
 *
 * `&` is replaced first so the entities produced by the later replacements
 * are not escaped a second time.
 */
function escapeSsmlText (text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;')
}

/**
 * Small wrapper around the Google Cloud `TextToSpeechClient` that turns plain
 * text into OGG/Opus audio using a fixed en-US male voice.
 */
class GcTextToSpeechClient {
  private readonly _client: TextToSpeechClient

  /**
   * @param credentials - Credentials object handed to the underlying
   *   `TextToSpeechClient`.
   */
  constructor (credentials: CredentialBody) {
    this._client = new GcTextToSpeech.TextToSpeechClient({ credentials })
  }

  /**
   * Synthesizes speech for the given text.
   *
   * The text is treated as plain text: it is XML-escaped before being wrapped
   * in `<speak>`, so user input containing `&` or `<` no longer breaks the
   * SSML document sent to the service.
   *
   * @param text - Plain text to read aloud.
   * @returns The `audioContent` field of the synthesis response.
   */
  async textToSpeech (text: string): Promise<string | Uint8Array | null | undefined> {
    const ssml = `<speak>${escapeSsmlText(text)}</speak>`

    const [response] = await this._client.synthesizeSpeech({
      input: { ssml },
      voice: { languageCode: 'en-US', ssmlGender: 'MALE' },
      audioConfig: { audioEncoding: 'OGG_OPUS' }
    })

    return response.audioContent
  }
}

// config.gc.credentials holds the credentials as base64-encoded JSON.
// NOTE(review): config falls back to '' when GC_CREDENTIALS is unset, in which
// case JSON.parse below throws while this module is being imported — confirm
// that failing at import time is the intended behaviour.
const decodedCredentials = Buffer.from(config.gc.credentials, 'base64').toString('utf-8')
const credentials = JSON.parse(decodedCredentials)

// Shared client instance used by the bot modules.
export const gcTextToSpeedClient = new GcTextToSpeechClient(credentials)
10 changes: 10 additions & 0 deletions src/modules/open-ai/api/openAi.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
type DalleGPTModel,
DalleGPTModels
} from '../types'
import type fs from 'fs'

const openai = new OpenAI({ apiKey: config.openAiKey })

Expand Down Expand Up @@ -245,3 +246,12 @@ export function getGrammy429Error (): GrammyError {
{ parameters: { retry_after: 33 } }
)
}

/**
 * Transcribes an audio file with the `whisper-1` model.
 *
 * @param readStream - Read stream of the audio file to transcribe.
 * @returns The `text` field of the transcription response.
 */
export async function speechToText (readStream: fs.ReadStream): Promise<string> {
  const { text } = await openai.audio.transcriptions.create({
    model: 'whisper-1',
    file: readStream
  })

  return text
}
72 changes: 72 additions & 0 deletions src/modules/text-to-speech/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import pino from 'pino'
import { InputFile } from 'grammy'
import type { Logger } from 'pino'
import type { BotPayments } from '../payment'
import type { OnMessageContext, PayableBot } from '../types'
import { gcTextToSpeedClient } from '../../google-cloud/gcTextToSpeechClient'

/**
 * Payable bot module for the `/voice` command: it takes text from the command
 * arguments (or from the message being replied to) and answers with a voice
 * message synthesized through `gcTextToSpeedClient`.
 */
export class TextToSpeechBot implements PayableBot {
  private readonly payments: BotPayments

  private readonly logger: Logger

  constructor (payments: BotPayments) {
    this.payments = payments
    this.logger = pino({
      name: 'TextToSpeech',
      transport: {
        target: 'pino-pretty',
        options: { colorize: true }
      }
    })
  }

  /** Returns true when the update is a `/voice` command. */
  public isSupportedEvent (ctx: OnMessageContext): boolean {
    return ctx.hasCommand('voice')
  }

  /** Estimated price: 0.005 multiplied by the length of the text to synthesize. */
  public getEstimatedPrice (ctx: OnMessageContext): number {
    const str = this.getTextFromMessage(ctx)
    return str.length * 0.005
  }

  /**
   * Picks the text to synthesize: the command argument when it is non-empty,
   * otherwise the text of the replied-to message, otherwise an empty string.
   */
  public getTextFromMessage (ctx: OnMessageContext): string {
    if (ctx.match?.toString()) {
      return ctx.match.toString()
    }

    return ctx.message.reply_to_message?.text ?? ''
  }

  /** Entry point: runs text-to-speech when the update is a `/voice` command. */
  public async onEvent (ctx: OnMessageContext): Promise<void> {
    if (ctx.hasCommand('voice')) {
      const text = this.getTextFromMessage(ctx)
      await this.onTextToSpeech(ctx, text)
    }
  }

  /**
   * Synthesizes `message` and replies with the resulting voice message.
   *
   * A "Generating..." progress message is shown while the request runs. It is
   * deleted on success and replaced by an error text when no audio is
   * produced — including when the synthesis call throws, so the chat is never
   * left with a stale progress message.
   *
   * @param ctx - Context of the incoming message.
   * @param message - Text to synthesize; an empty value only produces a usage hint.
   * @throws Error when the chat id is missing; synthesis errors are rethrown
   *   after the progress message has been updated.
   */
  public async onTextToSpeech (ctx: OnMessageContext, message: string): Promise<void> {
    if (!message) {
      await ctx.reply('/voice command should contain text.')
      return
    }

    if (!ctx.chat?.id) {
      throw new Error('Internal error')
    }

    const chatId = ctx.chat.id
    const progressMessage = await ctx.reply('Generating...')

    let voiceResult: string | Uint8Array | null | undefined
    try {
      voiceResult = await gcTextToSpeedClient.textToSpeech(message)
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error)
      this.logger.error(`text-to-speech synthesis failed: ${reason}`)
      await this.showFailure(ctx, chatId, progressMessage.message_id)
      // Rethrow so the caller's own error handling still sees the failure.
      throw error
    }

    if (!voiceResult) {
      await ctx.api.editMessageText(chatId, progressMessage.message_id, 'An error occurred during the process of generating the message.')
      return
    }

    const inputFile = new InputFile(voiceResult)

    await ctx.api.deleteMessage(chatId, progressMessage.message_id)
    await ctx.replyWithVoice(inputFile)
  }

  /**
   * Best-effort replacement of the progress message with the failure text.
   * A failure to edit is only logged so that it cannot mask the original error.
   */
  private async showFailure (ctx: OnMessageContext, chatId: number, messageId: number): Promise<void> {
    try {
      await ctx.api.editMessageText(chatId, messageId, 'An error occurred during the process of generating the message.')
    } catch (editError: unknown) {
      const reason = editError instanceof Error ? editError.message : String(editError)
      this.logger.error(`could not update progress message: ${reason}`)
    }
  }
}
5 changes: 3 additions & 2 deletions src/modules/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
import { type AutoChatActionFlavor } from '@grammyjs/auto-chat-action'
import { type ParseMode } from 'grammy/types'
import { type InlineKeyboardMarkup } from 'grammy/out/types'
import type { FileFlavor } from '@grammyjs/files'

export interface ImageGenSessionData {
numImages: number
Expand Down Expand Up @@ -99,10 +100,10 @@ export interface BotSessionData {
analytics: Analytics
}

export type BotContext = Context &
export type BotContext = FileFlavor<Context &
SessionFlavor<BotSessionData> &
ConversationFlavor &
AutoChatActionFlavor
AutoChatActionFlavor>

export type CustomContext<Q extends FilterQuery> = Filter<BotContext, Q>
export type OnMessageContext = CustomContext<'message'>
Expand Down
87 changes: 87 additions & 0 deletions src/modules/voice-translate/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import fs from 'fs'
import pino from 'pino'
import { InputFile } from 'grammy'
import type { Logger } from 'pino'
import { gcTextToSpeedClient } from '../../google-cloud/gcTextToSpeechClient'
import type { BotPayments } from '../payment'
import { speechToText } from '../open-ai/api/openAi'
import type { OnMessageContext, PayableBot } from '../types'
import config from '../../config'
import { translator } from '../translate/deeplClient'

/**
 * Payable bot module that reacts to voice/audio messages: the audio is
 * transcribed with `speechToText`, translated to en-US with `translator`,
 * synthesized with `gcTextToSpeedClient`, and sent back as a voice message.
 */
export class VoiceTranslateBot implements PayableBot {
  private readonly payments: BotPayments

  private readonly logger: Logger

  constructor (payments: BotPayments) {
    this.payments = payments
    this.logger = pino({
      name: 'VoiceTranslate',
      transport: {
        target: 'pino-pretty',
        options: { colorize: true }
      }
    })
  }

  /**
   * Returns true for messages carrying a voice or audio attachment, but only
   * while `config.voiceTranslate.isEnabled` is set.
   */
  public isSupportedEvent (ctx: OnMessageContext): boolean {
    const { voice, audio } = ctx.update.message

    if (!config.voiceTranslate.isEnabled) {
      return false
    }

    return (!!voice || !!audio)
  }

  /** Estimated price: 0.005 multiplied by the attachment duration (0 when unknown). */
  public getEstimatedPrice (ctx: OnMessageContext): number {
    const { voice, audio } = ctx.update.message
    const seconds = (voice?.duration ?? audio?.duration) ?? 0
    return seconds * 0.005
  }

  /**
   * Runs the transcribe → translate → synthesize pipeline for a voice/audio
   * message and replies with the synthesized voice.
   *
   * The "Generating..." progress message is removed on every exit path:
   * success, empty synthesis result, and thrown errors (which are rethrown
   * after cleanup).
   *
   * @throws Error when the chat id is missing; pipeline errors are rethrown.
   */
  public async onEvent (ctx: OnMessageContext): Promise<void> {
    const { voice, audio } = ctx.update.message

    if (!voice && !audio) {
      return
    }

    // Validate before sending anything, so no progress message is orphaned.
    if (!ctx.chat?.id) {
      throw Error('chat id is undefined')
    }

    const chatId = ctx.chat.id
    const progressMessage = await ctx.reply('Generating...')

    let voiceResult: string | Uint8Array | null | undefined
    try {
      const resultText = await this.transcribe(ctx)
      const translateResult = await translator.translateText(resultText, null, 'en-US')
      voiceResult = await gcTextToSpeedClient.textToSpeech(translateResult.text)
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error)
      this.logger.error(`voice translation failed: ${reason}`)
      await this.removeProgressMessage(ctx, chatId, progressMessage.message_id)
      // Rethrow so the caller's own error handling still sees the failure.
      throw error
    }

    await ctx.api.deleteMessage(chatId, progressMessage.message_id)

    if (!voiceResult) {
      await ctx.reply('voice generation error')
      return
    }

    await ctx.replyWithVoice(new InputFile(voiceResult))
  }

  /**
   * Downloads the message's file, gives it a proper extension, and returns its
   * transcription. The temporary file is always removed, even when renaming
   * or transcription fails.
   */
  private async transcribe (ctx: OnMessageContext): Promise<string> {
    const file = await ctx.getFile()
    const downloadedPath = await file.download()
    // The download is renamed so the path handed to speechToText ends in the
    // audio extension — presumably used downstream to detect the format
    // (TODO confirm).
    const filename = `${downloadedPath}.${this.resolveExtension(file.file_path)}`

    // Tracks where the temp file currently lives so cleanup hits the right path.
    let tempPath = downloadedPath
    let audioStream: fs.ReadStream | undefined
    try {
      await fs.promises.rename(downloadedPath, filename)
      tempPath = filename

      audioStream = fs.createReadStream(filename)
      // Keep a listener attached so a late stream error (e.g. during cleanup)
      // is logged instead of surfacing as an unhandled 'error' event.
      audioStream.on('error', (streamError: Error) => {
        this.logger.error(`audio stream error: ${streamError.message}`)
      })

      return await speechToText(audioStream)
    } finally {
      audioStream?.destroy()
      await fs.promises.rm(tempPath, { force: true })
    }
  }

  /**
   * Extracts the extension from the last path segment of `filePath`.
   * Falls back to `ogg` when the path is missing or its file name has no
   * (non-empty) extension.
   */
  private resolveExtension (filePath: string | undefined): string {
    const fallback = 'ogg'

    if (!filePath) {
      return fallback
    }

    const baseName = filePath.slice(filePath.lastIndexOf('/') + 1)
    const dotIndex = baseName.lastIndexOf('.')

    if (dotIndex < 0 || dotIndex === baseName.length - 1) {
      return fallback
    }

    return baseName.slice(dotIndex + 1)
  }

  /**
   * Best-effort removal of the progress message. A failure to delete is only
   * logged so that it cannot mask the error being handled.
   */
  private async removeProgressMessage (ctx: OnMessageContext, chatId: number, messageId: number): Promise<void> {
    try {
      await ctx.api.deleteMessage(chatId, messageId)
    } catch (deleteError: unknown) {
      const reason = deleteError instanceof Error ? deleteError.message : String(deleteError)
      this.logger.error(`could not delete progress message: ${reason}`)
    }
  }
}

0 comments on commit 95ede4a

Please sign in to comment.