# -*- coding: utf-8 -*-
"""AudioToText CLI
Original file is located at https://colab.research.google.com/github/Carleslc/AudioToText/blob/master/AudioToText.ipynb
# 🗣️ [**AudioToText**](https://github.com/Carleslc/AudioToText)
### 🛠 [Whisper by OpenAI (GitHub)](https://github.com/openai/whisper)
"""
print("AudioToText CLI\n")
import argparse
parser = argparse.ArgumentParser(
description="Transcribe and translate audio to text using Whisper and DeepL.",
epilog=f'web: https://carleslc.me/AudioToText/'
)
parser.add_argument("audio_file", nargs='+', help="source file to transcribe")
parser.add_argument("--task", help="transcribe (default) or translate (to English)",
default="transcribe", choices=["transcribe", "translate"])
parser.add_argument("--model", help="model to use (default: small)",
default="small", choices=["tiny", "base", "small", "medium", "large-v1", "large-v2"])
parser.add_argument("--language", help="source file language (default: Auto-Detect)",
default="Auto-Detect", choices=["Auto-Detect", "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Assamese", "Azerbaijani", "Bashkir", "Basque", "Belarusian", "Bengali", "Bosnian", "Breton", "Bulgarian", "Burmese", "Castilian", "Catalan", "Chinese", "Croatian", "Czech", "Danish", "Dutch", "English", "Estonian", "Faroese", "Finnish", "Flemish", "French", "Galician", "Georgian", "German", "Greek", "Gujarati", "Haitian", "Haitian Creole", "Hausa", "Hawaiian", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Indonesian", "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean", "Lao", "Latin", "Latvian", "Letzeburgesch", "Lingala", "Lithuanian", "Luxembourgish", "Macedonian", "Malagasy", "Malay", "Malayalam", "Maltese", "Maori", "Marathi", "Moldavian", "Moldovan", "Mongolian", "Myanmar", "Nepali", "Norwegian", "Nynorsk", "Occitan", "Panjabi", "Pashto", "Persian", "Polish", "Portuguese", "Punjabi", "Pushto", "Romanian", "Russian", "Sanskrit", "Serbian", "Shona", "Sindhi", "Sinhala", "Sinhalese", "Slovak", "Slovenian", "Somali", "Spanish", "Sundanese", "Swahili", "Swedish", "Tagalog", "Tajik", "Tamil", "Tatar", "Telugu", "Thai", "Tibetan", "Turkish", "Turkmen", "Ukrainian", "Urdu", "Uzbek", "Valencian", "Vietnamese", "Welsh", "Yiddish", "Yoruba"])
parser.add_argument("--prompt", help="provide context about the audio or encourage a specific writing style, see https://platform.openai.com/docs/guides/speech-to-text/prompting")
parser.add_argument("--coherence_preference", help="True (default): More coherence, but may repeat text. False: Less repetitions, but may have less coherence",
default='True', choices=[True, False], type=lambda b: b.lower() != 'false')
parser.add_argument("--api_key", help="if set with your OpenAI API Key (https://platform.openai.com/account/api-keys), the OpenAI API is used, which can improve the inference speed substantially, but it has an associated cost, see API pricing: https://openai.com/pricing#audio-models. API model is large-v2 (ignores --model)")
parser.add_argument("--output_formats", "--output_format", help="desired result formats (default: txt,vtt,srt,tsv,json)",
default="txt,vtt,srt,tsv,json")
parser.add_argument("--output_dir", help="folder to save results (default: audio_transcription)",
default="audio_transcription")
parser.add_argument("--deepl_api_key", help="DeepL API key, if you want to translate results using DeepL. Get a DeepL Developer Account API Key: https://www.deepl.com/pro-api")
parser.add_argument("--deepl_target_language", help="results target language if you want to translate results using DeepL (--deepl_api_key required)",
choices=["Bulgarian", "Chinese", "Chinese (simplified)", "Czech", "Danish", "Dutch", "English", "English (American)", "English (British)", "Estonian", "Finnish", "French", "German", "Greek", "Hungarian", "Indonesian", "Italian", "Japanese", "Korean", "Latvian", "Lithuanian", "Norwegian", "Polish", "Portuguese", "Portuguese (Brazilian)", "Portuguese (European)", "Romanian", "Russian", "Slovak", "Slovenian", "Spanish", "Swedish", "Turkish", "Ukrainian"])
parser.add_argument("--deepl_coherence_preference", help="True (default): Share context between lines while translating. False: Translate each line independently",
default='True', choices=[True, False], type=lambda b: b.lower() != 'false')
parser.add_argument("--deepl_formality", help="whether the translated text should lean towards formal or informal language (languages with formality supported: German,French,Italian,Spanish,Dutch,Polish,Portuguese,Russian)",
default="default", choices=["default", "formal", "informal"])
parser.add_argument("--skip-install", help="skip pip dependencies installation", action="store_true", default=False)
args = parser.parse_args()
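# Example invocations (illustrative only; file names and keys are placeholders):
#   python audiototext.py interview.mp3
#   python audiototext.py interview.mp3 --task translate --model medium --output_formats srt,txt
#   python audiototext.py part1.mp3 part2.mp3 --language Spanish --deepl_api_key YOUR_DEEPL_API_KEY --deepl_target_language English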
"""
## [Step 1] ⚙️ Install the required libraries
"""
import os, subprocess
from sys import platform as sys_platform
status, ffmpeg_version = subprocess.getstatusoutput("ffmpeg -version")
if status != 0:
from platform import platform
if sys_platform == 'linux' and 'ubuntu' in platform().lower():
os.system("apt install ffmpeg")
else:
print("Install ffmpeg: https://ffmpeg.org/download.html")
exit(status)
elif not args.skip_install:
print(ffmpeg_version.split('\n')[0])
if not args.skip_install:
os.system("pip install --user --upgrade pip")
os.system("pip install git+https://github.com/openai/whisper.git@v20231117 openai==1.9.0 numpy scipy deepl pydub cohere ffmpeg-python torch==2.1.0 tensorflow-probability==0.23.0 typing-extensions==4.9.0")
print()
"""## [Step 2] 📁 Upload your audio files to this folder
Almost any audio or video file format is [supported](https://gist.github.com/Carleslc/1d6b922c8bf4a7e9627a6970d178b3a6).
## [Step 3] 👂 Transcribe or Translate
3.1. Choose a --task:
- `transcribe` speech to text in the same language as the source audio file.
- `translate` speech to text in English.
Translation to other languages is not supported with _Whisper_ by default.
You may choose the _Transcribe_ task and set your desired --language, but translation is not guaranteed. However, you can use **_DeepL_** later in Step 5 to translate the transcription to another language.
3.2. Set audio_file to the path of the file you want to transcribe.
- To transcribe multiple files with the same parameters, pass their file paths separated by spaces.
3.3. Run the script and wait for the transcription to complete.
You can try other parameters if the result with default parameters does not suit your needs.
[Available models and languages](https://github.com/openai/whisper#available-models-and-languages)
Setting --language to the language of the source audio file may provide better results than Auto-Detect.
You can add an optional initial --prompt to provide context about the audio or to encourage a specific writing style; see the [prompting guide](https://platform.openai.com/docs/guides/speech-to-text/prompting).
If the execution takes too long to complete, you can choose a smaller model with --model, trading off some accuracy, or use the OpenAI API.
By default the open-source models are used, but if --api_key is set with your [OpenAI API Key](https://platform.openai.com/account/api-keys) the OpenAI API is used instead, which can improve inference speed substantially but has an associated cost; see [API pricing](https://openai.com/pricing#audio-models).
When using the API some options are fixed: --model is ignored (large-v2 is used) and --coherence_preference is ignored (More coherence is used).
More parameters are available in the code `options` object.
"""
import whisper
from whisper.utils import format_timestamp, get_writer, WriteTXT
import numpy as np
import torch
import math
from openai import OpenAI
# select task
task = args.task
# set audio file path
audio_files = list(map(lambda audio_path: audio_path.strip(), args.audio_file))
for audio_path in audio_files:
if not os.path.isfile(audio_path):
raise FileNotFoundError(audio_path)
# set model
use_model = args.model
# detect device
if args.api_key:
print("Using API")
from pydub import AudioSegment
from pydub.silence import split_on_silence
else:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {'GPU' if DEVICE == 'cuda' else 'CPU ⚠️'}")
# https://medium.com/analytics-vidhya/the-google-colab-system-specification-check-69d159597417
if DEVICE == "cuda":
os.system("nvidia-smi -L")
else:
if sys_platform == 'linux':
os.system("lscpu | grep \"Model name\" | awk '{$1=$1};1'")
if use_model not in ['tiny', 'base', 'small']:
print("Not using GPU can result in a very slow execution")
print("You may want to try a smaller model (tiny, base, small)")
# display language
WHISPER_LANGUAGES = [k.title() for k in whisper.tokenizer.TO_LANGUAGE_CODE.keys()]
language = args.language
if language == "Auto-Detect":
language = "detect"
if language and language != "detect" and language not in WHISPER_LANGUAGES:
print(f"\nLanguage '{language}' is invalid")
language = "detect"
if language and language != "detect":
print(f"\nLanguage: {language}")
# load model
if args.api_key:
print()
else:
MODELS_WITH_ENGLISH_VERSION = ["tiny", "base", "small", "medium"]
if language == "English" and use_model in MODELS_WITH_ENGLISH_VERSION:
use_model += ".en"
print(f"\nLoading {use_model} model... {os.path.expanduser(f'~/.cache/whisper/{use_model}.pt')}")
model = whisper.load_model(use_model, device=DEVICE)
print(
f"Model {use_model} is {'multilingual' if model.is_multilingual else 'English-only'} "
f"and has {sum(np.prod(p.shape) for p in model.parameters()):,d} parameters.\n"
)
# set options
## https://github.com/openai/whisper/blob/v20231117/whisper/transcribe.py#L37
## https://github.com/openai/whisper/blob/v20231117/whisper/decoding.py#L81
options = {
'task': task,
'verbose': True,
'fp16': True,
'best_of': 5,
'beam_size': 5,
'patience': None,
'length_penalty': None,
'suppress_tokens': '-1',
'temperature': (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), # float or tuple
'condition_on_previous_text': args.coherence_preference,
'initial_prompt': args.prompt,
'word_timestamps': False,
}
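# These options map to whisper.transcribe / DecodingOptions parameters (see the links above).
# A sketch of possible tweaks (assumptions, not the script's defaults): a single temperature with
# greedy decoding trades some robustness for speed, and word-level timestamps can be enabled:
# options.update({'temperature': 0.0, 'beam_size': None, 'best_of': None, 'word_timestamps': True})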
if args.api_key:
api_client = OpenAI(api_key=args.api_key)
api_supported_formats = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
api_max_bytes = 25 * 1024 * 1024 # 25 MB
api_transcribe = api_client.audio.transcriptions if task == 'transcribe' else api_client.audio.translations
api_transcribe = api_transcribe.create
api_model = 'whisper-1' # large-v2
# https://platform.openai.com/docs/api-reference/audio?lang=python
api_options = {
'response_format': 'verbose_json',
}
if args.prompt:
api_options['prompt'] = args.prompt
api_temperature = options['temperature'][0] if isinstance(options['temperature'], (tuple, list)) else options['temperature']
if isinstance(api_temperature, (float, int)):
api_options['temperature'] = api_temperature
else:
raise ValueError("Invalid temperature type, it must be a float or a tuple of floats")
elif DEVICE == 'cpu':
options['fp16'] = False
torch.set_num_threads(os.cpu_count())
# execute task
# !whisper "{audio_file}" --task {task} --model {use_model} --output_dir {output_dir} --device {DEVICE} --verbose {options['verbose']}
if task == "translate":
print("-- TRANSLATE TO ENGLISH --")
else:
print("-- TRANSCRIPTION --")
results = {} # audio_path to result
for audio_path in audio_files:
print(f"\nProcessing: {audio_path}\n")
# detect language
detect_language = not language or language == "detect"
if not detect_language:
options['language'] = language
source_language_code = whisper.tokenizer.TO_LANGUAGE_CODE.get(language.lower())
elif not args.api_key:
# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio(audio_path)
audio = whisper.pad_or_trim(audio)
# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)
# detect the spoken language
_, probs = model.detect_language(mel)
source_language_code = max(probs, key=probs.get)
options['language'] = whisper.tokenizer.LANGUAGES[source_language_code].title()
print(f"Detected language: {options['language']}\n")
# transcribe
if args.api_key:
# API
if task == "transcribe" and not detect_language:
api_options['language'] = source_language_code
source_audio_name_path, source_audio_ext = os.path.splitext(audio_path)
source_audio_ext = source_audio_ext[1:]
if source_audio_ext in api_supported_formats:
api_audio_path = audio_path
api_audio_ext = source_audio_ext
else:
## convert audio file to a supported format
if options['verbose']:
print(f"API supported formats: {','.join(api_supported_formats)}")
print(f"Converting {source_audio_ext} audio to a supported format...")
api_audio_ext = 'mp3'
api_audio_path = f'{source_audio_name_path}.{api_audio_ext}'
subprocess.run(['ffmpeg', '-i', audio_path, api_audio_path], check=True, capture_output=True)
if options['verbose']:
print(api_audio_path, end='\n\n')
## split audio file in chunks
api_audio_chunks = []
audio_bytes = os.path.getsize(api_audio_path)
if audio_bytes >= api_max_bytes:
if options['verbose']:
print(f"Audio exceeds API maximum allowed file size.\nSplitting audio in chunks...")
audio_segment_file = AudioSegment.from_file(api_audio_path, api_audio_ext)
min_chunks = math.ceil(audio_bytes / (api_max_bytes / 2))
# print(f"Min chunks: {min_chunks}")
max_chunk_milliseconds = int(len(audio_segment_file) // min_chunks)
# print(f"Max chunk milliseconds: {max_chunk_milliseconds}")
def add_chunk(api_audio_chunk):
api_audio_chunk_path = f"{source_audio_name_path}_{len(api_audio_chunks) + 1}.{api_audio_ext}"
api_audio_chunk.export(api_audio_chunk_path, format=api_audio_ext)
api_audio_chunks.append(api_audio_chunk_path)
def raw_split(big_chunk):
subchunks = math.ceil(len(big_chunk) / max_chunk_milliseconds)
for subchunk_i in range(subchunks):
chunk_start = max_chunk_milliseconds * subchunk_i
chunk_end = min(max_chunk_milliseconds * (subchunk_i + 1), len(big_chunk))
add_chunk(big_chunk[chunk_start:chunk_end])
non_silent_chunks = split_on_silence(audio_segment_file,
seek_step=5, # ms
min_silence_len=1250, # ms
silence_thresh=-25, # dB
keep_silence=True) # needed to aggregate timestamps
# print(f"Non silent chunks: {len(non_silent_chunks)}")
current_chunk = non_silent_chunks[0] if non_silent_chunks else audio_segment_file
for next_chunk in non_silent_chunks[1:]:
if len(current_chunk) > max_chunk_milliseconds:
raw_split(current_chunk)
current_chunk = next_chunk
elif len(current_chunk) + len(next_chunk) <= max_chunk_milliseconds:
current_chunk += next_chunk
else:
add_chunk(current_chunk)
current_chunk = next_chunk
if len(current_chunk) > max_chunk_milliseconds:
raw_split(current_chunk)
else:
add_chunk(current_chunk)
if options['verbose']:
print(f'Total chunks: {len(api_audio_chunks)}\n')
else:
api_audio_chunks.append(api_audio_path)
## process chunks
result = None
for api_audio_chunk_path in api_audio_chunks:
## API request
with open(api_audio_chunk_path, 'rb') as api_audio_file:
api_result = api_transcribe(model=api_model, file=api_audio_file, **api_options)
api_result = api_result.model_dump() # to dict
api_segments = api_result['segments']
if result:
## update timestamps
last_segment_timestamp = result['segments'][-1]['end'] if result['segments'] else 0
for segment in api_segments:
segment['start'] += last_segment_timestamp
segment['end'] += last_segment_timestamp
## append new segments
result['segments'].extend(api_segments)
if 'duration' in result:
result['duration'] += api_result.get('duration', 0)
else:
## first request
result = api_result
if detect_language:
print(f"Detected language: {result['language'].title()}\n")
## display segments
if options['verbose']:
for segment in api_segments:
print(f"[{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}] {segment['text']}")
else:
# Open-Source
result = whisper.transcribe(model, audio_path, **options)
# fix results formatting
for segment in result['segments']:
segment['text'] = segment['text'].strip()
result['text'] = '\n'.join(map(lambda segment: segment['text'], result['segments']))
# set results for this audio file
results[audio_path] = result
"""## [Step 4] 💾 **Save results**
Run this cell to write the transcription as a file output.
Results will be available in the **audio_transcription** folder in the formats selected in `output_formats`.
If you don't see that folder, you may need to refresh 🔄 the Files folder.
Available formats: `txt,vtt,srt,tsv,json`
"""
# set output folder
output_dir = args.output_dir
# set output formats: https://github.com/openai/whisper/blob/v20231117/whisper/utils.py#L283
output_formats = args.output_formats
output_formats = output_formats.split(',')
from typing import TextIO
class WriteText(WriteTXT):
def write_result(self, result: dict, file: TextIO, **kwargs):
print(result['text'], file=file, flush=True)
def write_result(result, output_format, output_file_name):
output_format = output_format.strip()
# start captions at a non-zero timestamp (some media players do not detect the first caption)
fix_vtt = output_format == 'vtt' and result['segments'] and result['segments'][0].get('start') == 0
if fix_vtt:
result['segments'][0]['start'] += 1/1000 # +1ms
# write result in the desired format
writer = WriteText(output_dir) if output_format == 'txt' else get_writer(output_format, output_dir)
writer(result, output_file_name)
if fix_vtt:
result['segments'][0]['start'] = 0 # reset change
output_file_path = os.path.join(output_dir, f"{output_file_name}.{output_format}")
print(output_file_path)
# save results
print("\nWriting results...")
os.makedirs(output_dir, exist_ok=True)
for audio_path, result in results.items():
print(end='\n')
output_file_name = os.path.splitext(os.path.basename(audio_path))[0]
for output_format in output_formats:
write_result(result, output_format, output_file_name)
"""## [Step 5] 💬 Translate results with DeepL (API key needed)
This is an **optional** step to translate the transcription to another language using the **DeepL** API.
[Get a DeepL Developer Account API Key](https://www.deepl.com/pro-api)
Set --deepl_api_key to translate the transcription into a supported language chosen with --deepl_target_language.
"""
import deepl
# translation service options (DeepL Developer Account)
deepl_api_key = args.deepl_api_key
deepl_target_language = args.deepl_target_language
deepl_coherence_preference = args.deepl_coherence_preference
deepl_formality = "default"
if deepl_api_key and not deepl_target_language:
deepl_target_language = 'English'
if deepl_target_language:
if not deepl_api_key:
print("\nRequired: --deepl_api_key")
print("Get a DeepL Developer Account API Key: https://www.deepl.com/pro-api")
elif deepl_target_language == 'English':
deepl_target_language = "English (British)"
elif deepl_target_language == 'Chinese':
deepl_target_language = "Chinese (simplified)"
elif deepl_target_language == 'Portuguese':
deepl_target_language = "Portuguese (European)"
use_deepl_translation = deepl_api_key and deepl_target_language
if use_deepl_translation:
print(end='\n')
if args.deepl_formality != 'default':
deepl_formality = 'prefer_more' if args.deepl_formality == 'formal' else 'prefer_less'
translated_results = {} # audio_path to translated results
try:
deepl_translator = deepl.Translator(deepl_api_key)
deepl_source_languages = [lang.code.upper() for lang in deepl_translator.get_source_languages()]
deepl_target_languages_dict = deepl_translator.get_target_languages()
deepl_target_languages = [lang.name for lang in deepl_target_languages_dict]
deepl_target_language_code = next(lang.code for lang in deepl_target_languages_dict if lang.name == deepl_target_language).upper()
target_language_code = deepl_target_language_code.split('-')[0]
for audio_path, result in results.items():
deepl_usage = deepl_translator.get_usage()
if deepl_usage.any_limit_reached:
print(audio_path)
raise deepl.DeepLException("Quota for this billing period has been exceeded, message: Quota Exceeded")
else:
print(audio_path + '\n')
# translate results (DeepL)
source_language_code = whisper.tokenizer.TO_LANGUAGE_CODE.get(result['language'].lower()).upper()
if (task == 'translate' and target_language_code != 'EN') or (task == 'transcribe' and source_language_code in deepl_source_languages and source_language_code != target_language_code):
source_lang = source_language_code if task == 'transcribe' else None
translate_from = f"from {result['language'].title()} [{source_language_code}] " if source_lang else ''
print(f"DeepL: Translate results {translate_from}to {deepl_target_language} [{deepl_target_language_code}]\n")
segments = result['segments']
translated_results[audio_path] = { 'text': '', 'segments': [], 'language': deepl_target_language }
# segments / request (max 128 KiB / request, so deepl_batch_requests_size is limited to around 1000)
deepl_batch_requests_size = 200 # 200 segments * ~100 bytes / segment = ~20 KB / request (~15 minutes of speech)
for batch_segments in [segments[i:i + deepl_batch_requests_size] for i in range(0, len(segments), deepl_batch_requests_size)]:
batch_segments_text = [segment['text'] for segment in batch_segments]
if deepl_coherence_preference:
batch_segments_text = '<br/>'.join(batch_segments_text)
# DeepL request
deepl_results = deepl_translator.translate_text(
text=batch_segments_text,
source_lang=source_lang,
target_lang=deepl_target_language_code,
formality=deepl_formality,
split_sentences='nonewlines',
tag_handling='xml' if deepl_coherence_preference else None,
ignore_tags='br' if deepl_coherence_preference else None, # used to synchronize sentences with whisper lines but without splitting sentences in DeepL
outline_detection=False if deepl_coherence_preference else None
)
deepl_results_segments = deepl_results.text.split('<br/>') if deepl_coherence_preference else [deepl_result_segment.text for deepl_result_segment in deepl_results]
for j, translated_text in enumerate(deepl_results_segments):
segment = batch_segments[j]
# fix sentence formatting
translated_text = translated_text.lstrip(',.。 ').rstrip()
if not deepl_coherence_preference and translated_text and translated_text[-1] in '.。' and segment['text'][-1] not in '.。':
translated_text = translated_text[:-1]
# add translated segments
translated_results[audio_path]['segments'].append(dict(id=segment['id'], start=segment['start'], end=segment['end'], text=translated_text))
if options['verbose']:
print(f"[{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}] {translated_text}")
deepl_usage = deepl_translator.get_usage()
if deepl_usage.character.valid:
print(f"\nDeepL: Character usage: {deepl_usage.character.count} / {deepl_usage.character.limit} ({100*(deepl_usage.character.count/deepl_usage.character.limit):.2f}%)\n")
elif source_language_code == target_language_code:
print(f"Nothing to translate. Results are already in {result['language']}.")
elif task == 'transcribe' and source_language_code not in deepl_source_languages:
print(f"DeepL: {result['language']} is not yet supported")
except deepl.DeepLException as e:
if isinstance(e, deepl.AuthorizationException) and str(e) == "Authorization failure, check auth_key":
e = "Authorization failure, check deepl_api_key"
print(f"\nDeepL: [Error] {e}\n")
# save translated results (if any)
if translated_results:
print("Writing translated results...")
for audio_path, translated_result in translated_results.items():
print(end='\n')
translated_result['text'] = '\n'.join(map(lambda translated_segment: translated_segment['text'], translated_result['segments']))
output_file_name = os.path.splitext(os.path.basename(audio_path))[0]
translated_output_file_name = f"{output_file_name}_{deepl_target_language}"
for output_format in output_formats:
write_result(translated_result, output_format, translated_output_file_name)