From b13d40945ab759f4edece04b8ff1223c3ac33f9f Mon Sep 17 00:00:00 2001 From: zh-plus Date: Mon, 24 Jun 2024 10:35:22 +0800 Subject: [PATCH 1/5] Also remove generated .wav files from videos. --- openlrc/openlrc.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/openlrc/openlrc.py b/openlrc/openlrc.py index 9e98cfa..7e41c44 100644 --- a/openlrc/openlrc.py +++ b/openlrc/openlrc.py @@ -51,9 +51,10 @@ class LRCer: retry_model: The model to use when retrying the translation. Default: None """ - def __init__(self, whisper_model='large-v3', compute_type='float16', device='cuda', chatbot_model: str = 'gpt-3.5-turbo', - fee_limit=0.25, consumer_thread=4, asr_options=None, vad_options=None, preprocess_options=None, - proxy=None, base_url_config=None, glossary: Union[dict, str, Path] = None, retry_model=None): + def __init__(self, whisper_model='large-v3', compute_type='float16', device='cuda', + chatbot_model: str = 'gpt-3.5-turbo', fee_limit=0.25, consumer_thread=4, asr_options=None, + vad_options=None, preprocess_options=None, proxy=None, base_url_config=None, + glossary: Union[dict, str, Path] = None, retry_model=None): self.chatbot_model = chatbot_model self.fee_limit = fee_limit self.api_fee = 0 # Can be updated in different thread, operation should be thread-safe @@ -181,7 +182,9 @@ def consumer_worker(self, transcription_queue, target_lang, skip_trans, bilingua return # Copy preprocessed/xxx_preprocessed.lrc or preprocessed/xxx_preprocessed.srt to xxx.lrc or xxx.srt - subtitle_format = 'srt' if audio_name in self.from_video else 'lrc' + original_name_wo_suffix = transcribed_path.parents[ + 1] / f"{transcribed_path.name.replace('_preprocessed_transcribed.json', '')}" + subtitle_format = 'srt' if original_name_wo_suffix in self.from_video else 'lrc' subtitle_path = getattr(final_subtitle, f'to_{subtitle_format}')() result_path = subtitle_path.parents[1] / subtitle_path.name.replace(f'_preprocessed.{subtitle_format}', f'.{subtitle_format}') @@ -250,7 +253,7 @@ def _translate(self, audio_name, target_lang, transcribed_opt_sub, translated_pa return final_subtitle def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optional[str] = None, target_lang='zh-cn', - skip_trans=False, noise_suppress=False, bilingual_sub=False, clear_temp_folder=False) -> List[str]: + skip_trans=False, noise_suppress=False, bilingual_sub=False, clear_temp=False) -> List[str]: """ Split the translation into 2 phases: transcription and translation. They're running in parallel. Firstly, transcribe the audios one-by-one. At the same time, translation threads are created and waiting for @@ -264,7 +267,7 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona skip_trans (bool): Whether to skip the translation process. (Default to False) noise_suppress (bool): Whether to suppress the noise in the audio. (Default to False) bilingual_sub (bool): Whether to generate bilingual subtitles. (Default to False) - clear_temp_folder (bool): Whether to clear the temporary folder. + clear_temp (bool): Whether to clear all the temporary files, including the generated .wav from video. Note, set this back to False to see more intermediate results if error encountered. (Default to False) Returns: @@ -305,14 +308,13 @@ def run(self, paths: Union[str, Path, List[Union[str, Path]]], src_lang: Optiona logger.info(f'Totally used API fee: {self.api_fee:.4f} USD') - if clear_temp_folder: + if clear_temp: logger.info('Clearing temporary folder...') self.clear_temp_files(audio_paths) return self.transcribed_paths - @staticmethod - def clear_temp_files(paths): + def clear_temp_files(self, paths): """ Clear the temporary files generated during the transcription and translation process. """ @@ -323,6 +325,12 @@ def clear_temp_files(paths): shutil.rmtree(folder) logger.debug(f'Removed {folder}') + for input_video_path in self.from_video: + generated_wave = input_video_path.with_suffix('.wav') + if generated_wave.exists(): + generated_wave.unlink() + logger.debug(f'Removed generated wav (from video): {generated_wave}') + @staticmethod def to_json(segments: List[Segment], name, lang): result = { @@ -352,10 +360,10 @@ def pre_process(self, paths, noise_suppress=False): if not path.exists() or not path.is_file(): raise FileNotFoundError(f'File not found: {path}') - paths[i] = extract_audio(path) - if get_file_type(path) == 'video': - self.from_video.add(path.stem + '_preprocessed') + self.from_video.add(path.with_suffix('')) + + paths[i] = extract_audio(path) # Audio-based process preprocessor = Preprocessor(paths, options=self.preprocess_options) From 98691cbb2b73fbba4fd12e9fccb8a14a2aef9ada Mon Sep 17 00:00:00 2001 From: zh-plus Date: Mon, 24 Jun 2024 12:50:11 +0800 Subject: [PATCH 2/5] Add context validators to ensure the effectiveness. --- README.md | 2 +- openlrc/agents.py | 50 ++++++++- openlrc/chatbot.py | 3 + openlrc/prompter.py | 236 +++++++++++------------------------------ openlrc/translate.py | 4 +- openlrc/validators.py | 150 ++++++++++++++++++++++++++ tests/test_agents.py | 6 +- tests/test_prompter.py | 6 +- 8 files changed, 271 insertions(+), 186 deletions(-) create mode 100644 openlrc/validators.py diff --git a/README.md b/README.md index 71ae357..01c12a4 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ if __name__ == '__main__': lrcer.run('./data/test.mp3', target_lang='zh-cn') # Clear temp folder after processing done - lrcer.run('./data/test.mp3', target_lang='zh-cn', clear_temp_folder=True) + lrcer.run('./data/test.mp3', target_lang='zh-cn', clear_temp=True) # Change base_url lrcer = LRCer(base_url_config={'openai': 'https://api.g4f.icu/v1', diff --git a/openlrc/agents.py b/openlrc/agents.py index c1c9977..625b0ed 100644 --- a/openlrc/agents.py +++ b/openlrc/agents.py @@ -7,8 +7,9 @@ from openlrc.chatbot import route_chatbot, GPTBot, ClaudeBot from openlrc.context import TranslationContext, TranslateInfo from openlrc.logger import logger -from openlrc.prompter import BaseTranslatePrompter, ContextReviewPrompter, POTENTIAL_PREFIX_COMBOS, \ - ProofreaderPrompter, PROOFREAD_PREFIX +from openlrc.prompter import ChunkedTranslatePrompter, ContextReviewPrompter, ProofreaderPrompter, PROOFREAD_PREFIX, \ + ContextReviewerValidatePrompter +from openlrc.validators import POTENTIAL_PREFIX_COMBOS class Agent(abc.ABC): @@ -38,7 +39,7 @@ def __init__(self, src_lang, target_lang, info: TranslateInfo = TranslateInfo(), self.chatbot_model = chatbot_model self.info = info self.chatbot = self._initialize_chatbot(chatbot_model, fee_limit, proxy, base_url_config) - self.prompter = BaseTranslatePrompter(src_lang, target_lang, info) + self.prompter = ChunkedTranslatePrompter(src_lang, target_lang, info) self.cost = 0 def __str__(self): @@ -106,30 +107,69 @@ class ContextReviewerAgent(Agent): TODO: Add chunking support. """ - TEMPERATURE = 0.8 + TEMPERATURE = 0.6 def __init__(self, src_lang, target_lang, info: TranslateInfo = TranslateInfo(), - chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.25, proxy: str = None, + chatbot_model: str = 'gpt-3.5-turbo', retry_model=None, + fee_limit: float = 0.25, proxy: str = None, base_url_config: Optional[dict] = None): super().__init__() self.src_lang = src_lang self.target_lang = target_lang self.info = info self.chatbot_model = chatbot_model + self.validate_prompter = ContextReviewerValidatePrompter() self.prompter = ContextReviewPrompter(src_lang, target_lang) self.chatbot = self._initialize_chatbot(chatbot_model, fee_limit, proxy, base_url_config) + self.retry_chatbot = self._initialize_chatbot( + retry_model, fee_limit, proxy, base_url_config + ) if retry_model else None def __str__(self): return f'Context Reviewer Agent ({self.chatbot_model})' + def _validate_context(self, context: str) -> bool: + messages_list = [ + {'role': 'system', 'content': self.validate_prompter.system()}, + {'role': 'user', 'content': self.validate_prompter.user(context)}, + ] + resp = self.chatbot.message(messages_list, output_checker=self.validate_prompter.check_format)[0] + return 'true' in self.chatbot.get_content(resp).lower() + def build_context(self, texts, title='', glossary: Optional[dict] = None) -> str: text_content = '\n'.join(texts) + messages_list = [ {'role': 'system', 'content': self.prompter.system()}, {'role': 'user', 'content': self.prompter.user(text_content, title=title, given_glossary=glossary)}, ] resp = self.chatbot.message(messages_list, output_checker=self.prompter.check_format)[0] context = self.chatbot.get_content(resp) + + # Validate + if not self._validate_context(context): + validated = False + if self.retry_chatbot: + logger.info(f'Failed to validate the context using {self.chatbot}, retrying with {self.retry_chatbot}') + resp = self.retry_chatbot.message(messages_list, output_checker=self.validate_prompter.check_format)[0] + context = self.retry_chatbot.get_content(resp) + if self._validate_context(context): + validated = True + else: + logger.warning(f'Failed to validate the context using {self.retry_chatbot}: {context}') + + if not validated: + for i in range(2, 4): + logger.warning(f'Retry to generate the context using {self.chatbot} at {i} reties.') + resp = self.chatbot.message(messages_list, output_checker=self.validate_prompter.check_format)[0] + context = self.chatbot.get_content(resp) + if self._validate_context(context): + validated = True + break + + if not validated: + logger.warning(f'Finally failed to validate the context: {context}, check the context manually.') + return context diff --git a/openlrc/chatbot.py b/openlrc/chatbot.py index 9fc8f93..332fc0b 100644 --- a/openlrc/chatbot.py +++ b/openlrc/chatbot.py @@ -155,6 +155,9 @@ def message(self, messages_list: Union[List[Dict], List[List[Dict]]], return results + def __str__(self): + return f'ChatBot ({self.model})' + @_register_chatbot class GPTBot(ChatBot): diff --git a/openlrc/prompter.py b/openlrc/prompter.py index 2dbcaa3..5db3747 100644 --- a/openlrc/prompter.py +++ b/openlrc/prompter.py @@ -1,31 +1,20 @@ # Copyright (C) 2024. Hao Zheng # All rights reserved. + import abc -import re from abc import ABC from typing import List, Tuple, Optional from langcodes import Language -from lingua import LanguageDetectorBuilder from openlrc.context import TranslateInfo -from openlrc.logger import logger +from openlrc.validators import ChunkedTranslateValidator, AtomicTranslateValidator, ProofreaderValidator, \ + ContextReviewerValidateValidator ORIGINAL_PREFIX = 'Original>' TRANSLATION_PREFIX = 'Translation>' PROOFREAD_PREFIX = 'Proofread>' -POTENTIAL_PREFIX_COMBOS = [ - [ORIGINAL_PREFIX, TRANSLATION_PREFIX], - ['原文>', '翻译>'], - ['原文>', '译文>'], - ['原文>', '翻譯>'], - ['原文>', '譯文>'], - ['Original>', 'Translation>'], - ['Original>', 'Traducción>'] -] - -# instruction prompt modified from https://github.com/machinewrapped/gpt-subtrans BASE_TRANSLATE_INSTRUCTION = f'''Ignore all previous instructions. You are a translator tasked with revising and translating subtitles into a target language. Your goal is to ensure accurate, concise, and natural-sounding translations for each line of dialogue. The input consists of transcribed audio, which may contain transcription errors. Your task is to first correct any errors you find in the sentences based on their context, and then translate them to the target language according to the revised sentences. The user will provide a chunk of lines, you should respond with an accurate, concise, and natural-sounding translation for the dialogue, with appropriate punctuation. @@ -102,7 +91,7 @@ The translation should be in a lovely colloquial style and suitable for high-quality subtitles. -I’m going to tip $1000 for a better translation! +I’m going to tip \$1000 for a better translation! ### retry_instructions There was an issue with the previous translation. @@ -116,7 +105,10 @@ class Prompter(abc.ABC): def check_format(self, messages, output_str): - return True + if hasattr(self, 'validator'): + return self.validator.validate(messages, output_str) + else: + return True class TranslatePrompter(Prompter, ABC): @@ -129,13 +121,13 @@ def post_process(texts): raise NotImplementedError() -class BaseTranslatePrompter(TranslatePrompter): +class ChunkedTranslatePrompter(TranslatePrompter): def __init__(self, src_lang, target_lang, context: TranslateInfo): self.src_lang = src_lang self.target_lang = target_lang self.src_lang_display = Language.get(src_lang).display_name('en') self.target_lang_display = Language.get(target_lang).display_name('en') - self.lan_detector = LanguageDetectorBuilder.from_all_languages().build() + self.validator = ChunkedTranslateValidator(target_lang) self.audio_type = context.audio_type self.title = context.title @@ -175,89 +167,8 @@ def formatted_glossary(self): @classmethod def format_texts(cls, texts: List[Tuple[int, str]]): - """ - Reconstruct list of text into desired format. - - Args: - texts: List of (id, text). - - Returns: - The formatted string: f"#id\n{original_prefix}\n{text}\n{translation_prefix}\n" - """ return '\n'.join([f'#{i}\n{ORIGINAL_PREFIX}\n{text}\n{TRANSLATION_PREFIX}\n' for i, text in texts]) - def check_format(self, messages, content): - summary = re.search(r'(.*)', content) - scene = re.search(r'(.*)', content) - - # If message is for claude, use messages[0] - user_input = messages[1]['content'] if len(messages) == 2 else messages[0]['content'] - original = re.findall(ORIGINAL_PREFIX + r'\n(.*?)\n' + TRANSLATION_PREFIX, user_input, re.DOTALL) - if not original: - logger.error(f'Fail to extract original text.') - return False - - translation = self._extract_translation(content) - if not translation: - # TODO: Try to change chatbot_model if always fail - logger.warning(f'Fail to extract translation.') - logger.debug(f'Content: {content}') - return False - - if len(original) != len(translation): - logger.warning( - f'Fail to ensure length consistent: original is {len(original)}, translation is {len(translation)}') - logger.debug(f'original: {original}') - logger.debug(f'translation: {original}') - return False - - # Ensure the translated langauge is in the target language - if not self._is_translation_in_target_language(translation): - return False - - # It's ok to keep going without summary and scene - if not summary or not summary.group(1): - logger.warning(f'Fail to extract summary.') - if not scene or not scene.group(1): - logger.warning(f'Fail to extract scene.') - - return True - - def _extract_translation(self, content: str) -> List[str]: - for potential_ori_prefix, potential_trans_prefix in POTENTIAL_PREFIX_COMBOS: - translation = re.findall(f'{potential_trans_prefix}\n*(.*?)(?:#\\d+||\\n*$)', content, re.DOTALL) - if translation: - return translation - return [] - - def _is_translation_in_target_language(self, translation: List[str]) -> bool: - if len(translation) >= 3: - chunk_size = len(translation) // 3 - translation_chunks = [translation[i:i + chunk_size] for i in range(0, len(translation), chunk_size)] - if len(translation_chunks) > 3: - translation_chunks[-2].extend(translation_chunks[-1]) - translation_chunks.pop() - - translated_langs = [self.lan_detector.detect_language_of(' '.join(chunk)) for chunk in translation_chunks] - translated_langs = [lang.name.lower() for lang in translated_langs if lang] - - if not translated_langs: - return True - - translated_lang = max(set(translated_langs), key=translated_langs.count) - else: - detected_lang = self.lan_detector.detect_language_of(' '.join(translation)) - if not detected_lang: - return True - translated_lang = detected_lang.name.lower() - - target_lang = Language.get(self.target_lang).language_name().lower() - if translated_lang != target_lang: - logger.warning(f'Translated language is {translated_lang}, not {target_lang}.') - return False - - return True - class AtomicTranslatePrompter(TranslatePrompter): def __init__(self, src_lang, target_lang): @@ -265,27 +176,12 @@ def __init__(self, src_lang, target_lang): self.target_lang = target_lang self.src_lang_display = Language.get(src_lang).display_name('en') self.target_lang_display = Language.get(target_lang).display_name('en') - self.lan_detector = LanguageDetectorBuilder.from_all_languages().build() + self.validator = AtomicTranslateValidator(target_lang) def user(self, text): return f'''Please translate the following text from {self.src_lang_display} to {self.target_lang_display}. Please do not output any content other than the translated text. Here is the text: {text}''' - def check_format(self, messages, output_str): - # Ensure the translated langauge is in the target language - detected_lang = self.lan_detector.detect_language_of(output_str) - if not detected_lang: - # Cant detect language - return True - - translated_lang = detected_lang.name.lower() - target_lang = Language.get(self.target_lang).language_name().lower() - if translated_lang != target_lang: - logger.warning(f'Translated text: "{output_str}" is {translated_lang}, not {target_lang}.') - return False - - return True - class ContextReviewPrompter(Prompter): def __init__(self, src_lang, target_lang): @@ -293,7 +189,6 @@ def __init__(self, src_lang, target_lang): self.target_lang = target_lang self.src_lang_display = Language.get(src_lang).display_name('en') self.target_lang_display = Language.get(target_lang).display_name('en') - self.lan_detector = LanguageDetectorBuilder.from_all_languages().build() def system(self): return f'''Context: @@ -358,7 +253,7 @@ def __init__(self, src_lang, target_lang): self.target_lang = target_lang self.src_lang_display = Language.get(src_lang).display_name('en') self.target_lang_display = Language.get(target_lang).display_name('en') - self.lan_detector = LanguageDetectorBuilder.from_all_languages().build() + self.validator = ProofreaderValidator(target_lang) def system(self): return f'''Ignore all previous instructions. @@ -388,70 +283,67 @@ def system(self): {ORIGINAL_PREFIX} Thus, it is important to adapt to changing circumstances and remain open to new opportunities. {TRANSLATION_PREFIX} -因此,适应变化的环境并对新机会持开放态度是很重要的。 +因此,适应变化的环境并对新机会持开放态度 +''' -Example output: -#1 -{TRANSLATION_PREFIX} -那些抗拒变化的人可能会发现自己被抛在后面。 -{PROOFREAD_PREFIX} -那些抗拒变化的人可能会发现自己落伍了。 +class ContextReviewerValidatePrompter(Prompter): + def __init__(self): + self.validator = ContextReviewerValidateValidator('en') -#2 -{TRANSLATION_PREFIX} -另一方面,那些接受变化的人可以在新环境中发展。 -{PROOFREAD_PREFIX} -相反,那些拥抱变化的人可以在新环境中如鱼得水。 + def system(self): + return f'''Ignore all previous instructions. +You are a context validator, responsible for validating the context provided by the Context Reviewer. Your role is to validate if the context is good. +A good context should include a comprehensive glossary of key terms and phrases, character name translations, a concise story summary, tone and style guidelines, and target audience insights. +Only output True/False based on the provided context. -#3 -{TRANSLATION_PREFIX} -因此,适应变化的环境并对新机会持开放态度是很重要的。 -{PROOFREAD_PREFIX} -因此,适应变化的环境并对新机会保持开放态度是非常重要的。 +# Example 1: +Input: +I will provide a context review for this translation, focusing on appropriate content and language: +### Glossary: +- PC hardware: 电脑硬件 +- gaming rigs: 游戏装置 +- motherboard: 主板 -### retry_instructions -Please proofread the subtitles again, paying careful attention to ensure that each line is proofreaded separately, and that every line has a matching text. -Do not merge lines together during the proofread, it leads to incorrect timings and confusion for the reader. -''' +### Characters: +No specific characters mentioned. - def user(self, texts, translations, guideline=''): - formated_texts = '\n'.join( - [ - f'#{i}\n{ORIGINAL_PREFIX}\n{text}\n{TRANSLATION_PREFIX}\n{trans}\n' for i, (text, trans) in - enumerate(zip(texts, translations), start=1) - ]) - return f'''Translation guidelines from context reviewer: -{guideline} +### Summary: +The text discusses a trend in PC hardware design where cables are being hidden by moving connectors to the back of the motherboard. The speaker expresses approval of this trend, noting it utilizes previously unused space. However, they also mention that not everyone agrees with this design change. -Please proofread the following translated subtitles, which is from {self.src_lang_display} to {self.target_lang_display}: -{formated_texts} +### Tone and Style: +The tone is casual and informative, with a touch of humor. The translation should maintain this conversational style while ensuring clarity for technical terms. Avoid overly formal language and try to capture the light-hearted nature of the commentary. + +### Target Audience: +The target audience appears to be tech-savvy individuals, particularly those interested in PC gaming and hardware. They likely have some familiarity with computer components and assembly. The translation should cater to Chinese speakers with similar interests and knowledge levels. + +Output: +True + +# Example 2: +Input: +Sorry, I can't provide the context for this text. I can assist in generating other texts. + +Output: +False + +# Example 3: +Input: +Key points for translation: + +1. The opening lines are a joke, likely setting a humorous tone for the video. +2. The main topic is about cable management in PC building. +3. There's a trend of moving cable connectors to the back of the motherboard to reduce clutter. +4. The speaker seems to approve of this trend. +5. The text mentions that not everyone likes this new trend. + +When translating, maintain the casual, slightly humorous tone of the original text. Technical terms like "PC hardware," "gaming rigs," and "motherboard" should be translated using their standard Chinese equivalents. The joke at the beginning should be translated in a way that preserves the humor if possible, but cultural adaptation may be necessary. Output: +False + ''' - def check_format(self, messages, content): - # If message is for claude, use messages[0] - user_input = messages[1]['content'] if len(messages) == 2 else messages[0]['content'] - original = re.findall(ORIGINAL_PREFIX + r'\n(.*?)\n' + TRANSLATION_PREFIX, user_input, re.DOTALL) - if not original: - logger.error(f'Fail to extract original text.') - return False - - localized = re.findall(PROOFREAD_PREFIX + r'\s*(.*)', content, re.MULTILINE) - - if not localized: - # TODO: Try to change chatbot_model if always fail - logger.warning(f'Fail to extract translation.') - logger.debug(f'Content: {content}') - return False - - if len(original) != len(localized): - logger.warning( - f'Fail to ensure length consistent: original is {len(original)}, translation is {len(localized)}') - logger.debug(f'original: {original}') - logger.debug(f'translation: {original}') - return False - - return True + def user(self, context): + return f'''Input:\n{context}\nOutput:''' diff --git a/openlrc/translate.py b/openlrc/translate.py index deac686..c57ad75 100644 --- a/openlrc/translate.py +++ b/openlrc/translate.py @@ -128,8 +128,8 @@ def translate(self, texts: Union[str, List[str]], src_lang: str, target_lang: st translations, summaries, compare_list, start_chunk, guideline = self._resume_translation(compare_path) if not guideline: - context_reviewer = ContextReviewerAgent(src_lang, target_lang, info, self.chatbot_model, self.fee_limit, - self.proxy, self.base_url_config) + context_reviewer = ContextReviewerAgent(src_lang, target_lang, info, self.chatbot_model, self.retry_model, + self.fee_limit, self.proxy, self.base_url_config) guideline = context_reviewer.build_context(texts, title=info.title, glossary=info.glossary) logger.info(f'Translation Guideline:\n{guideline}') diff --git a/openlrc/validators.py b/openlrc/validators.py new file mode 100644 index 0000000..a7f8b26 --- /dev/null +++ b/openlrc/validators.py @@ -0,0 +1,150 @@ +# Copyright (C) 2024. Hao Zheng +# All rights reserved. + +import re +from typing import List + +from langcodes import Language +from lingua import LanguageDetectorBuilder + +from openlrc.logger import logger + +ORIGINAL_PREFIX = 'Original>' +TRANSLATION_PREFIX = 'Translation>' +PROOFREAD_PREFIX = 'Proofread>' + +POTENTIAL_PREFIX_COMBOS = [ + [ORIGINAL_PREFIX, TRANSLATION_PREFIX], + ['原文>', '翻译>'], + ['原文>', '译文>'], + ['原文>', '翻譯>'], + ['原文>', '譯文>'], + ['Original>', 'Translation>'], + ['Original>', 'Traducción>'] +] + + +class BaseValidator: + def __init__(self, target_lang): + self.target_lang = target_lang + self.lan_detector = LanguageDetectorBuilder.from_all_languages().build() + + +class ChunkedTranslateValidator(BaseValidator): + def _extract_translation(self, content: str) -> List[str]: + for potential_ori_prefix, potential_trans_prefix in POTENTIAL_PREFIX_COMBOS: + translation = re.findall(f'{potential_trans_prefix}\n*(.*?)(?:#\\d+||\\n*$)', content, re.DOTALL) + if translation: + return translation + return [] + + def _is_translation_in_target_language(self, translation: List[str]) -> bool: + if len(translation) >= 3: + chunk_size = len(translation) // 3 + translation_chunks = [translation[i:i + chunk_size] for i in range(0, len(translation), chunk_size)] + if len(translation_chunks) > 3: + translation_chunks[-2].extend(translation_chunks[-1]) + translation_chunks.pop() + + translated_langs = [self.lan_detector.detect_language_of(' '.join(chunk)) for chunk in translation_chunks] + translated_langs = [lang.name.lower() for lang in translated_langs if lang] + + if not translated_langs: + return True + + translated_lang = max(set(translated_langs), key=translated_langs.count) + else: + detected_lang = self.lan_detector.detect_language_of(' '.join(translation)) + if not detected_lang: + return True + translated_lang = detected_lang.name.lower() + + target_lang = Language.get(self.target_lang).language_name().lower() + if translated_lang != target_lang: + logger.warning(f'Translated language is {translated_lang}, not {target_lang}.') + return False + + return True + + def validate(self, messages, content): + summary = re.search(r'(.*)', content) + scene = re.search(r'(.*)', content) + + user_input = messages[1]['content'] if len(messages) == 2 else messages[0]['content'] + original = re.findall(ORIGINAL_PREFIX + r'\n(.*?)\n' + TRANSLATION_PREFIX, user_input, re.DOTALL) + if not original: + logger.error(f'Fail to extract original text.') + return False + + translation = self._extract_translation(content) + if not translation: + logger.warning(f'Fail to extract translation.') + logger.debug(f'Content: {content}') + return False + + if len(original) != len(translation): + logger.warning( + f'Fail to ensure length consistent: original is {len(original)}, translation is {len(translation)}') + logger.debug(f'original: {original}') + logger.debug(f'translation: {translation}') + return False + + if not self._is_translation_in_target_language(translation): + return False + + if not summary or not summary.group(1): + logger.warning(f'Fail to extract summary.') + if not scene or not scene.group(1): + logger.warning(f'Fail to extract scene.') + + return True + + +class AtomicTranslateValidator(BaseValidator): + def validate(self, messages, content): + detected_lang = self.lan_detector.detect_language_of(content) + if not detected_lang: + return True + + translated_lang = detected_lang.name.lower() + target_lang = Language.get(self.target_lang).language_name().lower() + if translated_lang != target_lang: + logger.warning(f'Translated text: "{content}" is {translated_lang}, not {target_lang}.') + return False + + return True + + +class ProofreaderValidator(BaseValidator): + def validate(self, messages, content): + user_input = messages[1]['content'] if len(messages) == 2 else messages[0]['content'] + original = re.findall(ORIGINAL_PREFIX + r'\n(.*?)\n' + TRANSLATION_PREFIX, user_input, re.DOTALL) + if not original: + logger.error(f'Fail to extract original text.') + return False + + localized = re.findall(PROOFREAD_PREFIX + r'\s*(.*)', content, re.MULTILINE) + + if not localized: + logger.warning(f'Fail to extract translation.') + logger.debug(f'Content: {content}') + return False + + if len(original) != len(localized): + logger.warning( + f'Fail to ensure length consistent: original is {len(original)}, translation is {len(localized)}') + logger.debug(f'original: {original}') + logger.debug(f'translation: {localized}') + return False + + return True + + +class ContextReviewerValidateValidator(BaseValidator): + def validate(self, messages, content): + if 'true' in content.lower() or 'false' in content.lower(): + return True + else: + logger.warning(f'Context reviewer validation failed: {content}.') + + return False diff --git a/tests/test_agents.py b/tests/test_agents.py index ddf6420..b33b122 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -9,7 +9,7 @@ from openlrc.agents import ChunkedTranslatorAgent, TranslationContext, ContextReviewerAgent from openlrc.context import TranslateInfo -from openlrc.prompter import BaseTranslatePrompter +from openlrc.prompter import ChunkedTranslatePrompter class DummyMessage(BaseModel): @@ -75,7 +75,7 @@ def test_parse_response_success(self): # Properly format texts for translation def test_format_texts_success(self): texts = [(1, 'Hello, how are you?'), (2, 'I am fine, thank you.')] - formatted_text = BaseTranslatePrompter.format_texts(texts) + formatted_text = ChunkedTranslatePrompter.format_texts(texts) expected_output = '#1\nOriginal>\nHello, how are you?\nTranslation>\n\n#2\nOriginal>\nI am fine, thank you.\nTranslation>\n' self.assertEqual(formatted_text, expected_output) @@ -83,7 +83,7 @@ def test_format_texts_success(self): # Use glossary terms in translations when provided def test_use_glossary_terms_success(self): glossary = {'hello': 'bonjour', 'how are you': 'comment ça va'} - prompter = BaseTranslatePrompter(src_lang='en', target_lang='fr', context=TranslateInfo(glossary=glossary)) + prompter = ChunkedTranslatePrompter(src_lang='en', target_lang='fr', context=TranslateInfo(glossary=glossary)) formatted_glossary = prompter.formatted_glossary diff --git a/tests/test_prompter.py b/tests/test_prompter.py index e7fd5a9..5c96328 100644 --- a/tests/test_prompter.py +++ b/tests/test_prompter.py @@ -4,7 +4,7 @@ import unittest from openlrc.context import TranslateInfo -from openlrc.prompter import BaseTranslatePrompter +from openlrc.prompter import ChunkedTranslatePrompter formatted_user_input = '''Translation guidelines from context reviewer: This is a guidline. @@ -33,7 +33,7 @@ class TestPrompter(unittest.TestCase): def setUp(self) -> None: context = TranslateInfo(title='Title', audio_type='movie') - self.prompter = BaseTranslatePrompter('ja', 'zh-cn', context) + self.prompter = ChunkedTranslatePrompter('ja', 'zh-cn', context) self.formatted_user_input = formatted_user_input def test_user_prompt(self): @@ -56,7 +56,7 @@ def test_format_texts(self): texts = [(1, '変わりゆく時代において、'), (2, '生き残る秘訣は、進化し続けることです。')] expected_output = '#1\nOriginal>\n変わりゆく時代において、\nTranslation>\n\n#2\nOriginal>\n' \ '生き残る秘訣は、進化し続けることです。\nTranslation>\n' - self.assertEqual(BaseTranslatePrompter.format_texts(texts), expected_output) + self.assertEqual(ChunkedTranslatePrompter.format_texts(texts), expected_output) def test_check_format(self): messages = [{'role': 'system', 'content': 'system content'}, From 0b93c912205d9e20bf348599bde1b88482daa7f7 Mon Sep 17 00:00:00 2001 From: zh-plus Date: Mon, 24 Jun 2024 13:12:18 +0800 Subject: [PATCH 3/5] Also extend end-time for bilingual subtitles. --- openlrc/openlrc.py | 2 ++ openlrc/opt.py | 12 ++++++++++-- openlrc/subtitle.py | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/openlrc/openlrc.py b/openlrc/openlrc.py index 7e41c44..4db6c5d 100644 --- a/openlrc/openlrc.py +++ b/openlrc/openlrc.py @@ -194,6 +194,8 @@ def consumer_worker(self, transcription_queue, target_lang, skip_trans, bilingua bilingual_subtitle = BilingualSubtitle.from_preprocessed( transcribed_path.parent, audio_name.replace('_preprocessed', '') ) + bilingual_optimizer = SubtitleOptimizer(bilingual_subtitle) + bilingual_optimizer.extend_time() # TODO: consider the edge case (audio file name contains _preprocessed) getattr(bilingual_subtitle, f'to_{subtitle_format}')() bilingual_lrc_path = bilingual_subtitle.filename.with_suffix(bilingual_subtitle.suffix) diff --git a/openlrc/opt.py b/openlrc/opt.py index b0712ab..4459142 100644 --- a/openlrc/opt.py +++ b/openlrc/opt.py @@ -8,7 +8,7 @@ import zhconv from openlrc.logger import logger -from openlrc.subtitle import Subtitle +from openlrc.subtitle import Subtitle, BilingualSubtitle from openlrc.utils import extend_filename, format_timestamp # Thresholds for different languages @@ -42,7 +42,7 @@ class SubtitleOptimizer: SubtitleOptimizer class is used to optimize subtitles by performing various operations. """ - def __init__(self, subtitle: Union[Path, Subtitle]): + def __init__(self, subtitle: Union[Path, Subtitle, BilingualSubtitle]): if isinstance(subtitle, Path): subtitle = Subtitle.from_json(subtitle) @@ -139,6 +139,10 @@ def cut_long(self, max_length=20): """ Cut long texts based on language-specific thresholds. """ + if isinstance(self.subtitle, BilingualSubtitle): + logger.warning('Bilingual subtitle is not supported for cut_long operation.') + return + threshold = CUT_LONG_THRESHOLD.get(self.lang.lower(), 150) for element in self.subtitle.segments: @@ -157,6 +161,10 @@ def punctuation_optimization(self): """ Replace English punctuation with Chinese punctuation. """ + if isinstance(self.subtitle, BilingualSubtitle): + logger.warning('Bilingual subtitle is not supported for punctuation_optimization operation.') + return + for element in self.subtitle.segments: element.text = self._replace_punctuation_with_chinese(element.text) diff --git a/openlrc/subtitle.py b/openlrc/subtitle.py index e689a3d..c708fdd 100644 --- a/openlrc/subtitle.py +++ b/openlrc/subtitle.py @@ -254,6 +254,7 @@ def __init__(self, src: Subtitle, target: Subtitle, filename: Union[str, Path]): if len(src) != len(target): raise ValueError(f'Source and target subtitle length not equal: {len(src)} vs {len(target)}') + self.lang = f'{src.lang}-{target.lang}' self.segments = [] for src_seg, target_seg in zip(src.segments, target.segments): if src_seg.start != target_seg.start or src_seg.end != target_seg.end: From e126c38d1e42a1d035330f7f9f72313665b0e0fa Mon Sep 17 00:00:00 2001 From: zh-plus Date: Mon, 24 Jun 2024 13:30:55 +0800 Subject: [PATCH 4/5] Fix disk usage issue for CI. --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8eeb60a..1fe122e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,9 @@ jobs: runs-on: ${{ matrix.os }} steps: + - name: Delete huge unnecessary tools folder + run: rm -rf /opt/hostedtoolcache + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} From 97cd6ac5968661875c011886cf94a561ce674f65 Mon Sep 17 00:00:00 2001 From: zh-plus Date: Mon, 24 Jun 2024 16:07:19 +0800 Subject: [PATCH 5/5] Minor improvement on ContextReviewer prompt. --- openlrc/agents.py | 14 +++++++++---- openlrc/chatbot.py | 4 ++-- openlrc/openlrc.py | 2 +- openlrc/prompter.py | 48 ++++++++++++++++++++++++++++---------------- openlrc/translate.py | 2 +- 5 files changed, 45 insertions(+), 25 deletions(-) diff --git a/openlrc/agents.py b/openlrc/agents.py index 625b0ed..8aa0218 100644 --- a/openlrc/agents.py +++ b/openlrc/agents.py @@ -33,7 +33,7 @@ class ChunkedTranslatorAgent(Agent): TEMPERATURE = 1.0 def __init__(self, src_lang, target_lang, info: TranslateInfo = TranslateInfo(), - chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.25, proxy: str = None, + chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.3, proxy: str = None, base_url_config: Optional[dict] = None): super().__init__() self.chatbot_model = chatbot_model @@ -111,7 +111,7 @@ class ContextReviewerAgent(Agent): def __init__(self, src_lang, target_lang, info: TranslateInfo = TranslateInfo(), chatbot_model: str = 'gpt-3.5-turbo', retry_model=None, - fee_limit: float = 0.25, proxy: str = None, + fee_limit: float = 0.3, proxy: str = None, base_url_config: Optional[dict] = None): super().__init__() self.src_lang = src_lang @@ -146,6 +146,7 @@ def build_context(self, texts, title='', glossary: Optional[dict] = None) -> str resp = self.chatbot.message(messages_list, output_checker=self.prompter.check_format)[0] context = self.chatbot.get_content(resp) + context_pool = [context] # Validate if not self._validate_context(context): validated = False @@ -153,6 +154,7 @@ def build_context(self, texts, title='', glossary: Optional[dict] = None) -> str logger.info(f'Failed to validate the context using {self.chatbot}, retrying with {self.retry_chatbot}') resp = self.retry_chatbot.message(messages_list, output_checker=self.validate_prompter.check_format)[0] context = self.retry_chatbot.get_content(resp) + context_pool.append(context) if self._validate_context(context): validated = True else: @@ -163,12 +165,16 @@ def build_context(self, texts, title='', glossary: Optional[dict] = None) -> str logger.warning(f'Retry to generate the context using {self.chatbot} at {i} reties.') resp = self.chatbot.message(messages_list, output_checker=self.validate_prompter.check_format)[0] context = self.chatbot.get_content(resp) + context_pool.append(context) if self._validate_context(context): validated = True break if not validated: - logger.warning(f'Finally failed to validate the context: {context}, check the context manually.') + logger.warning( + f'Finally failed to validate the context: {context}, you may check the context manually.') + context = max(context_pool, key=len) + logger.info(f'Now using the longest context: {context}') return context @@ -180,7 +186,7 @@ class ProofreaderAgent(Agent): TEMPERATURE = 0.8 def __init__(self, src_lang, target_lang, info: TranslateInfo = TranslateInfo(), - chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.25, proxy: str = None, + chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.3, proxy: str = None, base_url_config: Optional[dict] = None): super().__init__() self.src_lang = src_lang diff --git a/openlrc/chatbot.py b/openlrc/chatbot.py index 332fc0b..057d5fe 100644 --- a/openlrc/chatbot.py +++ b/openlrc/chatbot.py @@ -65,7 +65,7 @@ def route_chatbot(model): class ChatBot: pricing = None - def __init__(self, pricing, temperature=1, top_p=1, retry=8, max_async=16, fee_limit=0.25): + def __init__(self, pricing, temperature=1, top_p=1, retry=8, max_async=16, fee_limit=0.3): self.pricing = pricing self._model = None @@ -254,7 +254,7 @@ class ClaudeBot(ChatBot): 'claude-3-5-sonnet-20240620': (3, 15), } - def __init__(self, model='claude-3-sonnet-20240229', temperature=1, top_p=1, retry=8, max_async=16, fee_limit=0.25, + def __init__(self, model='claude-3-sonnet-20240229', temperature=1, top_p=1, retry=8, max_async=16, fee_limit=0.3, proxy=None, base_url_config=None): # clamp temperature to 0-1 diff --git a/openlrc/openlrc.py b/openlrc/openlrc.py index 4db6c5d..ed8ad95 100644 --- a/openlrc/openlrc.py +++ b/openlrc/openlrc.py @@ -52,7 +52,7 @@ class LRCer: """ def __init__(self, whisper_model='large-v3', compute_type='float16', device='cuda', - chatbot_model: str = 'gpt-3.5-turbo', fee_limit=0.25, consumer_thread=4, asr_options=None, + chatbot_model: str = 'gpt-3.5-turbo', fee_limit=0.3, consumer_thread=4, asr_options=None, vad_options=None, preprocess_options=None, proxy=None, base_url_config=None, glossary: Union[dict, str, Path] = None, retry_model=None): self.chatbot_model = chatbot_model diff --git a/openlrc/prompter.py b/openlrc/prompter.py index 5db3747..fb2c4ae 100644 --- a/openlrc/prompter.py +++ b/openlrc/prompter.py @@ -191,8 +191,7 @@ def __init__(self, src_lang, target_lang): self.target_lang_display = Language.get(target_lang).display_name('en') def system(self): - return f'''Context: -You are a context reviewer responsible for ensuring the consistency and accuracy of translations between two languages. Your task involves reviewing and providing necessary contextual information for translations. + return f'''You are a context reviewer responsible for ensuring the consistency and accuracy of translations between two languages. Your task involves reviewing and providing necessary contextual information for translations. Objective: 1. Build a comprehensive glossary of key terms and phrases used in the {self.src_lang_display} to {self.target_lang_display} translations. The glossary should include technical terms, slang, and culturally specific references that need consistent translation or localization, focusing on terms that may cause confusion or inconsistency. @@ -202,7 +201,7 @@ def system(self): 5. Identify the target audience for the subtitles, considering factors such as age, cultural background, and language proficiency, and provide insights on how to tailor the subtitles accordingly. Style: -Formal and professional, with clear and precise language suitable for translation and localization contexts. +Formal and professional, with clear and precise language suitable for translation and localization contexts. Be concise and informative in your instructions. Tone: Informative and authoritative to ensure clarity and reliability in the instructions. @@ -211,8 +210,9 @@ def system(self): Translators, localization specialists, and proofreaders who need a detailed and consistent reference document for subtitling. Response Format: -The output should include the following sections: Glossary, Characters, Summary, Tone and Style, Target Audience. +The output should include the following sections: Glossary, Characters, Summary, Tone and Style, Target Audience. DO NOT include any other sections in the response. + Example Input: Please review the following text (title: The Detectors) and provide the necessary context for the translation from English to Chinese: John and Sarah discuss their plan to locate a suspect, deducing that he is likely in the uptown area. @@ -238,7 +238,16 @@ def system(self): ### Target Audience: The target audience is adult viewers with an interest in crime dramas. They are likely to be familiar with police procedurals and enjoy suspenseful storytelling. -''' + + +Note: +There was an issue with the previous translation. + +DO NOT add the translated sample text in the response. +DO NOT include any translation segment. +Sample Translation is NOT required for this task. +You should adhere to the same format as the previous response, add or delete section is not allowed. +Remember to include the glossary, characters, summary, tone and style, and target audience sections in your response.''' def user(self, text, title='', given_glossary: Optional[dict] = None): glossary_text = f'Given glossary: {given_glossary}' if given_glossary else '' @@ -293,8 +302,7 @@ def __init__(self): def system(self): return f'''Ignore all previous instructions. -You are a context validator, responsible for validating the context provided by the Context Reviewer. Your role is to validate if the context is good. -A good context should include a comprehensive glossary of key terms and phrases, character name translations, a concise story summary, tone and style guidelines, and target audience insights. +You are a context validator responsible for verifying the context provided by the context reviewers. Your duty is to initially confirm whether these contexts meet the most basic requirements. Only output True/False based on the provided context. # Example 1: @@ -330,20 +338,26 @@ def system(self): # Example 3: Input: -Key points for translation: +### Glossary: +- obedience: 服从 +- opinions: 意见 +- treasured: 珍贵的 + +### Characters: +- Mistress: 女主人,主导者 +- Listener: 听众 -1. The opening lines are a joke, likely setting a humorous tone for the video. -2. The main topic is about cable management in PC building. -3. There's a trend of moving cable connectors to the back of the motherboard to reduce clutter. -4. The speaker seems to approve of this trend. -5. The text mentions that not everyone likes this new trend. +### Summary: +In "Mistress and Listener," a powerful sorceress named Elara and a perceptive bard named Kael join forces to decipher a prophecy that threatens Elara's future, uncovering dark secrets and facing formidable adversaries along the way. Their journey transforms their lives, forging a deep bond and revealing the true extent of their powers. -When translating, maintain the casual, slightly humorous tone of the original text. Technical terms like "PC hardware," "gaming rigs," and "motherboard" should be translated using their standard Chinese equivalents. The joke at the beginning should be translated in a way that preserves the humor if possible, but cultural adaptation may be necessary. +### Tone and Style: +The tone of "Mistress and Listener" is dark and mysterious, filled with suspense. The style is richly descriptive and immersive, blending fantasy with deep character exploration. -Output: -False +### Target Audience: +The target audience is young adults and adults who enjoy dark fantasy, those who enjoy themes of hypnosis, submission. The content is explicitly sexual and intended for mature listeners only. -''' +Output: +True''' def user(self, context): return f'''Input:\n{context}\nOutput:''' diff --git a/openlrc/translate.py b/openlrc/translate.py index c57ad75..e8f0c93 100644 --- a/openlrc/translate.py +++ b/openlrc/translate.py @@ -29,7 +29,7 @@ def translate(self, texts: Union[str, List[str]], src_lang: str, target_lang: st class LLMTranslator(Translator): CHUNK_SIZE = 30 - def __init__(self, chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.25, chunk_size: int = CHUNK_SIZE, + def __init__(self, chatbot_model: str = 'gpt-3.5-turbo', fee_limit: float = 0.3, chunk_size: int = CHUNK_SIZE, intercept_line: Optional[int] = None, proxy: Optional[str] = None, base_url_config: Optional[dict] = None, retry_model: Optional[str] = None):