diff --git a/CHANGELOG.md b/CHANGELOG.md
index 434d4ca..14f3dd9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,16 @@
+## [0.1.10] - 2024-08-03
+
+### Added
+- New method to split long srt lines taking punctuation into account.
+
+### Acknowledgments
+- Thanks to @dudil for the new splitting method.
+
+## [0.1.9] - 2024-03-02
+
+### Changed
+- Fix fractional timestamps from Whisper output.
+
 ## [0.1.8] - 2023-12-08
 
 ### Added
diff --git a/README.md b/README.md
index e49f558..cbe8181 100644
--- a/README.md
+++ b/README.md
@@ -62,14 +62,17 @@ the most profound of our time.
 ```
 
 ### Algorithms
 
-By default, this script uses greedy algorithm which splits the text at the rightmost possible space.
+By default, this script uses `greedy` algorithm which splits the text at the rightmost possible space.
 
-An alternative splitting algorithm can be used that will split longer lines at half instead of always trying to use maximum line length. This prevents producing lines with isolated word remainders.
+An alternative splitting algorithm is `halving` which will split longer lines more evenly instead of always trying to use maximum line length. This prevents producing lines with isolated word remainders.
+
+Another alternative is the `punctuation` algorithm that takes punctuation (commas, periods, etc.) into account.
 
 ```python
 from srt_equalizer import srt_equalizer
 
+# use "greedy", "halving" or "punctuation" for the method parameter
 srt_equalizer.equalize_srt_file("test.srt", "shortened.srt", 42, method='halving')
 ```
 
diff --git a/pyproject.toml b/pyproject.toml
index 1ef2fa7..7bec58b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "srt_equalizer"
-version = "0.1.9"
+version = "0.1.10"
 description = "Transform subtitle line lengths, splitting into multiple subtitle fragments if necessary."
 authors = ["Peter Krantz"]
 license = "MIT"
diff --git a/src/srt_equalizer/srt_equalizer.py b/src/srt_equalizer/srt_equalizer.py
index c79da58..c967003 100644
--- a/src/srt_equalizer/srt_equalizer.py
+++ b/src/srt_equalizer/srt_equalizer.py
@@ -1,3 +1,4 @@
+import re
 from datetime import timedelta
 from typing import List
 
@@ -42,7 +43,7 @@ def split_subtitle(sub: srt.Subtitle, target_chars: int = 42, start_from_index:
         sub: A srt.Subtitle object.
         target_chars: The max number of characters for a subtitle line.
         start_from_index: The start index of the subtitle item.
-        method: algorithm for splitting - either "greedy" or "halving".
+        method: algorithm for splitting - either "greedy" (default), "halving" or "punctuation".
 
     Returns:
         An array of one or more subtitle items.
@@ -53,21 +54,13 @@
         sub.index = start_from_index + 1
         return [sub]
 
-    if method == "greedy":
-        text_chunks = []
-        current_chunk = ""
-        words = sub.content.split()
-        for word in words:
-            if len(current_chunk) + len(word) + 1 > target_chars:
-                text_chunks.append(current_chunk.strip())
-                current_chunk = word + ' '
-            else:
-                current_chunk += word + ' '
-        if current_chunk:
-            text_chunks.append(current_chunk.strip())
-    else:
-        assert method == "halving"
+    elif method == "greedy":
+        text_chunks = split_greedy(sub.content, target_chars)
+    elif method == "halving":
         text_chunks = split_at_half(sub.content, target_chars)
+    else:
+        assert method == "punctuation"
+        text_chunks = split_by_punctuation(sub.content, target_chars)
 
     # Create a new subtitle item for each text chunk, proportional to its length.
     split_subs = []
@@ -100,7 +93,7 @@ def split_subtitle(sub: srt.Subtitle, target_chars: int = 42, start_from_index:
 def equalize_srt_file(srt_path: str, output_srt_path: str, target_chars: int, method='greedy'):
     """Load subs from an SRT file and output equalized subtitles to a new SRT file.
     """
-    assert method in {'greedy', 'halving'}, method
+    assert method in {'greedy', 'halving', 'punctuation'}, method
 
     subs = load_srt(srt_path)
     adjusted_subs = []
@@ -118,7 +111,28 @@ def equalize_srt_file(srt_path: str, output_srt_path: str, target_chars: int, me
     write_srt(filepath=output_srt_path, subs=adjusted_subs)
 
 
+def split_greedy(sentance: str, target_chars: int) -> List[str]:
+    """Split subtitles into chunks of target_chars length as soon as possible.
+    """
+
+    text_chunks = []
+    current_chunk = ''
+    words = sentance.split()
+    for word in words:
+        if len(current_chunk) + len(word) + 1 > target_chars:
+            text_chunks.append(current_chunk.strip())
+            current_chunk = word + ' '
+        else:
+            current_chunk += word + ' '
+    if current_chunk:
+        text_chunks.append(current_chunk.strip())
+
+    return text_chunks
+
+
 def split_at_half(sentence, target_chars):
+    """Try to split subtitles into similar line lengths taking commas into account."""
+
     if len(sentence) <= target_chars or ' ' not in sentence:
         return [sentence]
 
@@ -139,4 +153,38 @@ def split_at_half(sentence, target_chars):
     # recursively call this function until the length is bellow limit
     left = sentence[:closest_space_to_center]
     right = sentence[closest_space_to_center+1:]
-    return split_at_half(left, target_chars) + split_at_half(right, target_chars)
\ No newline at end of file
+    return split_at_half(left, target_chars) + split_at_half(right, target_chars)
+
+
+def split_by_punctuation(sentance: str, target_chars: int) -> List[str]:
+    """Split subtitles into chunks of target_chars length by punctuation."""
+
+    if len(sentance) <= target_chars:
+        return [sentance]
+
+    # use regex to split the sentence by punctuation
+    chunks = re.split(r'([.,!?])', sentance)
+    normalized_chunks = []
+    for chunk in chunks:
+        # strip whitespace
+        chunk = chunk.strip()
+
+        # if this chunk is an empty one, skip it
+        if not chunk:
+            continue
+
+        if len(chunk) > target_chars:
+            normalized_chunks.extend(split_greedy(chunk, target_chars))
+            continue
+
+        if normalized_chunks:
+            if chunk in '.,!?':
+                # add punctuation to the last chunk
+                chunk = normalized_chunks.pop() + chunk
+            elif len(chunk) + len(normalized_chunks[-1]) <= target_chars:
+                # add this chunk to the last one since they are still under the allowed limit
+                chunk = normalized_chunks.pop() + ' ' + chunk
+
+        normalized_chunks.append(chunk)
+
+    return normalized_chunks
\ No newline at end of file
diff --git a/tests/test_srt_equalizer.py b/tests/test_srt_equalizer.py
index 12c9b65..9679e6a 100644
--- a/tests/test_srt_equalizer.py
+++ b/tests/test_srt_equalizer.py
@@ -20,10 +20,12 @@ def test_load_srt_file_not_found():
 
 def test_split_subtitle():
     """Test split subtitle."""
-    sub = srt.Subtitle(index=1,
-                       start=datetime.timedelta(seconds=0, milliseconds=0),
-                       end=datetime.timedelta(seconds=1, milliseconds=0),
-                       content="A string with more than 40 characters that should be split into several smaller ones.")
+    sub = srt.Subtitle(
+        index=1,
+        start=datetime.timedelta(seconds=0, milliseconds=0),
+        end=datetime.timedelta(seconds=1, milliseconds=0),
+        content="A string with more than 40 characters that should be split into several smaller ones.",
+    )
     s = split_subtitle(sub, 42)
 
     # check that the line is split after "characters"
@@ -41,13 +43,15 @@
 
 def test_split_subtitle_halving():
     """Test split subtitle."""
-    sub = srt.Subtitle(index=1,
-                       start=datetime.timedelta(seconds=0, milliseconds=0),
-                       end=datetime.timedelta(seconds=1, milliseconds=0),
-                       content="A string with more than 40 characters that should be split into several smaller ones.")
-    s = split_subtitle(sub, 42, method='halving')
-
-    reconstructed = ' '.join([x.content for x in s])
+    sub = srt.Subtitle(
+        index=1,
+        start=datetime.timedelta(seconds=0, milliseconds=0),
+        end=datetime.timedelta(seconds=1, milliseconds=0),
+        content="A string with more than 40 characters that should be split into several smaller ones.",
+    )
+    s = split_subtitle(sub, 42, method="halving")
+
+    reconstructed = " ".join([x.content for x in s])
     assert sub.content == reconstructed
 
     assert s[0].content == "A string with more than 40 characters that"
@@ -63,7 +67,7 @@ def test_whisper_result_to_srt():
 
     # Load example whipser result from pickle
     whisper_result = dict()
-    with open("tests/whisper_result_example.pkl", 'rb') as file:
+    with open("tests/whisper_result_example.pkl", "rb") as file:
         whisper_result = pickle.load(file)
 
     # check that fractional seconds are converted correctly
@@ -72,3 +76,24 @@ def test_whisper_result_to_srt():
 
     assert subs[0].start == datetime.timedelta(microseconds=123000)
     assert subs[0].end == datetime.timedelta(seconds=10, microseconds=789000)
+
+
+def test_split_subtitle_punctuation():
+    """Test split subtitle."""
+    sub = srt.Subtitle(
+        index=1,
+        start=datetime.timedelta(seconds=0, milliseconds=0),
+        end=datetime.timedelta(seconds=1, milliseconds=0),
+        content="A string with more than 40 characters! This should be split into several, smaller ones.",
+    )
+    s = split_subtitle(sub, 42, method="punctuation")
+
+    reconstructed = " ".join([x.content for x in s])
+    assert sub.content == reconstructed
+
+    assert s[0].content == "A string with more than 40 characters!"
+    assert s[1].content == "This should be split into several,"
+    assert s[2].content == "smaller ones."
+
+    # check fragment timing
+    assert s[2].end == sub.end