Merge pull request #11 from dudil/split-by-punctuation
New algorithm of splitting by punctuation. Thx @dudil.
peterk committed Aug 3, 2024
2 parents b54a6d2 + 61bf5d0 commit 631cac1
Showing 5 changed files with 121 additions and 32 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,16 @@
## [0.1.10] - 2024-08-03

### Added
- New method to split long srt lines taking punctuation into account.

### Acknowledgments
- Thanks to @dudil for the new splitting method.

## [0.1.9] - 2024-03-02

### Changed
- Fix fractional timestamps from Whisper output.

## [0.1.8] - 2023-12-08

### Added
7 changes: 5 additions & 2 deletions README.md
@@ -62,14 +62,17 @@ the most profound of our time.
```

### Algorithms
By default, this script uses greedy algorithm which splits the text at the rightmost possible space.
By default, this script uses the `greedy` algorithm, which splits the text at the rightmost possible space.

An alternative splitting algorithm can be used that will split longer lines at half instead of always trying to use maximum line length. This prevents producing lines with isolated word remainders.
An alternative splitting algorithm is `halving`, which splits longer lines more evenly instead of always trying to use the maximum line length. This prevents producing lines with isolated word remainders.

Another alternative is the `punctuation` algorithm that takes punctuation (commas, periods, etc.) into account.

```python

from srt_equalizer import srt_equalizer

# use "greedy", "halving" or "punctuation" for the method parameter
srt_equalizer.equalize_srt_file("test.srt", "shortened.srt", 42, method='halving')
```
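
As a quick illustration of the new method, here is a minimal sketch of what `punctuation` splitting produces for a single subtitle. It is based on the new test added in this commit and assumes `split_subtitle` can be imported from `srt_equalizer.srt_equalizer`:

```python
import datetime

import srt
from srt_equalizer.srt_equalizer import split_subtitle

# One long subtitle fragment that exceeds the 42-character limit.
sub = srt.Subtitle(
    index=1,
    start=datetime.timedelta(seconds=0),
    end=datetime.timedelta(seconds=1),
    content="A string with more than 40 characters! This should be split into several, smaller ones.",
)

# The "punctuation" method prefers to break at commas and sentence ends.
fragments = split_subtitle(sub, 42, method="punctuation")
print([f.content for f in fragments])
# ['A string with more than 40 characters!',
#  'This should be split into several,',
#  'smaller ones.']
```

The fragment boundaries above mirror the assertions in the new `test_split_subtitle_punctuation` test included in this commit.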

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "srt_equalizer"
version = "0.1.9"
version = "0.1.10"
description = "Transform subtitle line lengths, splitting into multiple subtitle fragments if necessary. "
authors = ["Peter Krantz"]
license = "MIT"
82 changes: 65 additions & 17 deletions src/srt_equalizer/srt_equalizer.py
@@ -1,3 +1,4 @@
import re
from datetime import timedelta
from typing import List

@@ -42,7 +43,7 @@ def split_subtitle(sub: srt.Subtitle, target_chars: int = 42, start_from_index:
        sub: A srt.Subtitle object.
        target_chars: The max number of characters for a subtitle line.
        start_from_index: The start index of the subtitle item.
        method: algorithm for splitting - either "greedy" or "halving".
        method: algorithm for splitting - either "greedy" (default), "halving" or "punctuation".
    Returns:
        An array of one or more subtitle items.
@@ -53,21 +54,13 @@ def split_subtitle(sub: srt.Subtitle, target_chars: int = 42, start_from_index:
        sub.index = start_from_index + 1
        return [sub]

if method == "greedy":
text_chunks = []
current_chunk = ""
words = sub.content.split()
for word in words:
if len(current_chunk) + len(word) + 1 > target_chars:
text_chunks.append(current_chunk.strip())
current_chunk = word + ' '
else:
current_chunk += word + ' '
if current_chunk:
text_chunks.append(current_chunk.strip())
else:
assert method == "halving"
elif method == "greedy":
text_chunks = split_greedy(sub.content, target_chars)
elif method == "halving":
text_chunks = split_at_half(sub.content, target_chars)
else:
assert method == "punctuation"
text_chunks = split_by_punctuation(sub.content, target_chars)

# Create a new subtitle item for each text chunk, proportional to its length.
split_subs = []
@@ -100,7 +93,7 @@ def split_subtitle(sub: srt.Subtitle, target_chars: int = 42, start_from_index:
def equalize_srt_file(srt_path: str, output_srt_path: str, target_chars: int, method='greedy'):
    """Load subs from an SRT file and output equalized subtitles to a new SRT file.
    """
    assert method in {'greedy', 'halving'}, method
    assert method in {'greedy', 'halving', 'punctuation'}, method
    subs = load_srt(srt_path)

    adjusted_subs = []
@@ -118,7 +111,28 @@ def equalize_srt_file(srt_path: str, output_srt_path: str, target_chars: int, me
    write_srt(filepath=output_srt_path, subs=adjusted_subs)


def split_greedy(sentence: str, target_chars: int) -> List[str]:
    """Split text into chunks by greedily filling each line up to target_chars characters.
    """

    text_chunks = []
    current_chunk = ''
    words = sentence.split()
    for word in words:
        if len(current_chunk) + len(word) + 1 > target_chars:
            text_chunks.append(current_chunk.strip())
            current_chunk = word + ' '
        else:
            current_chunk += word + ' '
    if current_chunk:
        text_chunks.append(current_chunk.strip())

    return text_chunks


def split_at_half(sentence, target_chars):
    """Try to split subtitles into lines of similar length by splitting at the space closest to the middle."""

    if len(sentence) <= target_chars or ' ' not in sentence:
        return [sentence]

@@ -139,4 +153,38 @@ def split_at_half(sentence, target_chars):
    # recursively call this function until the length is below the limit
    left = sentence[:closest_space_to_center]
    right = sentence[closest_space_to_center+1:]
    return split_at_half(left, target_chars) + split_at_half(right, target_chars)
    return split_at_half(left, target_chars) + split_at_half(right, target_chars)


def split_by_punctuation(sentence: str, target_chars: int) -> List[str]:
    """Split text into chunks of roughly target_chars length, preferring to break at punctuation."""

    if len(sentence) <= target_chars:
        return [sentence]

    # use regex to split the sentence by punctuation
    chunks = re.split(r'([.,!?])', sentence)
    normalized_chunks = []
    for chunk in chunks:
        # strip whitespace
        chunk = chunk.strip()

        # if this chunk is an empty one, skip it
        if not chunk:
            continue

        if len(chunk) > target_chars:
            normalized_chunks.extend(split_greedy(chunk, target_chars))
            continue

        if normalized_chunks:
            if chunk in '.,!?':
                # append the punctuation to the last chunk
                chunk = normalized_chunks.pop() + chunk
            elif len(chunk) + len(normalized_chunks[-1]) <= target_chars:
                # merge this chunk into the last one, since together they are still under the limit
                chunk = normalized_chunks.pop() + ' ' + chunk

        normalized_chunks.append(chunk)

    return normalized_chunks
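
To make the behaviour of the new helper concrete, here is a small standalone sketch. It assumes `split_by_punctuation` can be imported directly from `srt_equalizer.srt_equalizer`; the expected lists mirror the new test further down.

```python
import re

from srt_equalizer.srt_equalizer import split_by_punctuation

text = "A string with more than 40 characters! This should be split into several, smaller ones."

# The capturing group in the pattern keeps each punctuation mark as its own item.
print(re.split(r'([.,!?])', text))
# ['A string with more than 40 characters', '!',
#  ' This should be split into several', ',', ' smaller ones', '.', '']

# The merging loop then glues punctuation back onto the preceding chunk and
# joins neighbouring chunks only while they stay within the character limit.
print(split_by_punctuation(text, 42))
# ['A string with more than 40 characters!',
#  'This should be split into several,',
#  'smaller ones.']
```

Falling back to `split_greedy` for any over-long chunk keeps fragments within the target length even when a clause contains no punctuation.
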
49 changes: 37 additions & 12 deletions tests/test_srt_equalizer.py
@@ -20,10 +20,12 @@ def test_load_srt_file_not_found():

def test_split_subtitle():
    """Test split subtitle."""
    sub = srt.Subtitle(index=1,
                       start=datetime.timedelta(seconds=0, milliseconds=0),
                       end=datetime.timedelta(seconds=1, milliseconds=0),
                       content="A string with more than 40 characters that should be split into several smaller ones.")
    sub = srt.Subtitle(
        index=1,
        start=datetime.timedelta(seconds=0, milliseconds=0),
        end=datetime.timedelta(seconds=1, milliseconds=0),
        content="A string with more than 40 characters that should be split into several smaller ones.",
    )
    s = split_subtitle(sub, 42)

    # check that the line is split after "characters"
@@ -41,13 +43,15 @@ def test_split_subtitle():

def test_split_subtitle_halving():
    """Test split subtitle."""
    sub = srt.Subtitle(index=1,
                       start=datetime.timedelta(seconds=0, milliseconds=0),
                       end=datetime.timedelta(seconds=1, milliseconds=0),
                       content="A string with more than 40 characters that should be split into several smaller ones.")
    s = split_subtitle(sub, 42, method='halving')

    reconstructed = ' '.join([x.content for x in s])
    sub = srt.Subtitle(
        index=1,
        start=datetime.timedelta(seconds=0, milliseconds=0),
        end=datetime.timedelta(seconds=1, milliseconds=0),
        content="A string with more than 40 characters that should be split into several smaller ones.",
    )
    s = split_subtitle(sub, 42, method="halving")

    reconstructed = " ".join([x.content for x in s])
    assert sub.content == reconstructed

    assert s[0].content == "A string with more than 40 characters that"
@@ -63,7 +67,7 @@ def test_whisper_result_to_srt():
    # Load example whisper result from pickle
    whisper_result = dict()

    with open("tests/whisper_result_example.pkl", 'rb') as file:
    with open("tests/whisper_result_example.pkl", "rb") as file:
        whisper_result = pickle.load(file)

    # check that fractional seconds are converted correctly
@@ -72,3 +76,24 @@

    assert subs[0].start == datetime.timedelta(microseconds=123000)
    assert subs[0].end == datetime.timedelta(seconds=10, microseconds=789000)


def test_split_subtitle_punctuation():
    """Test split subtitle."""
    sub = srt.Subtitle(
        index=1,
        start=datetime.timedelta(seconds=0, milliseconds=0),
        end=datetime.timedelta(seconds=1, milliseconds=0),
        content="A string with more than 40 characters! This should be split into several, smaller ones.",
    )
    s = split_subtitle(sub, 42, method="punctuation")

    reconstructed = " ".join([x.content for x in s])
    assert sub.content == reconstructed

    assert s[0].content == "A string with more than 40 characters!"
    assert s[1].content == "This should be split into several,"
    assert s[2].content == "smaller ones."

    # check fragment timing
    assert s[2].end == sub.end
