Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the reading of epub to use the semantic structure of the epub file and not only as chunks of text #88

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions audiobook_generator/book_parsers/ast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from dataclasses import dataclass
from typing import List


@dataclass
class Item:
    """Base node of the book AST.

    Both accessors are placeholders that raise ``NotImplementedError``;
    concrete node types are expected to override them.
    """

    @property
    def text(self):
        """Plain-text content of the item (must be overridden)."""
        raise NotImplementedError()

    def char_count(self):
        """Return the number of characters in the item (must be overridden)."""
        raise NotImplementedError()


@dataclass
class Items:
    """Ordered container of child nodes.

    Each child only needs to expose a ``text`` attribute/property and a
    ``char_count()`` method; the container aggregates both over its children.
    """

    # Child nodes, in reading order.
    items: List["Item"]

    @property
    def text(self):
        """Concatenation of every child's ``text``, with no separator."""
        return "".join(item.text for item in self.items)

    def char_count(self):
        """Return the sum of ``char_count()`` over all children (0 if empty)."""
        return sum(item.char_count() for item in self.items)


@dataclass
class Chapter(Items):
    """A titled container of items.

    Adds a ``title`` field after the inherited ``items`` field, so positional
    construction is ``Chapter(items, title)``.
    """

    # Title associated with this group of items.
    title: str


@dataclass
class Text(Item):
    """Leaf item that wraps a single string."""

    # The wrapped string; exposed read-only through the ``text`` property.
    _text: str

    @property
    def text(self):
        """Return the wrapped string unchanged."""
        return self._text

    def char_count(self):
        """Return the length of ``text``."""
        content = self.text
        return len(content)


@dataclass
class Quote(Items):
    """Container of items with no fields or behaviour beyond ``Items``.

    It exists as a distinct type so callers can tell it apart with
    ``isinstance``.
    """


@dataclass
class Break(Item):
    """Marker item that carries no text of its own."""

    @property
    def text(self):
        """Always the empty string."""
        return ""

    def char_count(self):
        """Always zero: a break contributes no characters."""
        return 0
3 changes: 2 additions & 1 deletion audiobook_generator/book_parsers/base_book_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List, Tuple

from audiobook_generator.config.general_config import GeneralConfig
from audiobook_generator.book_parsers import ast

EPUB = "epub"

Expand All @@ -26,7 +27,7 @@ def get_book_title(self) -> str:
def get_book_author(self) -> str:
raise NotImplementedError

def get_chapters(self, break_string) -> List[Tuple[str, str]]:
def get_chapters(self, break_string) -> List[ast.Chapter]:
raise NotImplementedError


Expand Down
94 changes: 62 additions & 32 deletions audiobook_generator/book_parsers/epub_book_parser.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,72 @@
import logging
import re
from typing import List, Tuple
from typing import List

import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub

from audiobook_generator.book_parsers.base_book_parser import BaseBookParser
from audiobook_generator.config.general_config import GeneralConfig
from audiobook_generator.book_parsers import ast

logger = logging.getLogger(__name__)


def _split_by_break(item: ast.Text) -> List[ast.Item]:
    """Split a text item on newlines into Text items separated by Breaks.

    Empty segments are discarded. Every kept segment becomes an ``ast.Text``
    with one trailing space appended, and an ``ast.Break`` is placed between
    consecutive kept segments (never first or last). Text with no non-empty
    segment yields an empty list.
    """
    # NOTE(review): a newline at the very start or end of the text, or a
    # newline-only text, produces no Break here - confirm this is intended.
    pieces = []
    for segment in item.text.split("\n"):
        if not segment:
            continue
        if pieces:
            # Separator goes only between two kept segments.
            pieces.append(ast.Break())
        pieces.append(ast.Text(segment + " "))
    return pieces


def _merge_item(item: ast.Item, new_items: List[ast.Item]):
    """Append ``item`` to ``new_items`` in place, coalescing with the last entry.

    * A Break is dropped when the last entry is already a Break.
    * A Text is concatenated onto the last entry when that entry is a Text
      (the existing entry is mutated).
    * An Items container has its children normalised recursively, then is
      appended.
    * Anything else is appended unchanged.
    """
    previous = new_items[-1] if new_items else None

    if isinstance(item, ast.Break):
        # Collapse runs of breaks into a single one.
        if not isinstance(previous, ast.Break):
            new_items.append(item)
        return

    if isinstance(item, ast.Text):
        if isinstance(previous, ast.Text):
            # Extend the previous text instead of adding a new node.
            previous._text += item.text
        else:
            new_items.append(item)
        return

    if isinstance(item, ast.Items):
        # Normalise the nested children before storing the container.
        item.items = _split_and_merge(item.items)

    new_items.append(item)


def _split_and_merge(items: List[ast.Item]) -> List[ast.Item]:
    """Normalise a list of items in two passes and return a new list.

    Pass 1 replaces every Text with the result of ``_split_by_break``; other
    items are kept as they are. Pass 2 feeds the expanded sequence through
    ``_merge_item`` so adjacent Texts and adjacent Breaks are coalesced.
    """
    # Pass 1: expand text items at their newlines.
    expanded = []
    for node in items:
        if isinstance(node, ast.Text):
            expanded.extend(_split_by_break(node))
        else:
            expanded.append(node)

    # Pass 2: coalesce neighbours.
    merged = []
    for node in expanded:
        _merge_item(node, merged)

    return merged


def _parse(soup: BeautifulSoup) -> List[ast.Item]:
    """Flatten a parsed document or tag into a list of AST items.

    Children are visited in order:

    * string nodes become ``ast.Text``, except markup-only strings (comments,
      doctypes, declarations, processing instructions), which are skipped;
    * a ``blockquote`` element becomes an ``ast.Quote`` holding its own
      recursively parsed children;
    * any other element is transparent: its parsed children are spliced
      directly into the result.

    :param soup: the document or tag whose children are walked.
    :return: the flat list of items, in document order.
    """
    # Function-scope import keeps this fix self-contained; bs4 is already a
    # dependency of this module.
    from bs4.element import CData, PreformattedString

    items = []
    for node in soup:
        if isinstance(node, str):
            # Comment, Doctype, Declaration and ProcessingInstruction are str
            # subclasses as well, but they hold markup rather than book text
            # and must not end up in the narrated output. CData is real text.
            if isinstance(node, PreformattedString) and not isinstance(node, CData):
                continue
            items.append(ast.Text(_text=node))
        elif node.name == "blockquote":
            items.append(ast.Quote(items=_parse(node)))
        else:
            items += _parse(node)
    return items


class EpubBookParser(BaseBookParser):
def __init__(self, config: GeneralConfig):
super().__init__(config)
Expand Down Expand Up @@ -39,38 +94,13 @@ def get_book_author(self) -> str:
return self.book.get_metadata("DC", "creator")[0][0]
return "Unknown"

def get_chapters(self, break_string) -> List[Tuple[str, str]]:
def get_chapters(self, break_string) -> List[ast.Chapter]:
chapters = []
search_and_replaces = self.get_search_and_replaces()
search_and_replaces = []#self.get_search_and_replaces()
for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
content = item.get_content()
soup = BeautifulSoup(content, "lxml-xml")
raw = soup.get_text(strip=False)
logger.debug(f"Raw text: <{raw[:]}>")

# Replace excessive whitespaces and newline characters based on the mode
if self.config.newline_mode == "single":
cleaned_text = re.sub(r"[\n]+", break_string, raw.strip())
elif self.config.newline_mode == "double":
cleaned_text = re.sub(r"[\n]{2,}", break_string, raw.strip())
elif self.config.newline_mode == "none":
cleaned_text = re.sub(r"[\n]+", " ", raw.strip())
else:
raise ValueError(f"Invalid newline mode: {self.config.newline_mode}")

logger.debug(f"Cleaned text step 1: <{cleaned_text[:]}>")
cleaned_text = re.sub(r"\s+", " ", cleaned_text)
logger.debug(f"Cleaned text step 2: <{cleaned_text[:100]}>")

# Removes end-note numbers
if self.config.remove_endnotes:
cleaned_text = re.sub(r'(?<=[a-zA-Z.,!?;”")])\d+', "", cleaned_text)
logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")

# Does user defined search and replaces
for search_and_replace in search_and_replaces:
cleaned_text = re.sub(search_and_replace['search'], search_and_replace['replace'], cleaned_text)
logger.debug(f"Cleaned text step 5: <{cleaned_text[:100]}>")
raw = _split_and_merge(_parse(soup))

# Get proper chapter title
if self.config.title_mode == "auto":
Expand All @@ -81,7 +111,7 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
title = soup.find(level).text
break
if title == "" or re.match(r'^\d{1,3}$',title) is not None:
title = cleaned_text[:60]
title = soup.get_text(strip=True)[:60]
elif self.config.title_mode == "tag_text":
title = ""
title_levels = ['title', 'h1', 'h2', 'h3']
Expand All @@ -92,14 +122,14 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
if title == "":
title = "<blank>"
elif self.config.title_mode == "first_few":
title = cleaned_text[:60]
title = soup.get_text(strip=True)[:60]
else:
raise ValueError("Unsupported title_mode")
logger.debug(f"Raw title: <{title}>")
title = self._sanitize_title(title, break_string)
logger.debug(f"Sanitized title: <{title}>")

chapters.append((title, cleaned_text))
chapters.append(ast.Chapter(title=title, items=raw))
soup.decompose()
return chapters

Expand Down
28 changes: 14 additions & 14 deletions audiobook_generator/core/audiobook_generator.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import logging
from typing import List, Tuple
import os

from audiobook_generator.book_parsers.base_book_parser import get_book_parser
from audiobook_generator.config.general_config import GeneralConfig
from audiobook_generator.core.audio_tags import AudioTags
from audiobook_generator.tts_providers.base_tts_provider import get_tts_provider
from audiobook_generator.book_parsers import ast

logger = logging.getLogger(__name__)

Expand All @@ -17,11 +19,9 @@ def confirm_conversion():
exit(0)


def get_total_chars(chapters):
total_characters = 0
for title, text in chapters:
total_characters += len(text)
return total_characters

def get_total_chars(chapters: List[ast.Chapter]):
    """Return the combined character count of all chapters.

    Calls ``char_count()`` on every element and sums the results; an empty
    list gives 0.
    """
    return sum(chapter.char_count() for chapter in chapters)


class AudiobookGenerator:
Expand All @@ -39,7 +39,7 @@ def run(self):
os.makedirs(self.config.output_folder, exist_ok=True)
chapters = book_parser.get_chapters(tts_provider.get_break_string())
# Filter out empty or very short chapters
chapters = [(title, text) for title, text in chapters if text.strip()]
chapters = list(filter(lambda x: x.char_count() > 0, chapters))

logger.info(f"Chapters count: {len(chapters)}.")

Expand Down Expand Up @@ -76,34 +76,34 @@ def run(self):
confirm_conversion()

# Loop through each chapter and convert it to speech using the provided TTS provider
for idx, (title, text) in enumerate(chapters, start=1):
for idx, chapter in enumerate(chapters, start=1):
if idx < self.config.chapter_start:
continue
if idx > self.config.chapter_end:
break
logger.info(
f"Converting chapter {idx}/{len(chapters)}: {title}, characters: {len(text)}"
f"Converting chapter {idx}/{len(chapters)}: {chapter.title}, characters: {chapter.char_count()}"
)

if self.config.output_text:
text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{title}.txt")
text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{chapter.title}.txt")
with open(text_file, "w", encoding='utf-8') as file:
file.write(text)
file.write(chapter.text)

if self.config.preview:
continue

output_file = os.path.join(self.config.output_folder,
f"{idx:04d}_{title}.{tts_provider.get_output_file_extension()}")
f"{idx:04d}_{chapter.title}.{tts_provider.get_output_file_extension()}")

audio_tags = AudioTags(title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
audio_tags = AudioTags(chapter.title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
tts_provider.text_to_speech(
text,
chapter,
output_file,
audio_tags,
)
logger.info(
f"✅ Converted chapter {idx}/{len(chapters)}: {title}"
f"✅ Converted chapter {idx}/{len(chapters)}: {chapter.title}"
)
logger.info(f"All chapters converted. 🎉🎉🎉")

Expand Down
5 changes: 3 additions & 2 deletions audiobook_generator/tts_providers/azure_tts_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from audiobook_generator.config.general_config import GeneralConfig
from audiobook_generator.core.utils import split_text, set_audio_tags
from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider
from audiobook_generator.book_parsers import ast

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -84,14 +85,14 @@ def get_access_token(self) -> str:

def text_to_speech(
self,
text: str,
chapter: ast.Chapter,
output_file: str,
audio_tags: AudioTags,
):
# Adjust this value based on your testing
max_chars = 1800 if self.config.language.startswith("zh") else 3000

text_chunks = split_text(text, max_chars, self.config.language)
text_chunks = split_text(chapter.text, max_chars, self.config.language)

audio_segments = []

Expand Down
3 changes: 2 additions & 1 deletion audiobook_generator/tts_providers/base_tts_provider.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List

from audiobook_generator.config.general_config import GeneralConfig
from audiobook_generator.book_parsers import ast

TTS_AZURE = "azure"
TTS_OPENAI = "openai"
Expand All @@ -20,7 +21,7 @@ def __str__(self) -> str:
def validate_config(self):
raise NotImplementedError

def text_to_speech(self, *args, **kwargs):
def text_to_speech(self, chapter: ast.Chapter, *args, **kwargs):
raise NotImplementedError

def estimate_cost(self, total_chars):
Expand Down
Loading