p0n1 · hlyl · Sep 8, 2024 · Sep 9, 2024
diff --git a/audiobook_generator/book_parsers/ast.py b/audiobook_generator/book_parsers/ast.py
@@ -0,0 +1,84 @@
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass
+class Item:
+    """Base node of the parsed book tree.
+
+    Both members raise ``NotImplementedError`` here; ``Text`` and ``Break``
+    provide the concrete versions.
+    """
+
+    @property
+    def text(self):
+        """Plain text of the node (not implemented on the base class)."""
+        raise NotImplementedError
+
+    def char_count(self):
+        """Returns the number of characters in the item"""
+        raise NotImplementedError
+
+
+@dataclass
+class Items:
+    """Container node holding a list of child nodes."""
+
+    # Child nodes, kept in list order.
+    items: List["Item"]
+
+    @property
+    def text(self):
+        """Texts of all children joined without a separator."""
+        pieces = [child.text for child in self.items]
+        return "".join(pieces)
+
+    def char_count(self):
+        """Returns the summed character count of all children"""
+        total = 0
+        for child in self.items:
+            total += child.char_count()
+        return total
+
+
+@dataclass
+class Chapter(Items):
+    """Container node that additionally carries a title."""
+
+    title: str
+
+
+@dataclass
+class Text(Item):
+    """Leaf node wrapping a string."""
+
+    # Stored string, exposed through the ``text`` property.
+    _text: str
+
+    @property
+    def text(self):
+        """The wrapped string."""
+        return self._text
+
+    def char_count(self):
+        """Returns the length of ``text``"""
+        return len(self.text)
+
+
+@dataclass
+class Quote(Items):
+    """Container node with the same fields as ``Items``."""
+
+
+@dataclass
+class Break(Item):
+    """Leaf node that contributes no text and no characters."""
+
+    @property
+    def text(self):
+        """Always the empty string."""
+        return ""
+
+    def char_count(self):
+        """Always zero."""
+        return 0
diff --git a/audiobook_generator/book_parsers/base_book_parser.py b/audiobook_generator/book_parsers/base_book_parser.py
@@ -1,6 +1,7 @@
 from typing import List, Tuple
 
 from audiobook_generator.config.general_config import GeneralConfig
+from audiobook_generator.book_parsers import ast
 
 EPUB = "epub"
 
@@ -26,7 +27,7 @@ def get_book_title(self) -> str:
     def get_book_author(self) -> str:
         raise NotImplementedError
 
-    def get_chapters(self, break_string) -> List[Tuple[str, str]]:
+    def get_chapters(self, break_string) -> List[ast.Chapter]:
         raise NotImplementedError
 
 

diff --git a/audiobook_generator/book_parsers/epub_book_parser.py b/audiobook_generator/book_parsers/epub_book_parser.py
@@ -1,17 +1,105 @@
 import logging
 import re
-from typing import List, Tuple
+from typing import List
 
 import ebooklib
 from bs4 import BeautifulSoup
+from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction
 from ebooklib import epub
 
 from audiobook_generator.book_parsers.base_book_parser import BaseBookParser
 from audiobook_generator.config.general_config import GeneralConfig
+from audiobook_generator.book_parsers import ast
 
 logger = logging.getLogger(__name__)
 
 
+# String node types that hold markup metadata rather than readable prose.
+_NON_PROSE_STRINGS = (Comment, Declaration, Doctype, ProcessingInstruction)
+
+
+def _split_by_break(item: ast.Text) -> List[ast.Item]:
+    """Split a text node on newlines into text pieces separated by breaks.
+
+    Every newline produces an ``ast.Break``, including leading or trailing
+    newlines and newline-only text, so a line break that sits at the edge of
+    a text node is not silently dropped.  Empty pieces are skipped, and each
+    kept piece gets a trailing space so merged neighbours do not run together.
+    """
+    pieces: List[ast.Item] = []
+    for index, part in enumerate(item.text.split("\n")):
+        if index:
+            # One break per newline; _merge_item collapses repeated breaks.
+            pieces.append(ast.Break())
+        if part:
+            pieces.append(ast.Text(part + " "))
+    return pieces
+
+
+def _merge_item(item: ast.Item, new_items: List[ast.Item]):
+    """Append ``item`` to ``new_items`` in place, coalescing with its tail.
+
+    A break is dropped when the list already ends with a break, a text node
+    is folded into a directly preceding text node, and a container has its
+    children normalised recursively before it is appended.
+    """
+    last = new_items[-1] if new_items else None
+    if isinstance(item, ast.Break):
+        if isinstance(last, ast.Break):
+            return
+    elif isinstance(item, ast.Text):
+        if isinstance(last, ast.Text):
+            last._text += item.text
+            return
+    elif isinstance(item, ast.Items):
+        item.items = _split_and_merge(item.items)
+    new_items.append(item)
+
+
+def _split_and_merge(items: List[ast.Item]) -> List[ast.Item]:
+    """Return a normalised copy of ``items``.
+
+    Text nodes are first split on newlines, then adjacent text nodes are
+    merged and consecutive breaks collapsed.  Nested containers are
+    normalised in place by ``_merge_item``.
+    """
+    # Pass 1: expand every text node into text/break pieces.
+    expanded: List[ast.Item] = []
+    for item in items:
+        if isinstance(item, ast.Text):
+            expanded.extend(_split_by_break(item))
+        else:
+            expanded.append(item)
+
+    # Pass 2: merge neighbouring text and drop repeated breaks.
+    merged: List[ast.Item] = []
+    for item in expanded:
+        _merge_item(item, merged)
+    return merged
+
+
+def _parse(soup: BeautifulSoup) -> List[ast.Item]:
+    """Convert the children of a BeautifulSoup node into AST items.
+
+    Strings become ``ast.Text``, ``<blockquote>`` elements become
+    ``ast.Quote`` with recursively parsed children, and every other element
+    is flattened into the surrounding list.  Comments, doctypes, declarations
+    and processing instructions are also string nodes in BeautifulSoup but
+    are skipped so that markup metadata is not treated as prose.
+    """
+    items: List[ast.Item] = []
+    for node in soup:
+        if isinstance(node, _NON_PROSE_STRINGS):
+            continue
+        if isinstance(node, str):
+            items.append(ast.Text(_text=node))
+        elif node.name == "blockquote":
+            items.append(ast.Quote(items=_parse(node)))
+        else:
+            items.extend(_parse(node))
+    return items
+
+
 class EpubBookParser(BaseBookParser):
     def __init__(self, config: GeneralConfig):
         super().__init__(config)
@@ -39,38 +94,13 @@ def get_book_author(self) -> str:
             return self.book.get_metadata("DC", "creator")[0][0]
         return "Unknown"
 
-    def get_chapters(self, break_string) -> List[Tuple[str, str]]:
+    def get_chapters(self, break_string) -> List[ast.Chapter]:
         chapters = []
-        search_and_replaces = self.get_search_and_replaces()
+        search_and_replaces = []#self.get_search_and_replaces()
         for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
             content = item.get_content()
             soup = BeautifulSoup(content, "lxml-xml")
-            raw = soup.get_text(strip=False)
-            logger.debug(f"Raw text: <{raw[:]}>")
-
-            # Replace excessive whitespaces and newline characters based on the mode
-            if self.config.newline_mode == "single":
-                cleaned_text = re.sub(r"[\n]+", break_string, raw.strip())
-            elif self.config.newline_mode == "double":
-                cleaned_text = re.sub(r"[\n]{2,}", break_string, raw.strip())
-            elif self.config.newline_mode == "none":
-                cleaned_text = re.sub(r"[\n]+", " ", raw.strip())
-            else:
-                raise ValueError(f"Invalid newline mode: {self.config.newline_mode}")
-
-            logger.debug(f"Cleaned text step 1: <{cleaned_text[:]}>")
-            cleaned_text = re.sub(r"\s+", " ", cleaned_text)
-            logger.debug(f"Cleaned text step 2: <{cleaned_text[:100]}>")
-
-            # Removes end-note numbers
-            if self.config.remove_endnotes:
-                cleaned_text = re.sub(r'(?<=[a-zA-Z.,!?;”")])\d+', "", cleaned_text)
-                logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")
-
-            # Does user defined search and replaces
-            for search_and_replace in search_and_replaces:
-                cleaned_text = re.sub(search_and_replace['search'], search_and_replace['replace'], cleaned_text)
-            logger.debug(f"Cleaned text step 5: <{cleaned_text[:100]}>")
+            raw = _split_and_merge(_parse(soup))
 
             # Get proper chapter title
             if self.config.title_mode == "auto":
@@ -81,7 +111,7 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
                         title = soup.find(level).text
                         break
                 if title == "" or re.match(r'^\d{1,3}$',title) is not None:
-                    title = cleaned_text[:60]
+                    title = soup.get_text(strip=True)[:60]
             elif self.config.title_mode == "tag_text":
                 title = ""
                 title_levels = ['title', 'h1', 'h2', 'h3']
@@ -92,14 +122,14 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
                 if title == "":
                     title = "<blank>"
             elif self.config.title_mode == "first_few":
-                title = cleaned_text[:60]
+                title = soup.get_text(strip=True)[:60]
             else:
                 raise ValueError("Unsupported title_mode")
             logger.debug(f"Raw title: <{title}>")
             title = self._sanitize_title(title, break_string)
             logger.debug(f"Sanitized title: <{title}>")
 
-            chapters.append((title, cleaned_text))
+            chapters.append(ast.Chapter(title=title, items=raw))
             soup.decompose()
         return chapters
 

diff --git a/audiobook_generator/core/audiobook_generator.py b/audiobook_generator/core/audiobook_generator.py
@@ -1,10 +1,12 @@
 import logging
+from typing import List, Tuple
 import os
 
 from audiobook_generator.book_parsers.base_book_parser import get_book_parser
 from audiobook_generator.config.general_config import GeneralConfig
 from audiobook_generator.core.audio_tags import AudioTags
 from audiobook_generator.tts_providers.base_tts_provider import get_tts_provider
+from audiobook_generator.book_parsers import ast
 
 logger = logging.getLogger(__name__)
 
@@ -17,11 +19,9 @@ def confirm_conversion():
         exit(0)
 
 
-def get_total_chars(chapters):
-    total_characters = 0
-    for title, text in chapters:
-        total_characters += len(text)
-    return total_characters
+def get_total_chars(chapters: List[ast.Chapter]):
+    """Return the sum of ``char_count()`` over all given chapters."""
+    return sum(chapter.char_count() for chapter in chapters)
 
 
 class AudiobookGenerator:
@@ -39,7 +39,7 @@ def run(self):
             os.makedirs(self.config.output_folder, exist_ok=True)
             chapters = book_parser.get_chapters(tts_provider.get_break_string())
             # Filter out empty or very short chapters
-            chapters = [(title, text) for title, text in chapters if text.strip()]
+            chapters = list(filter(lambda x: x.char_count() > 0, chapters))
 
             logger.info(f"Chapters count: {len(chapters)}.")
 
@@ -76,34 +76,34 @@ def run(self):
                 confirm_conversion()
 
             # Loop through each chapter and convert it to speech using the provided TTS provider
-            for idx, (title, text) in enumerate(chapters, start=1):
+            for idx, chapter in enumerate(chapters, start=1):
                 if idx < self.config.chapter_start:
                     continue
                 if idx > self.config.chapter_end:
                     break
                 logger.info(
-                    f"Converting chapter {idx}/{len(chapters)}: {title}, characters: {len(text)}"
+                    f"Converting chapter {idx}/{len(chapters)}: {chapter.title}, characters: {chapter.char_count()}"
                 )
 
                 if self.config.output_text:
-                    text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{title}.txt")
+                    text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{chapter.title}.txt")
                     with open(text_file, "w", encoding='utf-8') as file:
-                        file.write(text)
+                        file.write(chapter.text)
 
                 if self.config.preview:
                     continue
 
                 output_file = os.path.join(self.config.output_folder,
-                                           f"{idx:04d}_{title}.{tts_provider.get_output_file_extension()}")
+                                           f"{idx:04d}_{chapter.title}.{tts_provider.get_output_file_extension()}")
 
-                audio_tags = AudioTags(title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
+                audio_tags = AudioTags(chapter.title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
                 tts_provider.text_to_speech(
-                    text,
+                    chapter,
                     output_file,
                     audio_tags,
                 )
                 logger.info(
-                    f"✅ Converted chapter {idx}/{len(chapters)}: {title}"
+                    f"✅ Converted chapter {idx}/{len(chapters)}: {chapter.title}"
                 )
             logger.info(f"All chapters converted. 🎉🎉🎉")
 

diff --git a/audiobook_generator/tts_providers/azure_tts_provider.py b/audiobook_generator/tts_providers/azure_tts_provider.py
@@ -11,6 +11,7 @@
 from audiobook_generator.config.general_config import GeneralConfig
 from audiobook_generator.core.utils import split_text, set_audio_tags
 from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider
+from audiobook_generator.book_parsers import ast
 
 logger = logging.getLogger(__name__)
 
@@ -84,14 +85,14 @@ def get_access_token(self) -> str:
 
     def text_to_speech(
             self,
-            text: str,
+            chapter: ast.Chapter,
             output_file: str,
             audio_tags: AudioTags,
     ):
         # Adjust this value based on your testing
         max_chars = 1800 if self.config.language.startswith("zh") else 3000
 
-        text_chunks = split_text(text, max_chars, self.config.language)
+        text_chunks = split_text(chapter.text, max_chars, self.config.language)
 
         audio_segments = []
 

diff --git a/audiobook_generator/tts_providers/base_tts_provider.py b/audiobook_generator/tts_providers/base_tts_provider.py
@@ -1,6 +1,7 @@
 from typing import List
 
 from audiobook_generator.config.general_config import GeneralConfig
+from audiobook_generator.book_parsers import ast
 
 TTS_AZURE = "azure"
 TTS_OPENAI = "openai"
@@ -20,7 +21,7 @@ def __str__(self) -> str:
     def validate_config(self):
         raise NotImplementedError
 
-    def text_to_speech(self, *args, **kwargs):
+    def text_to_speech(self, chapter: ast.Chapter, *args, **kwargs):
         raise NotImplementedError
 
     def estimate_cost(self, total_chars):