Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: local tts support #18

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,8 @@ Thumbs.db
# audio files
audiobook_output/

private_examples/
# piper models
*.onnx
*.onnx.json

private_examples/
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ If you're interested in hearing a sample of the audiobook generated by this tool
- Python 3.6+ Or ***Docker***
- For using Azure TTS, A Microsoft Azure account with access to the [Microsoft Cognitive Services Speech Services](https://portal.azure.com/#create/Microsoft.CognitiveServicesSpeechServices) is required.
- For using OpenAI TTS, OpenAI [API Key](https://platform.openai.com/api-keys) is required.
- For using Local TTS, a locally installed TTS application is required, for example [piper](https://github.com/rhasspy/piper).

## Audiobookshelf Integration

Expand Down Expand Up @@ -312,6 +313,28 @@ Here are some examples that demonstrate various option combinations:
python3 epub_to_audiobook.py "path/to/book.epub" "path/to/output/folder" --tts openai --preview --output_text
```

### Examples Using Local TTS

1. **Local TTS with default settings**
This command will convert an EPUB file to an audiobook using a locally installed [piper tts](https://github.com/rhasspy/piper), which is the default local TTS command.

```sh
python3 epub_to_audiobook.py "path/to/book.epub" "path/to/output/folder" --tts local
```
2. **Local TTS with custom command**

```sh
python3 epub_to_audiobook.py "path/to/book.epub" "path/to/output/folder" --tts local --local_tts_cmd ./tts.sh
```

Example `tts.sh` powered by [coqui-ai/TTS](https://github.com/coqui-ai/TTS). The script receives the text on stdin and the output file path as its argument:

```bash
#!/bin/bash

tts --text="$(cat)" --out_path "$@"
```

## Troubleshooting

### ModuleNotFoundError: No module named 'importlib_metadata'
Expand Down
83 changes: 81 additions & 2 deletions epub_to_audiobook.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import io
import argparse
import subprocess
import html
import ebooklib
from ebooklib import epub
Expand Down Expand Up @@ -30,7 +31,7 @@

TTS_AZURE = "azure"
TTS_OPENAI = "openai"

TTS_LOCAL = "local"

@dataclasses.dataclass
class AudioTags:
Expand Down Expand Up @@ -255,6 +256,51 @@ def text_to_speech(self, text: str, output_file: str, audio_tags: AudioTags):

set_audio_tags(output_file, audio_tags)

class LocalTTSProvider(TTSProvider):
    """TTS provider that delegates synthesis to a local command-line program.

    For every text chunk the configured command is run through the shell with
    the chunk's output file path appended as its last argument; the chunk text
    is written to the command's stdin.
    """

    def __init__(self, general_config: GeneralConfig, cmd, format, max_chars, break_ssml):
        """Store the local TTS settings.

        Args:
            general_config: Shared configuration forwarded to the base provider.
            cmd: Shell command line; the output file path is appended to it.
            format: Audio format the command produces.
            max_chars: Size limit passed to ``split_text`` when chunking.
            break_ssml: Text substituted for MAGIC_BREAK_STRING in each chunk.
        """
        super().__init__(general_config)
        self.cmd = cmd
        self.format = format
        self.max_chars = max_chars
        self.break_ssml = break_ssml

    def __str__(self) -> str:
        return super().__str__() + f", cmd={self.cmd}"

    @staticmethod
    def _quote_shell_arg(arg: str) -> str:
        """Quote *arg* so the platform shell treats it as a single argument."""
        if os.name == "nt":
            # cmd.exe does not understand POSIX single quotes.
            return subprocess.list2cmdline([arg])
        import shlex  # local import: keeps the file's import block untouched

        return shlex.quote(arg)

    def text_to_speech(self, text: str, output_file: str, audio_tags: AudioTags):
        """Synthesize *text* chunk by chunk and concatenate the audio.

        Args:
            text: Text to convert.
            output_file: Path of the audio file to create.
            audio_tags: Tags applied to ``output_file`` once it is written.

        Raises:
            subprocess.CalledProcessError: If the TTS command exits with a
                non-zero status.
        """
        text_chunks = split_text(text, self.max_chars, self.general_config.language)

        with open(output_file, "wb") as outfile:
            for i, chunk in enumerate(text_chunks, 1):
                logger.debug(
                    f"Processing chunk {i} of {len(text_chunks)}, length={len(chunk)}, text=[{chunk}]"
                )
                # replace MAGIC_BREAK_STRING with break SSML
                chunk = chunk.replace(
                    MAGIC_BREAK_STRING.strip(),
                    self.break_ssml,
                )  # strip in case leading blank is missing
                logger.info(
                    f"Processing chapter-{audio_tags.idx} <{audio_tags.title}>, chunk {i} of {len(text_chunks)}"
                )

                logger.debug(f"Text: [{chunk}], length={len(chunk)}")

                chunk_file = f"{output_file}.{i}"
                # The command stays a shell string (users may pass pipelines
                # or scripts), but the path we append is quoted so that
                # spaces or shell metacharacters in it cannot break the call.
                command = f"{self.cmd} {self._quote_shell_arg(chunk_file)}"
                process = subprocess.Popen(
                    command,
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,  # without this, stderr is always None
                    shell=True,
                )
                stdout, stderr = process.communicate(input=chunk.encode())
                if stdout:
                    logger.info(stdout.decode(errors="replace"))
                if stderr:
                    logger.error(stderr.decode(errors="replace"))
                if process.returncode != 0:
                    # Fail here rather than on a missing/partial chunk file.
                    raise subprocess.CalledProcessError(
                        process.returncode, command, output=stdout, stderr=stderr
                    )

                # Context manager guarantees the chunk file handle is closed
                # before the file is removed.
                with open(chunk_file, "rb") as segment:
                    outfile.write(segment.read())
                os.remove(chunk_file)

        set_audio_tags(output_file, audio_tags)


def sanitize_title(title: str) -> str:
# replace MAGIC_BREAK_STRING with a blank space
Expand Down Expand Up @@ -429,6 +475,8 @@ def epub_to_audiobook(tts_provider: TTSProvider):
audio_suffix = f"{tts_provider.format}" # mp3, opus, aac, or flac
elif isinstance(tts_provider, AzureTTSProvider):
audio_suffix = "mp3" # only mp3 is supported for Azure TTS for now
elif isinstance(tts_provider, LocalTTSProvider):
audio_suffix = tts_provider.format
else:
raise ValueError(f"Invalid TTS provider: {tts_provider.general_config.tts}")

Expand Down Expand Up @@ -473,7 +521,7 @@ def main():
parser.add_argument("output_folder", help="Path to the output folder")
parser.add_argument(
"--tts",
choices=[TTS_AZURE, TTS_OPENAI],
choices=[TTS_AZURE, TTS_OPENAI, TTS_LOCAL],
default=TTS_AZURE,
help="Choose TTS provider (default: azure). azure: Azure Cognitive Services, openai: OpenAI TTS API. When using azure, environment variables MS_TTS_KEY and MS_TTS_REGION must be set. When using openai, environment variable OPENAI_API_KEY must be set.",
)
Expand Down Expand Up @@ -556,6 +604,29 @@ def main():
default="mp3",
help="Available OpenAI output options: mp3, opus, aac, and flac. Check https://platform.openai.com/docs/guides/text-to-speech/supported-output-formats.",
)
# Local TTS specific arguments
local_group = parser.add_argument_group("Local TTS Options")
local_group.add_argument(
"--local_tts_cmd",
default="piper --model en_US-lessac-medium --output_file",
help="command to use for local TTS, it should accept text via stdin and write the resulting audio to the filename passed as argument.",
)
local_group.add_argument(
"--local_tts_format",
default="wav",
help="The output format the local TTS service outputs (default: wav).",
)
local_group.add_argument(
"--local_tts_max_chars",
type=int,
default=10000,
help="The max chars the local TTS service could handle at once (default: 10000).",
)
local_group.add_argument(
"--local_tts_break_ssml",
default=" ",
help="The break SSML supported by the local TTS service (default: \" \").",
)

args = parser.parse_args()

Expand All @@ -574,6 +645,14 @@ def main():
tts_provider = OpenAITTSProvider(
general_config, args.openai_model, args.openai_voice, args.openai_format
)
elif args.tts == TTS_LOCAL:
tts_provider = LocalTTSProvider(
general_config,
args.local_tts_cmd,
args.local_tts_format,
args.local_tts_max_chars,
args.local_tts_break_ssml,
)
else:
raise ValueError(f"Invalid TTS provider: {args.tts}")

Expand Down