Skip to content

Commit

Permalink
feat: audio support (#189)
Browse files Browse the repository at this point in the history
  • Loading branch information
chr-hertel authored Jan 6, 2025
1 parent af5e00d commit 86b80f9
Show file tree
Hide file tree
Showing 19 changed files with 238 additions and 6 deletions.
37 changes: 34 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ The core feature of LLM Chain is to interact with language models via messages.
a **MessageBag** to a **Chain**, which takes care of LLM invocation and response handling.

Messages can be of different types, most importantly `UserMessage`, `SystemMessage`, or `AssistantMessage`, and can also
have different content types, like `Text` or `Image`.
have different content types, like `Text`, `Image` or `Audio`.

#### Example Chain call with messages

Expand Down Expand Up @@ -453,13 +453,13 @@ use PhpLlm\LlmChain\Model\Message\Content\Image;
use PhpLlm\LlmChain\Model\Message\Message;
use PhpLlm\LlmChain\Model\Message\MessageBag;

// Initialize Platoform, LLM & Chain
// Initialize Platform, LLM & Chain

$messages = new MessageBag(
Message::forSystem('You are an image analyzer bot that helps identify the content of images.'),
Message::ofUser(
'Describe the image as a comedian would do it.',
new Image(dirname(__DIR__).'/tests/Fixture/image.png'), // Path to an image file
new Image(dirname(__DIR__).'/tests/Fixture/image.jpg'), // Path to an image file
new Image('https://foo.com/bar.png'), // URL to an image
new Image('data:image/png;base64,...'), // Data URL of an image
),
Expand All @@ -472,6 +472,30 @@ $response = $chain->call($messages);
1. **Image Description**: [image-describer-binary.php](examples/image-describer-binary.php) (with binary file)
1. **Image Description**: [image-describer-url.php](examples/image-describer-url.php) (with URL)

### Audio Processing

Similar to images, some LLMs also support audio as input, which is just another `Content` type within the `UserMessage`:

```php
use PhpLlm\LlmChain\Model\Message\Content\Audio;
use PhpLlm\LlmChain\Model\Message\Message;
use PhpLlm\LlmChain\Model\Message\MessageBag;

// Initialize Platform, LLM & Chain

$messages = new MessageBag(
Message::ofUser(
'What is this recording about?',
new Audio(dirname(__DIR__).'/tests/Fixture/audio.mp3'), // Path to an audio file
),
);
$response = $chain->call($messages);
```

#### Code Examples

1. **Audio Description**: [audio-describer.php](examples/audio-describer.php)

### Embeddings

Creating embeddings of words, sentences or paragraphs is a typical use case around the interaction with LLMs and
Expand Down Expand Up @@ -617,3 +641,10 @@ Contributions are always welcome, so feel free to join the development of this l
[![LLM Chain Contributors](https://contrib.rocks/image?repo=php-llm/llm-chain 'LLM Chain Contributors')](https://github.com/php-llm/llm-chain/graphs/contributors)

Made with [contrib.rocks](https://contrib.rocks).

### Fixture Licenses

For testing multi-modal features, the repository contains binary media content, with the following owners and licenses:

* `tests/Fixture/image.jpg`: Chris F., Creative Commons, see [pexels.com](https://www.pexels.com/photo/blauer-und-gruner-elefant-mit-licht-1680755/)
* `tests/Fixture/audio.mp3`: davidbain, Creative Commons, see [freesound.org](https://freesound.org/people/davidbain/sounds/136777/)
31 changes: 31 additions & 0 deletions examples/audio-describer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?php

use PhpLlm\LlmChain\Bridge\OpenAI\GPT;
use PhpLlm\LlmChain\Bridge\OpenAI\PlatformFactory;
use PhpLlm\LlmChain\Chain;
use PhpLlm\LlmChain\Model\Message\Content\Audio;
use PhpLlm\LlmChain\Model\Message\Message;
use PhpLlm\LlmChain\Model\Message\MessageBag;
use Symfony\Component\Dotenv\Dotenv;

// Bootstrap: Composer autoloader plus environment variables from the project's .env file.
$projectRoot = dirname(__DIR__);
require_once $projectRoot.'/vendor/autoload.php';
(new Dotenv())->loadEnv($projectRoot.'/.env');

// Nothing can be sent to OpenAI without an API key, so stop right away.
if (empty($_ENV['OPENAI_API_KEY'])) {
    echo 'Please set the OPENAI_API_KEY environment variable.'.PHP_EOL;
    exit(1);
}

// Chain wired to the OpenAI platform, using the GPT version that accepts audio input.
$chain = new Chain(
    PlatformFactory::create($_ENV['OPENAI_API_KEY']),
    new GPT(GPT::GPT_4O_AUDIO),
);

// One user message: the question as text, followed by the recording itself.
$recording = new Audio($projectRoot.'/tests/Fixture/audio.mp3');
$response = $chain->call(new MessageBag(
    Message::ofUser('What is this recording about?', $recording),
));

echo $response->getContent().PHP_EOL;
2 changes: 1 addition & 1 deletion examples/image-describer-binary.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
Message::forSystem('You are an image analyzer bot that helps identify the content of images.'),
Message::ofUser(
'Describe the image as a comedian would do it.',
new Image(dirname(__DIR__).'/tests/Fixture/image.png'),
new Image(dirname(__DIR__).'/tests/Fixture/image.jpg'),
),
);
$response = $chain->call($messages);
Expand Down
5 changes: 5 additions & 0 deletions src/Bridge/Anthropic/Claude.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ public function getOptions(): array
return $this->options;
}

/**
 * Reports whether audio content can be passed to this model.
 *
 * @return bool always false in this implementation
 */
public function supportsAudioInput(): bool
{
return false;
}

public function supportsImageInput(): bool
{
return false; // it does, but implementation here is still open.
Expand Down
5 changes: 5 additions & 0 deletions src/Bridge/Meta/Llama.php
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ public function getOptions(): array
return $this->options;
}

/**
 * Reports whether audio content can be passed to this model.
 *
 * @return bool always false in this implementation
 */
public function supportsAudioInput(): bool
{
return false;
}

public function supportsImageInput(): bool
{
return false; // it does, but implementation here is still open.
Expand Down
11 changes: 11 additions & 0 deletions src/Bridge/OpenAI/GPT.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ final class GPT implements LanguageModel
public const GPT_4_TURBO = 'gpt-4-turbo';
public const GPT_4O = 'gpt-4o';
public const GPT_4O_MINI = 'gpt-4o-mini';
public const GPT_4O_AUDIO = 'gpt-4o-audio-preview';
public const O1_MINI = 'o1-mini';
public const O1_PREVIEW = 'o1-preview';

Expand All @@ -23,9 +24,14 @@ final class GPT implements LanguageModel
public function __construct(
private readonly string $version = self::GPT_4O,
private readonly array $options = ['temperature' => 1.0],
private bool $supportsAudioInput = false,
private bool $supportsImageInput = false,
private bool $supportsStructuredOutput = false,
) {
if (false === $this->supportsAudioInput) {
$this->supportsAudioInput = self::GPT_4O_AUDIO === $this->version;
}

if (false === $this->supportsImageInput) {
$this->supportsImageInput = in_array($this->version, [self::GPT_4_TURBO, self::GPT_4O, self::GPT_4O_MINI, self::O1_MINI, self::O1_PREVIEW], true);
}
Expand All @@ -45,6 +51,11 @@ public function getOptions(): array
return $this->options;
}

/**
 * Reports whether this model accepts audio content as input.
 *
 * The flag is resolved in the constructor: it is either passed in explicitly
 * or derived from the configured model version.
 */
public function supportsAudioInput(): bool
{
    // Must be the audio flag, not the image flag: the two capabilities are
    // resolved independently and differ for most model versions.
    return $this->supportsAudioInput;
}

public function supportsImageInput(): bool
{
return $this->supportsImageInput;
Expand Down
4 changes: 4 additions & 0 deletions src/Chain.php
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ public function call(MessageBagInterface $messages, array $options = []): Respon
$messages = $input->messages;
$options = $input->getOptions();

if ($messages->containsAudio() && !$llm->supportsAudioInput()) {
throw MissingModelSupport::forAudioInput($llm::class);
}

if ($messages->containsImage() && !$llm->supportsImageInput()) {
throw MissingModelSupport::forImageInput($llm::class);
}
Expand Down
5 changes: 5 additions & 0 deletions src/Exception/MissingModelSupport.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ public static function forToolCalling(string $model): self
return new self($model, 'tool calling');
}

/**
 * Creates the exception for a model that lacks audio input support.
 *
 * @param string $model identifier of the model; Chain passes the model's class name
 */
public static function forAudioInput(string $model): self
{
return new self($model, 'audio input');
}

public static function forImageInput(string $model): self
{
return new self($model, 'image input');
Expand Down
2 changes: 2 additions & 0 deletions src/Model/LanguageModel.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

interface LanguageModel extends Model
{
/**
 * Whether the model accepts audio content as input.
 */
public function supportsAudioInput(): bool;

public function supportsImageInput(): bool;

public function supportsStreaming(): bool;
Expand Down
35 changes: 35 additions & 0 deletions src/Model/Message/Content/Audio.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Model\Message\Content;

use PhpLlm\LlmChain\Exception\InvalidArgumentException;

/**
 * Audio content of a message, referenced by the path of a local file.
 *
 * The file is read and base64-encoded each time the object is serialized.
 */
final readonly class Audio implements Content
{
    /**
     * @param string $path path to a readable audio file
     *
     * @throws InvalidArgumentException if the path is not a readable regular file
     */
    public function __construct(
        public string $path,
    ) {
        // Checked via file metadata only, so constructing the object does not
        // load the whole recording into memory; directories are rejected too.
        if (!is_file($path) || !is_readable($path)) {
            throw new InvalidArgumentException(sprintf('The file "%s" does not exist or is not readable.', $path));
        }
    }

    /**
     * Serializes the audio as an "input_audio" content part with base64 data.
     *
     * @return array{type: 'input_audio', input_audio: array{data: string, format: string}}
     *
     * @throws InvalidArgumentException if the file can no longer be read
     */
    public function jsonSerialize(): array
    {
        $data = file_get_contents($this->path);

        // The file may have disappeared or become unreadable since construction;
        // passing false on to base64_encode() would fail with a TypeError.
        if (false === $data) {
            throw new InvalidArgumentException(sprintf('The file "%s" does not exist or is not readable.', $this->path));
        }

        return [
            'type' => 'input_audio',
            'input_audio' => [
                'data' => base64_encode($data),
                // Lower-cased so that e.g. "RECORDING.MP3" is reported as "mp3".
                'format' => strtolower(pathinfo($this->path, PATHINFO_EXTENSION)),
            ],
        ];
    }
}
11 changes: 11 additions & 0 deletions src/Model/Message/MessageBag.php
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,17 @@ public function prepend(MessageInterface $message): self
return $messages;
}

/**
 * Tells whether at least one user message in this bag carries audio content.
 */
public function containsAudio(): bool
{
    foreach ($this->messages as $candidate) {
        // Only user messages are inspected for audio content.
        if (!$candidate instanceof UserMessage) {
            continue;
        }

        if ($candidate->hasAudioContent()) {
            return true;
        }
    }

    return false;
}

public function containsImage(): bool
{
foreach ($this->messages as $message) {
Expand Down
2 changes: 2 additions & 0 deletions src/Model/Message/MessageBagInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,7 @@ public function withoutSystemMessage(): self;

public function prepend(MessageInterface $message): self;

/**
 * Whether the bag contains a message with audio content.
 */
public function containsAudio(): bool;

public function containsImage(): bool;
}
12 changes: 12 additions & 0 deletions src/Model/Message/UserMessage.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

namespace PhpLlm\LlmChain\Model\Message;

use PhpLlm\LlmChain\Model\Message\Content\Audio;
use PhpLlm\LlmChain\Model\Message\Content\Content;
use PhpLlm\LlmChain\Model\Message\Content\Image;
use PhpLlm\LlmChain\Model\Message\Content\Text;
Expand All @@ -26,6 +27,17 @@ public function getRole(): Role
return Role::User;
}

/**
 * Tells whether any content part of this message is an Audio instance.
 */
public function hasAudioContent(): bool
{
    $found = false;

    foreach ($this->content as $part) {
        if ($part instanceof Audio) {
            // The first hit settles it; remaining parts need no inspection.
            $found = true;
            break;
        }
    }

    return $found;
}

public function hasImageContent(): bool
{
foreach ($this->content as $content) {
Expand Down
Binary file added tests/Fixture/audio.mp3
Binary file not shown.
Binary file added tests/Fixture/image.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed tests/Fixture/image.png
Binary file not shown.
60 changes: 60 additions & 0 deletions tests/Model/Message/Content/AudioTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Tests\Model\Message\Content;

use PhpLlm\LlmChain\Model\Message\Content\Audio;
use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\Attributes\Small;
use PHPUnit\Framework\Attributes\Test;
use PHPUnit\Framework\TestCase;

#[CoversClass(Audio::class)]
#[Small]
final class AudioTest extends TestCase
{
    /**
     * A readable fixture file is accepted and its path is exposed unchanged.
     */
    #[Test]
    public function constructWithValidPath(): void
    {
        $fixture = dirname(__DIR__, 3).'/Fixture/audio.mp3';

        self::assertSame($fixture, (new Audio($fixture))->path);
    }

    /**
     * @param array{data: string, format: string} $expected expected "input_audio" payload, data cut to 30 characters
     */
    #[Test]
    #[DataProvider('provideValidPaths')]
    public function jsonSerializeWithValid(string $path, array $expected): void
    {
        $serialized = (new Audio($path))->jsonSerialize();

        // Only the first 30 base64 characters are compared to keep the fixtures short.
        $serialized['input_audio']['data'] = substr($serialized['input_audio']['data'], 0, 30);

        self::assertSame(
            [
                'type' => 'input_audio',
                'input_audio' => $expected,
            ],
            $serialized,
        );
    }

    /**
     * Fixture paths paired with the expected (shortened) serialization payload.
     */
    public static function provideValidPaths(): \Generator
    {
        $fixtureDir = dirname(__DIR__, 3).'/Fixture';

        yield 'mp3' => [
            $fixtureDir.'/audio.mp3',
            [
                'data' => 'SUQzBAAAAAAAfVREUkMAAAAMAAADMj', // shortened
                'format' => 'mp3',
            ],
        ];
    }

    /**
     * A path that does not point to a readable file is rejected on construction.
     */
    #[Test]
    public function constructWithInvalidPath(): void
    {
        $this->expectExceptionMessage('The file "foo.mp3" does not exist or is not readable.');

        new Audio('foo.mp3');
    }
}
4 changes: 2 additions & 2 deletions tests/Model/Message/Content/ImageTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ public function constructWithValidDataUrl(): void
#[Test]
public function withValidFile(): void
{
$image = new Image(dirname(__DIR__, 3).'/Fixture/image.png');
$image = new Image(dirname(__DIR__, 3).'/Fixture/image.jpg');

self::assertStringStartsWith('data:image/png;base64,', $image->url);
self::assertStringStartsWith('data:image/jpg;base64,', $image->url);
}

#[Test]
Expand Down
18 changes: 18 additions & 0 deletions tests/Model/Message/UserMessageTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

namespace PhpLlm\LlmChain\Tests\Model\Message;

use PhpLlm\LlmChain\Model\Message\Content\Audio;
use PhpLlm\LlmChain\Model\Message\Content\Image;
use PhpLlm\LlmChain\Model\Message\Content\Text;
use PhpLlm\LlmChain\Model\Message\Role;
Expand All @@ -17,6 +18,7 @@

#[CoversClass(UserMessage::class)]
#[UsesClass(Text::class)]
#[UsesClass(Audio::class)]
#[UsesClass(Image::class)]
#[UsesClass(Role::class)]
#[Small]
Expand All @@ -39,6 +41,22 @@ public function constructionIsPossibleWithMultipleContent(): void
self::assertCount(2, $message->content);
}

/**
 * A message made of text parts only reports no audio content.
 */
#[Test]
public function hasAudioContentWithoutAudio(): void
{
    $textOnly = new UserMessage(
        new Text('foo'),
        new Text('bar'),
    );

    self::assertFalse($textOnly->hasAudioContent());
}

/**
 * A message with an Audio part next to a Text part reports audio content.
 */
#[Test]
public function hasAudioContentWithAudio(): void
{
    $recording = new Audio(dirname(__DIR__, 2).'/Fixture/audio.mp3');
    $withAudio = new UserMessage(new Text('foo'), $recording);

    self::assertTrue($withAudio->hasAudioContent());
}

#[Test]
public function hasImageContentWithoutImage(): void
{
Expand Down

0 comments on commit 86b80f9

Please sign in to comment.