diff --git a/README.md b/README.md index 44d725c9..a2aac541 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ The core feature of LLM Chain is to interact with language models via messages. a **MessageBag** to a **Chain**, which takes care of LLM invocation and response handling. Messages can be of different types, most importantly `UserMessage`, `SystemMessage`, or `AssistantMessage`, and can also -have different content types, like `Text` or `Image`. +have different content types, like `Text`, `Image` or `Audio`. #### Example Chain call with messages @@ -453,13 +453,13 @@ use PhpLlm\LlmChain\Model\Message\Content\Image; use PhpLlm\LlmChain\Model\Message\Message; use PhpLlm\LlmChain\Model\Message\MessageBag; -// Initialize Platoform, LLM & Chain +// Initialize Platform, LLM & Chain $messages = new MessageBag( Message::forSystem('You are an image analyzer bot that helps identify the content of images.'), Message::ofUser( 'Describe the image as a comedian would do it.', - new Image(dirname(__DIR__).'/tests/Fixture/image.png'), // Path to an image file + new Image(dirname(__DIR__).'/tests/Fixture/image.jpg'), // Path to an image file new Image('https://foo.com/bar.png'), // URL to an image new Image('data:image/png;base64,...'), // Data URL of an image ), @@ -472,6 +472,30 @@ $response = $chain->call($messages); 1. **Image Description**: [image-describer-binary.php](examples/image-describer-binary.php) (with binary file) 1. 
**Image Description**: [image-describer-url.php](examples/image-describer-url.php) (with URL) +### Audio Processing + +Similar to images, some LLMs also support audio as input, which is just another `Content` type within the `UserMessage`: + +```php +use PhpLlm\LlmChain\Model\Message\Content\Audio; +use PhpLlm\LlmChain\Model\Message\Message; +use PhpLlm\LlmChain\Model\Message\MessageBag; + +// Initialize Platform, LLM & Chain + +$messages = new MessageBag( + Message::ofUser( + 'What is this recording about?', + new Audio(dirname(__DIR__).'/tests/Fixture/audio.mp3'), // Path to an audio file + ), +); +$response = $chain->call($messages); +``` + +#### Code Examples + +1. **Audio Description**: [audio-describer.php](examples/audio-describer.php) + ### Embeddings Creating embeddings of word, sentences or paragraphs is a typical use case around the interaction with LLMs and @@ -617,3 +641,10 @@ Contributions are always welcome, so feel free to join the development of this l [![LLM Chain Contributors](https://contrib.rocks/image?repo=php-llm/llm-chain 'LLM Chain Contributors')](https://github.com/php-llm/llm-chain/graphs/contributors) Made with [contrib.rocks](https://contrib.rocks). 
+ +### Fixture Licenses + +For testing multi-modal features, the repository contains binary media content, with the following owners and licenses: + +* `tests/Fixture/image.jpg`: Chris F., Creative Commons, see [pexels.com](https://www.pexels.com/photo/blauer-und-gruner-elefant-mit-licht-1680755/) +* `tests/Fixture/audio.mp3`: davidbain, Creative Commons, see [freesound.org](https://freesound.org/people/davidbain/sounds/136777/) diff --git a/examples/audio-describer.php b/examples/audio-describer.php new file mode 100755 index 00000000..4f571d51 --- /dev/null +++ b/examples/audio-describer.php @@ -0,0 +1,31 @@ +loadEnv(dirname(__DIR__).'/.env'); + +if (empty($_ENV['OPENAI_API_KEY'])) { + echo 'Please set the OPENAI_API_KEY environment variable.'.PHP_EOL; + exit(1); +} + +$platform = PlatformFactory::create($_ENV['OPENAI_API_KEY']); +$llm = new GPT(GPT::GPT_4O_AUDIO); + +$chain = new Chain($platform, $llm); +$messages = new MessageBag( + Message::ofUser( + 'What is this recording about?', + new Audio(dirname(__DIR__).'/tests/Fixture/audio.mp3'), + ), +); +$response = $chain->call($messages); + +echo $response->getContent().PHP_EOL; diff --git a/examples/image-describer-binary.php b/examples/image-describer-binary.php index 980c35bd..23f3e647 100755 --- a/examples/image-describer-binary.php +++ b/examples/image-describer-binary.php @@ -24,7 +24,7 @@ Message::forSystem('You are an image analyzer bot that helps identify the content of images.'), Message::ofUser( 'Describe the image as a comedian would do it.', - new Image(dirname(__DIR__).'/tests/Fixture/image.png'), + new Image(dirname(__DIR__).'/tests/Fixture/image.jpg'), ), ); $response = $chain->call($messages); diff --git a/src/Bridge/Anthropic/Claude.php b/src/Bridge/Anthropic/Claude.php index 0c597920..47d4c509 100644 --- a/src/Bridge/Anthropic/Claude.php +++ b/src/Bridge/Anthropic/Claude.php @@ -32,6 +32,11 @@ public function getOptions(): array return $this->options; } + public function supportsAudioInput(): 
bool + { + return false; + } + public function supportsImageInput(): bool { return false; // it does, but implementation here is still open. diff --git a/src/Bridge/Meta/Llama.php b/src/Bridge/Meta/Llama.php index c7b99aa5..6f60db5c 100644 --- a/src/Bridge/Meta/Llama.php +++ b/src/Bridge/Meta/Llama.php @@ -42,6 +42,11 @@ public function getOptions(): array return $this->options; } + public function supportsAudioInput(): bool + { + return false; + } + public function supportsImageInput(): bool { return false; // it does, but implementation here is still open. diff --git a/src/Bridge/OpenAI/GPT.php b/src/Bridge/OpenAI/GPT.php index d4d64232..195c5373 100644 --- a/src/Bridge/OpenAI/GPT.php +++ b/src/Bridge/OpenAI/GPT.php @@ -14,6 +14,7 @@ final class GPT implements LanguageModel public const GPT_4_TURBO = 'gpt-4-turbo'; public const GPT_4O = 'gpt-4o'; public const GPT_4O_MINI = 'gpt-4o-mini'; + public const GPT_4O_AUDIO = 'gpt-4o-audio-preview'; public const O1_MINI = 'o1-mini'; public const O1_PREVIEW = 'o1-preview'; @@ -23,9 +24,14 @@ final class GPT implements LanguageModel public function __construct( private readonly string $version = self::GPT_4O, private readonly array $options = ['temperature' => 1.0], + private bool $supportsAudioInput = false, private bool $supportsImageInput = false, private bool $supportsStructuredOutput = false, ) { + if (false === $this->supportsAudioInput) { + $this->supportsAudioInput = self::GPT_4O_AUDIO === $this->version; + } + if (false === $this->supportsImageInput) { $this->supportsImageInput = in_array($this->version, [self::GPT_4_TURBO, self::GPT_4O, self::GPT_4O_MINI, self::O1_MINI, self::O1_PREVIEW], true); } @@ -45,6 +51,11 @@ public function getOptions(): array return $this->options; } + public function supportsAudioInput(): bool + { + return $this->supportsAudioInput; + } + public function supportsImageInput(): bool { return $this->supportsImageInput; diff --git a/src/Chain.php b/src/Chain.php index d1166650..9f6e7121 
100644 --- a/src/Chain.php +++ b/src/Chain.php @@ -60,6 +60,10 @@ public function call(MessageBagInterface $messages, array $options = []): Respon $messages = $input->messages; $options = $input->getOptions(); + if ($messages->containsAudio() && !$llm->supportsAudioInput()) { + throw MissingModelSupport::forAudioInput($llm::class); + } + if ($messages->containsImage() && !$llm->supportsImageInput()) { throw MissingModelSupport::forImageInput($llm::class); } diff --git a/src/Exception/MissingModelSupport.php b/src/Exception/MissingModelSupport.php index 97f898d1..ecb2c645 100644 --- a/src/Exception/MissingModelSupport.php +++ b/src/Exception/MissingModelSupport.php @@ -16,6 +16,11 @@ public static function forToolCalling(string $model): self return new self($model, 'tool calling'); } + public static function forAudioInput(string $model): self + { + return new self($model, 'audio input'); + } + public static function forImageInput(string $model): self { return new self($model, 'image input'); diff --git a/src/Model/LanguageModel.php b/src/Model/LanguageModel.php index d9794368..e18df6f7 100644 --- a/src/Model/LanguageModel.php +++ b/src/Model/LanguageModel.php @@ -6,6 +6,8 @@ interface LanguageModel extends Model { + public function supportsAudioInput(): bool; + public function supportsImageInput(): bool; public function supportsStreaming(): bool; diff --git a/src/Model/Message/Content/Audio.php b/src/Model/Message/Content/Audio.php new file mode 100644 index 00000000..fe04f9ae --- /dev/null +++ b/src/Model/Message/Content/Audio.php @@ -0,0 +1,35 @@ +path); + $format = pathinfo($this->path, PATHINFO_EXTENSION); + + return [ + 'type' => 'input_audio', + 'input_audio' => [ + 'data' => base64_encode($data), + 'format' => $format, + ], + ]; + } +} diff --git a/src/Model/Message/MessageBag.php b/src/Model/Message/MessageBag.php index 23921195..f0fad99d 100644 --- a/src/Model/Message/MessageBag.php +++ b/src/Model/Message/MessageBag.php @@ -78,6 +78,17 @@ public function 
prepend(MessageInterface $message): self return $messages; } + public function containsAudio(): bool + { + foreach ($this->messages as $message) { + if ($message instanceof UserMessage && $message->hasAudioContent()) { + return true; + } + } + + return false; + } + public function containsImage(): bool { foreach ($this->messages as $message) { diff --git a/src/Model/Message/MessageBagInterface.php b/src/Model/Message/MessageBagInterface.php index 8f77ba4f..c5c832d8 100644 --- a/src/Model/Message/MessageBagInterface.php +++ b/src/Model/Message/MessageBagInterface.php @@ -23,5 +23,7 @@ public function withoutSystemMessage(): self; public function prepend(MessageInterface $message): self; + public function containsAudio(): bool; + public function containsImage(): bool; } diff --git a/src/Model/Message/UserMessage.php b/src/Model/Message/UserMessage.php index eea38bcf..ef925c5c 100644 --- a/src/Model/Message/UserMessage.php +++ b/src/Model/Message/UserMessage.php @@ -4,6 +4,7 @@ namespace PhpLlm\LlmChain\Model\Message; +use PhpLlm\LlmChain\Model\Message\Content\Audio; use PhpLlm\LlmChain\Model\Message\Content\Content; use PhpLlm\LlmChain\Model\Message\Content\Image; use PhpLlm\LlmChain\Model\Message\Content\Text; @@ -26,6 +27,17 @@ public function getRole(): Role return Role::User; } + public function hasAudioContent(): bool + { + foreach ($this->content as $content) { + if ($content instanceof Audio) { + return true; + } + } + + return false; + } + public function hasImageContent(): bool { foreach ($this->content as $content) { diff --git a/tests/Fixture/audio.mp3 b/tests/Fixture/audio.mp3 new file mode 100644 index 00000000..509aa0fc Binary files /dev/null and b/tests/Fixture/audio.mp3 differ diff --git a/tests/Fixture/image.jpg b/tests/Fixture/image.jpg new file mode 100644 index 00000000..bae677ad Binary files /dev/null and b/tests/Fixture/image.jpg differ diff --git a/tests/Fixture/image.png b/tests/Fixture/image.png deleted file mode 100644 index 
bb2161ce..00000000 Binary files a/tests/Fixture/image.png and /dev/null differ diff --git a/tests/Model/Message/Content/AudioTest.php b/tests/Model/Message/Content/AudioTest.php new file mode 100644 index 00000000..1824167c --- /dev/null +++ b/tests/Model/Message/Content/AudioTest.php @@ -0,0 +1,60 @@ +path); + } + + #[Test] + #[DataProvider('provideValidPaths')] + public function jsonSerializeWithValid(string $path, array $expected): void + { + $audio = new Audio($path); + + $expected = [ + 'type' => 'input_audio', + 'input_audio' => $expected, + ]; + + $actual = $audio->jsonSerialize(); + + // shortening the base64 data + $actual['input_audio']['data'] = substr($actual['input_audio']['data'], 0, 30); + + self::assertSame($expected, $actual); + } + + public static function provideValidPaths(): \Generator + { + yield 'mp3' => [dirname(__DIR__, 3).'/Fixture/audio.mp3', [ + 'data' => 'SUQzBAAAAAAAfVREUkMAAAAMAAADMj', // shortened + 'format' => 'mp3', + ]]; + } + + #[Test] + public function constructWithInvalidPath(): void + { + $this->expectExceptionMessage('The file "foo.mp3" does not exist or is not readable.'); + + new Audio('foo.mp3'); + } +} diff --git a/tests/Model/Message/Content/ImageTest.php b/tests/Model/Message/Content/ImageTest.php index 1e9b537e..0fad58a9 100644 --- a/tests/Model/Message/Content/ImageTest.php +++ b/tests/Model/Message/Content/ImageTest.php @@ -33,9 +33,9 @@ public function constructWithValidDataUrl(): void #[Test] public function withValidFile(): void { - $image = new Image(dirname(__DIR__, 3).'/Fixture/image.png'); + $image = new Image(dirname(__DIR__, 3).'/Fixture/image.jpg'); - self::assertStringStartsWith('data:image/png;base64,', $image->url); + self::assertStringStartsWith('data:image/jpg;base64,', $image->url); } #[Test] diff --git a/tests/Model/Message/UserMessageTest.php b/tests/Model/Message/UserMessageTest.php index 571b6bc3..3b1e4e59 100644 --- a/tests/Model/Message/UserMessageTest.php +++ 
b/tests/Model/Message/UserMessageTest.php @@ -4,6 +4,7 @@ namespace PhpLlm\LlmChain\Tests\Model\Message; +use PhpLlm\LlmChain\Model\Message\Content\Audio; use PhpLlm\LlmChain\Model\Message\Content\Image; use PhpLlm\LlmChain\Model\Message\Content\Text; use PhpLlm\LlmChain\Model\Message\Role; @@ -17,6 +18,7 @@ #[CoversClass(UserMessage::class)] #[UsesClass(Text::class)] +#[UsesClass(Audio::class)] #[UsesClass(Image::class)] #[UsesClass(Role::class)] #[Small] @@ -39,6 +41,22 @@ public function constructionIsPossibleWithMultipleContent(): void self::assertCount(2, $message->content); } + #[Test] + public function hasAudioContentWithoutAudio(): void + { + $message = new UserMessage(new Text('foo'), new Text('bar')); + + self::assertFalse($message->hasAudioContent()); + } + + #[Test] + public function hasAudioContentWithAudio(): void + { + $message = new UserMessage(new Text('foo'), new Audio(dirname(__DIR__, 2).'/Fixture/audio.mp3')); + + self::assertTrue($message->hasAudioContent()); + } + #[Test] public function hasImageContentWithoutImage(): void {