Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Phi-3.5-vision-instruct VLM model #975

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 14 additions & 77 deletions camel/configs/vllm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,90 +13,27 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
from __future__ import annotations

from typing import Sequence, Union
from typing import List, Optional

from openai._types import NOT_GIVEN, NotGiven
from pydantic import Field

from camel.configs.base_config import BaseConfig


# flake8: noqa: E501
class VLLMConfig(BaseConfig):
r"""Defines the parameters for generating chat completions using
vLLM's OpenAI-compatible API.

Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html

Args:
temperature (float, optional): Sampling temperature to use, between
:obj:`0` and :obj:`2`. Higher values make the output more random,
while lower values make it more focused and deterministic.
(default: :obj:`0.2`)
top_p (float, optional): An alternative to sampling with temperature,
called nucleus sampling, where the model considers the results of
the tokens with top_p probability mass. So :obj:`0.1` means only
the tokens comprising the top 10% probability mass are considered.
(default: :obj:`1.0`)
n (int, optional): How many chat completion choices to generate for
each input message. (default: :obj:`1`)
response_format (object, optional): An object specifying the format
that the model must output. Compatible with GPT-4 Turbo and all
GPT-3.5 Turbo models newer than gpt-3.5-turbo-1106. Setting to
{"type": "json_object"} enables JSON mode, which guarantees the
message the model generates is valid JSON. Important: when using
JSON mode, you must also instruct the model to produce JSON
yourself via a system or user message. Without this, the model
may generate an unending stream of whitespace until the generation
reaches the token limit, resulting in a long-running and seemingly
"stuck" request. Also note that the message content may be
partially cut off if finish_reason="length", which indicates the
generation exceeded max_tokens or the conversation exceeded the
max context length.
stream (bool, optional): If True, partial message deltas will be sent
as data-only server-sent events as they become available.
(default: :obj:`False`)
stop (str or list, optional): Up to :obj:`4` sequences where the API
will stop generating further tokens. (default: :obj:`None`)
max_tokens (int, optional): The maximum number of tokens to generate
in the chat completion. The total length of input tokens and
generated tokens is limited by the model's context length.
(default: :obj:`None`)
presence_penalty (float, optional): Number between :obj:`-2.0` and
:obj:`2.0`. Positive values penalize new tokens based on whether
they appear in the text so far, increasing the model's likelihood
to talk about new topics. See more information about frequency and
presence penalties. (default: :obj:`0.0`)
frequency_penalty (float, optional): Number between :obj:`-2.0` and
:obj:`2.0`. Positive values penalize new tokens based on their
existing frequency in the text so far, decreasing the model's
likelihood to repeat the same line verbatim. See more information
about frequency and presence penalties. (default: :obj:`0.0`)
logit_bias (dict, optional): Modify the likelihood of specified tokens
appearing in the completion. Accepts a json object that maps tokens
(specified by their token ID in the tokenizer) to an associated
bias value from :obj:`-100` to :obj:`100`. Mathematically, the bias
is added to the logits generated by the model prior to sampling.
The exact effect will vary per model, but values between :obj:`-1`
and :obj:`1` should decrease or increase likelihood of selection;
values like :obj:`-100` or :obj:`100` should result in a ban or
exclusive selection of the relevant token. (default: :obj:`{}`)
user (str, optional): A unique identifier representing your end-user,
which can help OpenAI to monitor and detect abuse.
(default: :obj:`""`)
"""

temperature: float = 0.2 # openai default: 1.0
top_p: float = 1.0
n: int = 1
stream: bool = False
stop: Union[str, Sequence[str], NotGiven] = NOT_GIVEN
max_tokens: Union[int, NotGiven] = NOT_GIVEN
presence_penalty: float = 0.0
response_format: Union[dict, NotGiven] = NOT_GIVEN
frequency_penalty: float = 0.0
logit_bias: dict = Field(default_factory=dict)
user: str = ""
model: str = "microsoft/Phi-3.5-vision-instruct"
trust_remote_code: bool = True
max_model_len: int = 4096
limit_mm_per_prompt: dict = Field(default_factory=lambda: {"image": 2})
temperature: float = 0.0
max_tokens: int = 128
stop_token_ids: Optional[List[int]] = None
method: str = "generate"
image_urls: List[str] = Field(default_factory=list)
question: str = ""

class Config:
arbitrary_types_allowed = True


# Set of the field names declared on VLLMConfig.
VLLM_API_PARAMS = set(VLLMConfig.model_fields)
4 changes: 2 additions & 2 deletions camel/models/model_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ def create(
model_class = MistralModel
elif model_platform.is_reka and model_type.is_reka:
model_class = RekaModel
elif model_platform.is_vllm and model_type.is_vllm:
model_class = VLLMModel
elif model_type == ModelType.STUB:
model_class = StubModel
else:
Expand All @@ -109,8 +111,6 @@ def create(
return model_class(
model_type, model_config_dict, url, token_counter
)
elif model_platform.is_vllm:
model_class = VLLMModel
elif model_platform.is_litellm:
model_class = LiteLLMModel
elif model_platform.is_openai_compatibility_model:
Expand Down
194 changes: 87 additions & 107 deletions camel/models/vllm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,79 +11,41 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
import os
import subprocess
from typing import Any, Dict, List, Optional, Union

from openai import OpenAI, Stream
from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image

from camel.configs import VLLM_API_PARAMS
from camel.configs import VLLM_API_PARAMS, VLLMConfig
from camel.messages import OpenAIMessage
from camel.models import BaseModelBackend
from camel.types import ChatCompletion, ChatCompletionChunk, ModelType
from camel.utils import BaseTokenCounter, OpenAITokenCounter


# flake8: noqa: E501
class VLLMModel:
r"""vLLM service interface."""

class VLLMModel(BaseModelBackend):
def __init__(
self,
model_type: str,
model_type: ModelType,
model_config_dict: Dict[str, Any],
url: Optional[str] = None,
api_key: Optional[str] = None,
url: Optional[str] = None,
token_counter: Optional[BaseTokenCounter] = None,
) -> None:
r"""Constructor for vLLM backend with OpenAI compatibility.

# Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html

Args:
model_type (str): Model for which a backend is created.
model_config_dict (Dict[str, Any]): A dictionary that will
be fed into openai.ChatCompletion.create().
url (Optional[str]): The url to the model service. (default:
:obj:`"http://localhost:8000/v1"`)
api_key (Optional[str]): The API key for authenticating with the
model service.
token_counter (Optional[BaseTokenCounter]): Token counter to use
for the model. If not provided, `OpenAITokenCounter(ModelType.
GPT_4O_MINI)` will be used.
"""
self.model_type = model_type
self.model_config_dict = model_config_dict
self._url = (
url
or os.environ.get("VLLM_BASE_URL")
or "http://localhost:8000/v1"
)
if not url and not os.environ.get("VLLM_BASE_URL"):
self._start_server()
# Use the OpenAI client as the interface for calling vLLM
self._client = OpenAI(
timeout=60,
max_retries=3,
base_url=self._url,
super().__init__(
model_type=model_type,
model_config_dict=model_config_dict,
api_key=api_key,
url=url,
)
self._token_counter = token_counter
self.check_model_config()

def _start_server(self) -> None:
    r"""Starts the vllm server in a subprocess.

    Spawns ``vllm server --port 8000`` in the background and prints the
    local URL together with ``self.model_type``. The launch is
    best-effort: any exception raised while spawning the process is
    printed instead of being propagated to the caller.
    """
    try:
        # The child's output is never read here, so discard it instead of
        # piping it: an undrained pipe blocks the child process once the
        # OS pipe buffer is full.
        subprocess.Popen(
            ["vllm", "server", "--port", "8000"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        print(
            f"vllm server started on http://localhost:8000/v1 "
            f"for {self.model_type} model."
        )
    except Exception as e:
        print(f"Failed to start vllm server: {e}.")
self.config = VLLMConfig(**model_config_dict)
self.llm = LLM(
model=self.config.model,
trust_remote_code=self.config.trust_remote_code,
max_model_len=self.config.max_model_len,
limit_mm_per_prompt=self.config.limit_mm_per_prompt,
)

@property
def token_counter(self) -> BaseTokenCounter:
Expand All @@ -97,66 +59,84 @@ def token_counter(self) -> BaseTokenCounter:
self._token_counter = OpenAITokenCounter(ModelType.GPT_4O_MINI)
return self._token_counter

def check_model_config(self):
    r"""Validate the keys of ``model_config_dict`` for the vLLM backend.

    Raises:
        ValueError: If the model configuration dictionary contains a key
            that is not listed in ``VLLM_API_PARAMS``.
    """
    for key in self.model_config_dict:
        # Accepted parameter: nothing to report, move on to the next key.
        if key in VLLM_API_PARAMS:
            continue
        raise ValueError(
            f"Unexpected argument `{key}` is "
            "input into vLLM model backend."
        )

def run(
self,
messages: List[OpenAIMessage],
) -> Union[ChatCompletion, Stream[ChatCompletionChunk]]:
r"""Runs inference of OpenAI chat completion.

Args:
messages (List[OpenAIMessage]): Message list with the chat history
in OpenAI API format.

Returns:
Union[ChatCompletion, Stream[ChatCompletionChunk]]:
`ChatCompletion` in the non-stream mode, or
`Stream[ChatCompletionChunk]` in the stream mode.
"""
) -> Union[ChatCompletion, ChatCompletionChunk]:
question = messages[-1]['content']
image_urls = self.config.image_urls
image_data = [fetch_image(url) for url in image_urls]

sampling_params = SamplingParams(
temperature=self.config.temperature,
max_tokens=self.config.max_tokens,
stop_token_ids=self.config.stop_token_ids,
)

response = self._client.chat.completions.create(
messages=messages,
model=self.model_type,
**self.model_config_dict,
if self.config.method == "generate":
placeholders = "\n".join(
f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)
)
prompt = (
f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
)
outputs = self.llm.generate(
{"prompt": prompt, "multi_modal_data": {"image": image_data}},
sampling_params=sampling_params,
)
elif self.config.method == "chat":
outputs = self.llm.chat(
[
{
"role": "user",
"content": [{"type": "text", "text": question}]
+ [
{"type": "image_url", "image_url": {"url": url}}
for url in image_urls
],
}
],
sampling_params=sampling_params,
)
else:
raise ValueError(f"Invalid method: {self.config.method}")

# Convert vLLM output to OpenAI-like format
response = ChatCompletion(
id="vllm_response",
object="chat.completion",
created=0,
model=self.config.model,
choices=[
{
"index": 0,
"message": {
"role": "assistant",
"content": outputs[0].outputs[0].text,
},
"finish_reason": "stop",
}
],
usage={
"prompt_tokens": 0,
"completion_tokens": 0,
"total_tokens": 0,
},
)
return response

def check_model_config(self):
    r"""Check that every key of ``model_config_dict`` is an accepted
    vLLM parameter.

    Raises:
        ValueError: If the model configuration dictionary contains a key
            that is not listed in ``VLLM_API_PARAMS``.
    """
    for key in self.model_config_dict:
        if key in VLLM_API_PARAMS:
            continue
        # The first unknown key aborts validation.
        message = (
            f"Unexpected argument `{key}` is "
            "input into VLLM model backend."
        )
        raise ValueError(message)

@property
def token_limit(self) -> int:
r"""Returns the maximum token limit for the given model.

Returns:
int: The maximum token limit for the given model.
"""
max_tokens = self.model_config_dict.get("max_tokens")
if isinstance(max_tokens, int):
return max_tokens
print(
"Must set `max_tokens` as an integer in `model_config_dict` when"
" setting up the model. Using 4096 as default value."
)
return 4096
return self.config.max_model_len

@property
def stream(self) -> bool:
r"""Returns whether the model is in stream mode, which sends partial
results each time.

Returns:
bool: Whether the model is in stream mode.
"""
return self.model_config_dict.get('stream', False)
return False # VLLM doesn't support streaming in this implementation
15 changes: 15 additions & 0 deletions camel/types/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ class ModelType(Enum):
REKA_FLASH = "reka-flash"
REKA_EDGE = "reka-edge"

# VLLM models
PHI_3_5_VISION = "microsoft/Phi-3.5-vision-instruct"

@property
def value_for_tiktoken(self) -> str:
if self.is_openai:
Expand Down Expand Up @@ -221,6 +224,17 @@ def is_reka(self) -> bool:
ModelType.REKA_FLASH,
}

@property
def is_vllm(self) -> bool:
    r"""Returns whether this model type is a vLLM model.

    Returns:
        bool: :obj:`True` if this model type is one of the vLLM model
            types, :obj:`False` otherwise.
    """
    vllm_model_types = {ModelType.PHI_3_5_VISION}
    return self in vllm_model_types

@property
def token_limit(self) -> int:
r"""Returns the maximum token limit for a given model.
Expand All @@ -240,6 +254,7 @@ def token_limit(self) -> int:
ModelType.REKA_CORE,
ModelType.REKA_EDGE,
ModelType.REKA_FLASH,
ModelType.PHI_3_5_VISION,
}:
return 4_096
elif self in {
Expand Down
Loading
Loading