[BFCL] Add Llama-3.2-1B-Instruct, Llama-3.2-3B-Instruct, Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct #657

Merged 19 commits on Oct 5, 2024
9 changes: 9 additions & 0 deletions berkeley-function-call-leaderboard/CHANGELOG.md
@@ -12,6 +12,15 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documented
- `microsoft/Phi-3-mini-128k-instruct`
- `microsoft/Phi-3-mini-4k-instruct`
- [Sept 25, 2024] [#660](https://github.com/ShishirPatil/gorilla/pull/660): Bug fix in `parse_nested_value` function to handle nested dictionary values properly.
- [Sept 24, 2024] [#657](https://github.com/ShishirPatil/gorilla/pull/657): Add the following new models to the leaderboard:
- `meta-llama/Llama-3.2-1B-Instruct`
- `meta-llama/Llama-3.2-1B-Instruct-FC`
- `meta-llama/Llama-3.2-3B-Instruct`
- `meta-llama/Llama-3.2-3B-Instruct-FC`
- `meta-llama/Llama-3.1-8B-Instruct`
- `meta-llama/Llama-3.1-8B-Instruct-FC`
- `meta-llama/Llama-3.1-70B-Instruct`
- `meta-llama/Llama-3.1-70B-Instruct-FC`
- [Sept 19, 2024] [#644](https://github.com/ShishirPatil/gorilla/pull/644): BFCL V3 release:
- Introduce new multi-turn dataset and state-based evaluation metric
- Separate ast_checker and executable_checker for readability
6 changes: 5 additions & 1 deletion berkeley-function-call-leaderboard/README.md
@@ -149,7 +149,11 @@ Below is _a table of models we support_ to run our leaderboard evaluation against
|google/gemma-7b-it 💻| Prompt|
|meetkai/functionary-medium-v3.1-FC| Function Calling|
|meetkai/functionary-small-{v3.1,v3.2}-FC| Function Calling|
|meta-llama/Meta-Llama-3-{8B,70B}-Instruct | Prompt|
|meta-llama/Meta-Llama-3-{8B,70B}-Instruct 💻| Prompt|
|meta-llama/Llama-3.1-{8B,70B}-Instruct-FC 💻| Function Calling|
|meta-llama/Llama-3.1-{8B,70B}-Instruct 💻| Prompt|
|meta-llama/Llama-3.2-{1B,3B}-Instruct-FC 💻| Function Calling|
|meta-llama/Llama-3.2-{1B,3B}-Instruct 💻| Prompt|
|open-mixtral-{8x7b,8x22b} | Prompt|
|open-mixtral-8x22b-FC | Function Calling|
|open-mistral-nemo-2407 | Prompt|
@@ -365,6 +365,54 @@
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.1-8B-Instruct": [
"Llama-3.1-8B-Instruct (Prompt)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.1-70B-Instruct": [
"Llama-3.1-70B-Instruct (Prompt)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.2-1B-Instruct": [
"Llama-3.2-1B-Instruct (Prompt)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.2-3B-Instruct": [
"Llama-3.2-3B-Instruct (Prompt)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.1-8B-Instruct-FC": [
"Llama-3.1-8B-Instruct (FC)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.1-70B-Instruct-FC": [
"Llama-3.1-70B-Instruct (FC)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.2-1B-Instruct-FC": [
"Llama-3.2-1B-Instruct (FC)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.2-3B-Instruct-FC": [
"Llama-3.2-3B-Instruct (FC)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"command-r-plus-FC": [
"Command-R-Plus (FC) (Original)",
"https://txt.cohere.com/command-r-plus-microsoft-azure",
@@ -6,6 +6,7 @@
from bfcl.model_handler.oss_model.hammer import HammerHandler
from bfcl.model_handler.oss_model.hermes import HermesHandler
from bfcl.model_handler.oss_model.llama import LlamaHandler
from bfcl.model_handler.oss_model.llama_fc import LlamaFCHandler
from bfcl.model_handler.oss_model.phi import PhiHandler
from bfcl.model_handler.oss_model.salesforce import SalesforceHandler
from bfcl.model_handler.proprietary_model.claude import ClaudeHandler
@@ -21,7 +22,7 @@
from bfcl.model_handler.proprietary_model.openai import OpenAIHandler
from bfcl.model_handler.proprietary_model.yi import YiHandler

# TODO: Add Deepseek V2 and Gemma V2
# TODO: Add Deepseek V2 and Gemma V2, meta-llama/Llama-3.1-405B-Instruct
handler_map = {
# Inference through API calls
"gorilla-openfunctions-v2": GorillaHandler,
@@ -72,6 +73,14 @@
# Inference through local hosting
"meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler,
"meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler,
"meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
"meta-llama/Llama-3.1-70B-Instruct-FC": LlamaFCHandler,
"meta-llama/Llama-3.2-1B-Instruct-FC": LlamaFCHandler,
"meta-llama/Llama-3.2-3B-Instruct-FC": LlamaFCHandler,
"meta-llama/Llama-3.1-8B-Instruct": LlamaHandler,
"meta-llama/Llama-3.1-70B-Instruct": LlamaHandler,
"meta-llama/Llama-3.2-1B-Instruct": LlamaHandler,
"meta-llama/Llama-3.2-3B-Instruct": LlamaHandler,
"Salesforce/xLAM-1b-fc-r": SalesforceHandler,
"Salesforce/xLAM-7b-fc-r": SalesforceHandler,
"Salesforce/xLAM-7b-r": SalesforceHandler,
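For context on the new entries above, a minimal sketch of how `handler_map` might be consumed (hypothetical driver code, not part of this diff; the import path below is an assumption):

from bfcl.model_handler.handler_map import handler_map  # assumed module path

model_name = "meta-llama/Llama-3.1-8B-Instruct-FC"
handler_cls = handler_map[model_name]                  # -FC entries resolve to LlamaFCHandler
handler = handler_cls(model_name, temperature=0.001)   # handlers take (model_name, temperature)
# The same checkpoint registered without the -FC suffix resolves to LlamaHandler (prompting mode).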
@@ -21,6 +21,7 @@
class OSSHandler(BaseHandler):
def __init__(self, model_name, temperature, dtype="bfloat16") -> None:
super().__init__(model_name, temperature)
self.model_name_huggingface = model_name
self.model_style = ModelStyle.OSSMODEL
self.dtype = dtype
self.client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")
@@ -57,7 +58,7 @@ def batch_inference(
[
"vllm",
"serve",
str(self.model_name),
str(self.model_name_huggingface),
"--port",
str(VLLM_PORT),
"--dtype",
@@ -205,15 +206,15 @@ def _query_prompting(self, inference_data: dict):

if hasattr(self, "stop_token_ids"):
api_response = self.client.completions.create(
model=self.model_name,
model=self.model_name_huggingface,
temperature=self.temperature,
prompt=formatted_prompt,
stop_token_ids=self.stop_token_ids,
max_tokens=4096, # TODO: Is there a better way to handle this?
)
else:
api_response = self.client.completions.create(
model=self.model_name,
model=self.model_name_huggingface,
temperature=self.temperature,
prompt=formatted_prompt,
max_tokens=4096,
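The new `model_name_huggingface` attribute exists because leaderboard entries may carry a `-FC` suffix that is not a real Hugging Face repo id; `LlamaFCHandler` strips it before the name reaches `vllm serve` and the completions calls above. A small illustration with hypothetical values:

leaderboard_name = "meta-llama/Llama-3.1-8B-Instruct-FC"   # name used for the leaderboard entry
huggingface_name = leaderboard_name.replace("-FC", "")     # repo id that vLLM actually serves
assert huggingface_name == "meta-llama/Llama-3.1-8B-Instruct"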
@@ -1,5 +1,9 @@
from bfcl.model_handler.oss_model.base_oss_handler import OSSHandler


# Note: This is the handler for the Llama models in prompting mode.
# For function-calling mode, use LlamaFCHandler instead.
# The Llama 3 series is benchmarked in prompting mode only, while the Llama 3.1 and 3.2 series are benchmarked in both prompting and function-calling modes.
class LlamaHandler(OSSHandler):
def __init__(self, model_name, temperature) -> None:
super().__init__(model_name, temperature)
@@ -0,0 +1,231 @@
import json

from bfcl.model_handler.oss_model.base_oss_handler import OSSHandler
from bfcl.model_handler.utils import func_doc_language_specific_pre_processing

# TODO: Merge with LlamaHandler


class LlamaFCHandler(OSSHandler):
def __init__(self, model_name, temperature) -> None:
super().__init__(model_name, temperature)
self.model_name_huggingface = model_name.replace("-FC", "")

@staticmethod
def _format_prompt(messages, function):
"""
"bos_token": "<|begin_of_text|>",
"chat_template":
{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
{%- set date_string = "26 Jul 2024" %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "" %}
{%- endif %}

{#- System message + builtin tools #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if builtin_tools is defined or tools is not none %}
{{- "Environment: ipython\n" }}
{%- endif %}
{%- if builtin_tools is defined %}
{{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
{%- endif %}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- "Today Date: " + date_string + "\n\n" }}
{%- if tools is not none and not tools_in_user_message %}
{{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- system_message }}
{{- "<|eot_id|>" }}

{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- set first_user_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
{{- "Given the following functions, please respond with a JSON for a function call " }}
{{- "with its proper arguments that best answers the given prompt.\n\n" }}
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
{{- "Do not use variables.\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- first_user_message + "<|eot_id|>"}}
{%- endif %}

{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
{%- elif 'tool_calls' in message %}
{%- if not message.tool_calls|length == 1 %}
{{- raise_exception("This model only supports single tool-calls at once!") }}
{%- endif %}
{%- set tool_call = message.tool_calls[0].function %}
{%- if builtin_tools is defined and tool_call.name in builtin_tools %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{{- "<|python_tag|>" + tool_call.name + ".call(" }}
{%- for arg_name, arg_val in tool_call.arguments | items %}
{{- arg_name + '="' + arg_val + '"' }}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- ")" }}
{%- else %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{{- '{"name": "' + tool_call.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.arguments | tojson }}
{{- "}" }}
{%- endif %}
{%- if builtin_tools is defined %}
{#- This means we're in ipython mode #}
{{- "<|eom_id|>" }}
{%- else %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
{%- if message.content is mapping or message.content is iterable %}
{{- message.content | tojson }}
{%- else %}
{{- message.content }}
{%- endif %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
"""
formatted_prompt = "<|begin_of_text|>"

system_message = ""
remaining_messages = messages
if messages[0]["role"] == "system":
system_message = messages[0]["content"].strip()
remaining_messages = messages[1:]

formatted_prompt += "<|start_header_id|>system<|end_header_id|>\n\n"
formatted_prompt += "Environment: ipython\n"
formatted_prompt += "Cutting Knowledge Date: December 2023\n"
formatted_prompt += "Today Date: 26 Jul 2024\n\n"
formatted_prompt += system_message + "<|eot_id|>"

# Llama passes custom tools in the first user message
is_first_user_message = True
for message in remaining_messages:
if message["role"] == "user" and is_first_user_message:
is_first_user_message = False
formatted_prompt += "<|start_header_id|>user<|end_header_id|>\n\n"
formatted_prompt += "Given the following functions, please respond with a JSON for a function call "
formatted_prompt += (
"with its proper arguments that best answers the given prompt.\n\n"
)
formatted_prompt += 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.'
formatted_prompt += "Do not use variables.\n\n"
for func in function:
formatted_prompt += json.dumps(func, indent=4) + "\n\n"
formatted_prompt += f"{message['content'].strip()}<|eot_id|>"

elif message["role"] == "tool":
formatted_prompt += "<|start_header_id|>ipython<|end_header_id|>\n\n"
if isinstance(message["content"], (dict, list)):
formatted_prompt += json.dumps(message["content"])
else:
formatted_prompt += message["content"]
formatted_prompt += "<|eot_id|>"

else:
formatted_prompt += f"<|start_header_id|>{message['role']}<|end_header_id|>\n\n{message['content'].strip()}<|eot_id|>"

formatted_prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"

return formatted_prompt
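# Illustrative usage of _format_prompt (not part of this diff); the message and tool docs below
# are hypothetical, and the sketch assumes the bfcl package is importable:
#
#   messages = [{"role": "user", "content": "What is 2 to the power of 10?"}]
#   tools = [{"name": "power", "description": "Raise a to the power of b.",
#             "parameters": {"type": "dict", "properties": {"a": {"type": "integer"},
#                                                           "b": {"type": "integer"}}}}]
#   prompt = LlamaFCHandler._format_prompt(messages, tools)
#
# The result begins with the fixed system block (Environment: ipython, knowledge cutoff, date),
# then the first user turn carrying the JSON-rendered tool docs and the instruction to respond
# with {"name": ..., "parameters": ...}, and ends with the assistant generation header.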

def decode_ast(self, result, language="Python"):
result = result.replace("<|python_tag|>", "")
# Llama sometimes separates the function calls with `;` and sometimes with `,`
if ";" in result:
"""
"<|python_tag|>{\"name\": \"calc_binomial_probability\", \"parameters\": {\"n\": \"10\", \"k\": \"3\", \"p\": \"0\"}}; {\"name\": \"calc_binomial_probability\", \"parameters\": {\"n\": \"15\", \"k\": \"5\", \"p\": \"0\"}}; {\"name\": \"calc_binomial_probability\", \"parameters\": {\"n\": \"20\", \"k\": \"7\", \"p\": \"0\"}}"
"""
function_calls = result.split(";")
function_calls = [json.loads(func_call) for func_call in function_calls]
else:
"""
"[\n {\"name\": \"calculate_permutations\", \"parameters\": {\"n\": \"20\", \"k\": \"5\"}},\n {\"name\": \"calculate_permutations\", \"parameters\": {\"n\": \"12\", \"k\": \"5\"}},\n {\"name\": \"calculate_permutations\", \"parameters\": {\"n\": \"10\", \"k\": \"3\"}}\n]"
"""
function_calls = eval(result)
if type(function_calls) == dict:
function_calls = [function_calls]

decoded_output = []
for func_call in function_calls:
name = func_call["name"]
params = func_call["parameters"]
decoded_output.append({name: params})

return decoded_output

def decode_execute(self, result):
result = result.replace("<|python_tag|>", "")
# Llama sometimes separates the function calls with `;` and sometimes with `,`
if ";" in result:
function_calls = result.split(";")
function_calls = [json.loads(func_call) for func_call in function_calls]
else:
function_calls = eval(result)
if type(function_calls) == dict:
function_calls = [function_calls]

execution_list = []
for func_call in function_calls:
name = func_call["name"]
params = func_call["parameters"]
execution_list.append(
f"{name}({','.join([f'{k}={repr(v)}' for k,v in params.items()])})"
)

return execution_list

def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
functions: list = test_entry["function"]
test_category: str = test_entry["id"].rsplit("_", 1)[0]

functions = func_doc_language_specific_pre_processing(functions, test_category)

# Llama uses its own system prompt

return {"message": [], "function": functions}