ShishirPatil · ShishirPatil · Sep 29, 2024 · Sep 14, 2024 · Sep 27, 2024 · Sep 27, 2024
diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.
 
+- [Sept 27, 2024] [#640](https://github.com/ShishirPatil/gorilla/pull/640): Add the following new models to the leaderboard:
+  - `microsoft/Phi-3.5-mini-instruct`
+  - `microsoft/Phi-3-medium-128k-instruct`
+  - `microsoft/Phi-3-medium-4k-instruct`
+  - `microsoft/Phi-3-small-128k-instruct`
+  - `microsoft/Phi-3-small-8k-instruct`
+  - `microsoft/Phi-3-mini-128k-instruct`
+  - `microsoft/Phi-3-mini-4k-instruct`
 - [Sept 25, 2024] [#660](https://github.com/ShishirPatil/gorilla/pull/660): Bug fix in `parse_nested_value` function to handle nested dictionary values properly. 
 - [Sept 19, 2024] [#644](https://github.com/ShishirPatil/gorilla/pull/644): BFCL V3 release:
   - Introduce new multi-turn dataset and state-based evaluation metric

diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md
@@ -170,6 +170,10 @@ Below is _a table of models we support_ to run our leaderboard evaluation agains
 |Salesforce/xLAM-7b-r 💻| Function Calling|
 |Salesforce/xLAM-8x7b-r 💻| Function Calling|
 |Salesforce/xLAM-8x22b-r 💻| Function Calling|
+|microsoft/Phi-3.5-mini-instruct 💻| Prompt|
+|microsoft/Phi-3-medium-{4k,128k}-instruct 💻| Prompt|
+|microsoft/Phi-3-small-{8k,128k}-instruct 💻| Prompt|
+|microsoft/Phi-3-mini-{4k,128k}-instruct 💻| Prompt|
 |nvidia/nemotron-4-340b-instruct| Prompt|
 |THUDM/glm-4-9b-chat 💻| Function Calling|
 |ibm-granite/granite-20b-functioncalling 💻| Function Calling|

diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py
@@ -454,7 +454,49 @@
         "https://huggingface.co/MadeAgents/Hammer-7b",
         "MadeAgents",
         "cc-by-nc-4.0",
-    ]
+    ],
+    "microsoft/Phi-3-mini-4k-instruct": [
+        "Phi-3-mini-4k-instruct (Prompt)",
+        "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct",
+        "Microsoft",
+        "MIT",
+    ],
+    "microsoft/Phi-3-mini-128k-instruct": [
+        "Phi-3-mini-128k-instruct (Prompt)",
+        "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct",
+        "Microsoft",
+        "MIT",
+    ],
+    "microsoft/Phi-3-small-8k-instruct": [
+        "Phi-3-small-8k-instruct (Prompt)",
+        "https://huggingface.co/microsoft/Phi-3-small-8k-instruct",
+        "Microsoft",
+        "MIT",
+    ],
+    "microsoft/Phi-3-small-128k-instruct": [
+        "Phi-3-small-128k-instruct (Prompt)",
+        "https://huggingface.co/microsoft/Phi-3-small-128k-instruct",
+        "Microsoft",
+        "MIT",
+    ],
+    "microsoft/Phi-3-medium-4k-instruct": [
+        "Phi-3-medium-4k-instruct (Prompt)",
+        "https://huggingface.co/microsoft/Phi-3-medium-4k-instruct",
+        "Microsoft",
+        "MIT",
+    ],
+    "microsoft/Phi-3-medium-128k-instruct": [
+        "Phi-3-medium-128k-instruct (Prompt)",
+        "https://huggingface.co/microsoft/Phi-3-medium-128k-instruct",
+        "Microsoft",
+        "MIT",
+    ],
+    "microsoft/Phi-3.5-mini-instruct": [
+        "Phi-3.5-mini-instruct (Prompt)",
+        "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
+        "Microsoft",
+        "MIT",
+    ],
 }
 
 INPUT_PRICE_PER_MILLION_TOKEN = {

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py
@@ -6,6 +6,7 @@
 from bfcl.model_handler.oss_model.hammer import HammerHandler
 from bfcl.model_handler.oss_model.hermes import HermesHandler
 from bfcl.model_handler.oss_model.llama import LlamaHandler
+from bfcl.model_handler.oss_model.phi import PhiHandler
 from bfcl.model_handler.oss_model.salesforce import SalesforceHandler
 from bfcl.model_handler.proprietary_model.claude import ClaudeHandler
 from bfcl.model_handler.proprietary_model.cohere import CohereHandler
@@ -76,6 +77,13 @@
     "Salesforce/xLAM-7b-r": SalesforceHandler,
     "Salesforce/xLAM-8x22b-r": SalesforceHandler,
     "Salesforce/xLAM-8x7b-r": SalesforceHandler,
+    "microsoft/Phi-3-mini-4k-instruct": PhiHandler,
+    "microsoft/Phi-3-mini-128k-instruct": PhiHandler,
+    "microsoft/Phi-3-small-8k-instruct": PhiHandler,
+    "microsoft/Phi-3-small-128k-instruct": PhiHandler,
+    "microsoft/Phi-3-medium-4k-instruct": PhiHandler,
+    "microsoft/Phi-3-medium-128k-instruct": PhiHandler,
+    "microsoft/Phi-3.5-mini-instruct": PhiHandler,
     "NousResearch/Hermes-2-Pro-Mistral-7B": HermesHandler,
     "NousResearch/Hermes-2-Pro-Llama-3-8B": HermesHandler,
     "NousResearch/Hermes-2-Theta-Llama-3-8B": HermesHandler,

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base_oss_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base_oss_handler.py
@@ -34,8 +34,7 @@ def inference(self, test_entry: dict, include_debugging_log: bool):
             "OSS Models should call the batch_inference method instead."
         )
 
-    @staticmethod
-    def _format_prompt(messages, function):
+    def _format_prompt(self, messages, function):
         raise NotImplementedError(
             "OSS Models should implement their own prompt formatting."
         )
@@ -66,6 +65,7 @@ def batch_inference(
                 str(num_gpus),
                 "--gpu-memory-utilization",
                 str(gpu_memory_utilization),
+                "--trust-remote-code",
             ],
             stdout=subprocess.PIPE,  # Capture stdout
             stderr=subprocess.PIPE,  # Capture stderr

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/deepseek.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/deepseek.py
@@ -6,8 +6,7 @@ class DeepseekHandler(OSSHandler):
     def __init__(self, model_name, temperature) -> None:
         super().__init__(model_name, temperature)
 
-    @staticmethod
-    def _format_prompt(messages, function):
+    def _format_prompt(self, messages, function):
         """
         "bos_token": {
             "__type": "AddedToken",

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/gemma.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/gemma.py
@@ -12,8 +12,7 @@ class GemmaHandler(OSSHandler):
     def __init__(self, model_name, temperature) -> None:
         super().__init__(model_name, temperature)
 
-    @staticmethod
-    def _format_prompt(messages, function):
+    def _format_prompt(self, messages, function):
         """
         "bos_token": "<bos>",
         "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/glm.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/glm.py
@@ -15,8 +15,7 @@ def __init__(self, model_name, temperature) -> None:
         super().__init__(model_name, temperature)
         self.stop_token_ids = [151329, 151336, 151338]
 
-    @staticmethod
-    def _format_prompt(messages, function):
+    def _format_prompt(self, messages, function):
         """
         "chat_template": "[gMASK]<sop>{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n在调用上述函数时，请使用 Json 格式表示调用的参数。{% elif tool['type'] == 'python' %}\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时，该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出，或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中，`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用，这些在线内容的访问将不会成功。{% elif tool['type'] == 'simple_browser' %}\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数：\n`search(query: str, recency_days: int)`：使用搜索引擎进行查询并显示结果，可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`：获取一系列指定 id 的页面内容。每次调用时，须选择3-10个页面。选择多个角度的页面，同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的，你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`：打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤：1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL，也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息，也可以再次使用 `search` 进行搜索。{% elif tool['type'] == 'cogview' %}\n\n## cogview\n\n如果用户的请求中包含了对图像的描述，你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述，规则：\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求，需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明，所在地为中国，持有中国立场并遵循中国社会主义价值观。{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
         """

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/granite.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/granite.py
@@ -13,8 +13,7 @@ class GraniteHandler(OSSHandler):
     def __init__(self, model_name, temperature) -> None:
         super().__init__(model_name, temperature)
 
-    @staticmethod
-    def _format_prompt(messages, function):
+    def _format_prompt(self, messages, function):
         """
         "chat_template": "{% set function_str = messages.get('functions_str', {}) %}\n{% set query = messages['query'] %}\n{% set sys_prompt = 'You are a helpful assistant with access to the following function calls. Your task is to produce a sequence of function calls necessary to generate response to the user utterance. Use the following function calls as required. ' %}\n{% set funcstr = function_str|join('\n') %}\n{{ 'SYSTEM: ' + sys_prompt + '\n<|function_call_library|>\n' + funcstr + '\n\nIf none of the functions are relevant or the given question lacks the parameters required by the function, please output \"<function_call> {\"name\": \"no_function\", \"arguments\": {}}\".\n\nUSER: ' + query}}\n{% if add_generation_prompt %}\n{{ 'ASSISTANT:' }}{% endif %}",
         """

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hammer.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hammer.py
@@ -26,8 +26,7 @@ class HammerHandler(SalesforceHandler):
     def __init__(self, model_name, temperature) -> None:
         super().__init__(model_name, temperature)
 
-    @staticmethod
-    def _format_prompt(messages, function):
+    def _format_prompt(self, messages, function):
         """
         "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
         """

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hermes.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hermes.py
@@ -13,8 +13,7 @@ def __init__(self, model_name, temperature) -> None:
         if model_name == "NousResearch/Hermes-2-Pro-Llama-3-8B":
             self.dtype = "float16"
 
-    @staticmethod
-    def _format_prompt(messages, function):
+    def _format_prompt(self, messages, function):
         # Hermes use Langchain to OpenAI conversion. It does not use tool call but function call.
         function = convert_to_tool(function, GORILLA_TO_OPENAPI, ModelStyle.OSSMODEL)
         pydantic_format = """{"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}"""

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py
@@ -4,8 +4,7 @@ class LlamaHandler(OSSHandler):
     def __init__(self, model_name, temperature) -> None:
         super().__init__(model_name, temperature)
 
-    @staticmethod
-    def _format_prompt(messages, function):
+    def _format_prompt(self, messages, function):
         formatted_prompt = "<|begin_of_text|>"
 
         for message in messages:

diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/phi.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/phi.py
@@ -0,0 +1,90 @@
+from bfcl.model_handler.constant import DEFAULT_SYSTEM_PROMPT
+from bfcl.model_handler.oss_model.base_oss_handler import OSSHandler
+from bfcl.model_handler.utils import (
+    combine_consecutive_user_prompts,
+    convert_system_prompt_into_user_prompt,
+    func_doc_language_specific_pre_processing,
+    system_prompt_pre_processing_chat_model,
+)
+
+
+class PhiHandler(OSSHandler):
+    """Prompt-mode handler for the Microsoft Phi-3 / Phi-3.5 instruct models.
+
+    Phi-3-small variants are special-cased in two places: their prompt starts
+    with an explicit ``<|endoftext|>`` BOS token, and each of their turns is
+    rewritten before inference because Phi-3-small doesn't allow system role.
+    """
+
+    def __init__(self, model_name, temperature) -> None:
+        super().__init__(model_name, temperature)
+
+    def _format_prompt(self, messages, function):
+        """Render ``messages`` into a single Phi chat prompt string.
+
+        Args:
+            messages: Chat messages; each item is indexed by ``"role"`` and
+                ``"content"``.
+            function: Unused here; kept to match the
+                ``_format_prompt(self, messages, function)`` handler signature.
+
+        Returns:
+            The rendered turns followed by the ``<|assistant|>`` generation
+            prompt.
+        """
+        # Reference tokenizer config for Phi-3-small:
+        #   "bos_token": "<|endoftext|>",
+        #   "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+        #   "eos_token": "<|endoftext|>",
+        #
+        # Reference tokenizer config for Phi-3.5-mini, Phi-3-medium, Phi-3-mini:
+        #   "bos_token": "<s>",
+        #   "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+        #
+        # NOTE(review): the second template skips system messages with empty
+        # content and any role other than system/user/assistant, whereas every
+        # message is rendered below -- confirm callers never pass such messages.
+        bos_token = "<|endoftext|>" if "Phi-3-small" in self.model_name else ""
+        turns = [
+            f"<|{message['role']}|>\n{message['content']}<|end|>\n"
+            for message in messages
+        ]
+
+        # The trailing assistant header is the template's generation prompt.
+        return bos_token + "".join(turns) + "<|assistant|>\n"
+
+    def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
+        """Prepare a test entry for prompt-mode inference.
+
+        ``test_entry`` is modified in place: the first turn of
+        ``test_entry["question"]`` is replaced with the result of
+        ``system_prompt_pre_processing_chat_model`` and, for Phi-3-small, every
+        turn is passed through ``convert_system_prompt_into_user_prompt`` and
+        then ``combine_consecutive_user_prompts``.
+
+        Args:
+            test_entry: Test case read via its ``"function"``, ``"id"`` and
+                ``"question"`` keys.
+
+        Returns:
+            A dict with an empty ``"message"`` list and, under ``"function"``,
+            the value returned by ``func_doc_language_specific_pre_processing``.
+        """
+        functions: list = test_entry["function"]
+        # The category is the entry id without its last "_"-separated segment.
+        test_category: str = test_entry["id"].rsplit("_", 1)[0]
+
+        functions = func_doc_language_specific_pre_processing(functions, test_category)
+
+        turns = test_entry["question"]
+        turns[0] = system_prompt_pre_processing_chat_model(
+            turns[0], DEFAULT_SYSTEM_PROMPT, functions
+        )
+
+        if "Phi-3-small" in self.model_name:
+            # Phi-3-small doesn't allow system role
+            for round_idx, turn in enumerate(turns):
+                turn = convert_system_prompt_into_user_prompt(turn)
+                turns[round_idx] = combine_consecutive_user_prompts(turn)
+
+        return {"message": [], "function": functions}