From b1b67b1be665bdde19d5bc9817415155711fd191 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sun, 7 Jul 2024 18:47:48 -0400 Subject: [PATCH 01/35] Move `*.jsonl` files from `eval_checker` to `data` dir --- .../api_status_check_ground_truth_REST.jsonl} | 0 .../api_status_check_ground_truth_executable.jsonl} | 0 .../{eval_checker => data}/rest-eval-response_v5.jsonl | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename berkeley-function-call-leaderboard/{eval_checker/api_status_check_ground_truth_REST.json => data/api_status_check_ground_truth_REST.jsonl} (100%) rename berkeley-function-call-leaderboard/{eval_checker/api_status_check_ground_truth_executable.json => data/api_status_check_ground_truth_executable.jsonl} (100%) rename berkeley-function-call-leaderboard/{eval_checker => data}/rest-eval-response_v5.jsonl (100%) diff --git a/berkeley-function-call-leaderboard/eval_checker/api_status_check_ground_truth_REST.json b/berkeley-function-call-leaderboard/data/api_status_check_ground_truth_REST.jsonl similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/api_status_check_ground_truth_REST.json rename to berkeley-function-call-leaderboard/data/api_status_check_ground_truth_REST.jsonl diff --git a/berkeley-function-call-leaderboard/eval_checker/api_status_check_ground_truth_executable.json b/berkeley-function-call-leaderboard/data/api_status_check_ground_truth_executable.jsonl similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/api_status_check_ground_truth_executable.json rename to berkeley-function-call-leaderboard/data/api_status_check_ground_truth_executable.jsonl diff --git a/berkeley-function-call-leaderboard/eval_checker/rest-eval-response_v5.jsonl b/berkeley-function-call-leaderboard/data/rest-eval-response_v5.jsonl similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/rest-eval-response_v5.jsonl rename to berkeley-function-call-leaderboard/data/rest-eval-response_v5.jsonl From 1a0a9322422133eab0861b7783b2ca94d0f5bf9d Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sun, 7 Jul 2024 18:49:12 -0400 Subject: [PATCH 02/35] Add `pyproject.toml` file --- .../pyproject.toml | 42 +++++++++++++++++++ .../requirements.txt | 11 ----- 2 files changed, 42 insertions(+), 11 deletions(-) create mode 100644 berkeley-function-call-leaderboard/pyproject.toml delete mode 100644 berkeley-function-call-leaderboard/requirements.txt diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml new file mode 100644 index 000000000..48d16c087 --- /dev/null +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -0,0 +1,42 @@ +[tool.poetry] +name = "bfcl" +version = "0.1.0" +description = "Berkeley Function Calling Leaderboard (BFCL)" +authors = ["NAME "] +readme = "README.md" +repository = "https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard" + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +requests = "*" +tqdm = "*" +numpy = "*" +shortuuid = "*" +huggingface_hub = "*" +pydantic = "^2.8.2" +python-dotenv = "^1.0.1" +tree-sitter = "^0.21.0" +datasets = "^2.19.2" +openai = "^1.35.10" +tree-sitter-java = "0.21.0" +tree-sitter-javascript = "0.21.4" +vllm = { version = "0.5.1", optional = true } +mistralai = { version = "^0.4.2", optional = true } +anthropic = { version = "^0.29.0", optional = true } +cohere = { version = "^5.2.5", optional = true } + +[tool.poetry.extras] +oss_eval = ["vllm"] +proprietary_eval = [ + 
"mistralai", + "anthropic", + "cohere" +] + +[tool.poetry.scripts] +bfcl_benchmark = "bfcl.benchmark:main" +bfcl_eval = "bfcl.evaluate:main" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/berkeley-function-call-leaderboard/requirements.txt b/berkeley-function-call-leaderboard/requirements.txt deleted file mode 100644 index 974024adf..000000000 --- a/berkeley-function-call-leaderboard/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -requests -tqdm -tree_sitter~=0.21.0 -torch -ray -shortuuid -mistralai -anthropic~=0.29.0 -openai -numpy -cohere~=5.2.5 From ae8984147e9f9acbae9759bfd6cb7fda7a0cd472 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sun, 7 Jul 2024 18:49:40 -0400 Subject: [PATCH 03/35] Ignore `poetry.lock` and `.cache` dir --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 02096f5be..ae2f97ef7 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,5 @@ berkeley-function-call-leaderboard/score/ .direnv/ .venv +poetry.lock +.cache \ No newline at end of file From 76e1bde32fe6da49181b85016a929d1af89a8faa Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sun, 7 Jul 2024 18:50:49 -0400 Subject: [PATCH 04/35] Add `.env.example` containing all the env vars --- .../.env.example | 19 +++++++++++++++++++ .../function_credential_config.json | 1 - 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 berkeley-function-call-leaderboard/.env.example delete mode 100644 berkeley-function-call-leaderboard/function_credential_config.json diff --git a/berkeley-function-call-leaderboard/.env.example b/berkeley-function-call-leaderboard/.env.example new file mode 100644 index 000000000..0105854d1 --- /dev/null +++ b/berkeley-function-call-leaderboard/.env.example @@ -0,0 +1,19 @@ +# [OPTIONAL] Only required for respective proprietary model evaluation +OPENAI_API_KEY=sk-XXXXXX +MISTRAL_API_KEY= +FIREWORKS_API_KEY= +ANTHROPIC_API_KEY= +NVIDIA_API_KEY=nvapi-XXXXXX +GEMINI_GCP_PROJECT_ID= + +COHERE_API_KEY= +USE_COHERE_OPTIMIZATION=False # True/False + +DATABRICKS_API_KEY= +DATABRICKS_AZURE_ENDPOINT_URL= + +# [OPTIONAL] Only required for evaluating executable test categories +RAPID_API_KEY= +EXCHANGERATE_API_KEY= +OMDB_API_KEY= +GEOCODE_API_KEY= diff --git a/berkeley-function-call-leaderboard/function_credential_config.json b/berkeley-function-call-leaderboard/function_credential_config.json deleted file mode 100644 index 9d36e9bbd..000000000 --- a/berkeley-function-call-leaderboard/function_credential_config.json +++ /dev/null @@ -1 +0,0 @@ -[{"RAPID-API-KEY" : ""},{"EXCHANGERATE-API-KEY" : ""},{"OMDB-API-KEY" : ""}, {"GEOCODE-API-KEY": ""}] \ No newline at end of file From e11240d5a407838d0bc02f29a067949bd28587e8 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sun, 7 Jul 2024 18:53:45 -0400 Subject: [PATCH 05/35] Remove `changelog` from `README` --- .../CHANGELOG.md | 42 +++++++++++ berkeley-function-call-leaderboard/README.md | 71 +------------------ 2 files changed, 44 insertions(+), 69 deletions(-) create mode 100644 berkeley-function-call-leaderboard/CHANGELOG.md diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md new file mode 100644 index 000000000..7479358e4 --- /dev/null +++ b/berkeley-function-call-leaderboard/CHANGELOG.md @@ -0,0 +1,42 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
+ +* [July 3, 2024] [#489](https://github.com/ShishirPatil/gorilla/pull/489): Add new model `nvidia/nemotron-4-340b-instruct` to the leaderboard. +* [June 18, 2024] [#470](https://github.com/ShishirPatil/gorilla/pull/470): Add new model `firefunction-v2-FC` to the leaderboard. +* [June 15, 2024] [#437](https://github.com/ShishirPatil/gorilla/pull/437): Fix prompting issues for `Nexusflow-Raven-v2 (FC)`. +* [June 7, 2024] [#407](https://github.com/ShishirPatil/gorilla/pull/407), [#462](https://github.com/ShishirPatil/gorilla/pull/462): Update the AST evaluation logic to allow the use of `int` values for Python parameters expecting `float` values. This is to accommodate the Python auto-conversion feature from `int` to `float`. +* [May 14, 2024] [#426](https://github.com/ShishirPatil/gorilla/pull/426): + - Add the following new models to the leaderboard: + + `gpt-4o-2024-05-13` + + `gpt-4o-2024-05-13-FC` + + `gemini-1.5-pro-preview-0514` + + `gemini-1.5-flash-preview-0514` + - Update price for the following models: + + All Gemini Series + + `Claude-2.1 (Prompt)` and `Claude-instant-1.2 (Prompt)` + + `Mistral-large` and `Mistral-Small` + + `GPT-3.5-Turbo-0125` +* [May 8, 2024] [#406](https://github.com/ShishirPatil/gorilla/pull/406) and [#421](https://github.com/ShishirPatil/gorilla/pull/421): Update the `gemini_handler.py` to better handle parallel function calls for Gemini models. +* [May 6, 2024] [#412](https://github.com/ShishirPatil/gorilla/pull/412): Bug fix in evaluation dataset for AST categories. This includes updates to both prompts and function docs. +* [May 2, 2024] [#405](https://github.com/ShishirPatil/gorilla/pull/405): Bug fix in the possible answers for the AST Simple evaluation dataset. Prompt and function docs are not affected. +* [April 28, 2024] [#397](https://github.com/ShishirPatil/gorilla/pull/397): Add new model `snowflake/arctic` to the leaderboard. Note that there are multiple ways to inference the model, and we choose to do it via Nvidia API catalog. +* [April 27, 2024] [#390](https://github.com/ShishirPatil/gorilla/pull/390): Bug fix in cost and latency calculation for open-source models, which are now all calculated when serving the model with [vLLM](https://github.com/vllm-project/vllm) using 8 V100 GPUs for consistency. $$\text{Cost} = \text{Latency per 1000 function call} * (\text{8xV100 azure-pay-as-you-go-price per hour / 3600})$$ +* [April 25, 2024] [#386](https://github.com/ShishirPatil/gorilla/pull/386): Add 5 new models to the leaderboard: `meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `gemini-1.5-pro-preview-0409`, `command-r-plus`, `command-r-plus-FC`. +* [April 19, 2024] [#377](https://github.com/ShishirPatil/gorilla/pull/377): + - Bug fix for the evaluation dataset in the executable test categories. This includes updates to both prompts and function docs. + - The `evaluation_result` field has been removed to accommodate the variability in API execution results across different evaluation runs. Instead, a human-verified `ground_truth` is now included for the executable test categories. During each evaluation run, `evaluation_result` is generated anew using the `ground_truth`, and then compared against the model output. + - A stricter metric has been adopted when using the `structural_match` (aka. type match) evaluation criteria ---- For `list` results, the lengths are compared; for `dict` results, the keys are matched. 
This is to account for the fast-changing nature of some of the real-time API results while ensuring the evaluation remains meaningful. + - Added another evaluation criteria `real_time_match` for the executable category, which is a looser form of `exact_match` specifically for numerical execution results. The execution result must be within a certain percentage threshold (20%) from the expected result to accommodate the live updates of API responses. User can change this threshold value in `eval_checker_constant.py`. +* [April 18, 2024] [#375](https://github.com/ShishirPatil/gorilla/pull/375): A more comprehensive API sanity check is included; the APIs that are invoked during the non-REST executable evaluation process will also be checked for their availability before running the evaluation. Also, add support for the shortcut `-s` for the `--skip-api-sanity-check` flag, based on the community feedback. +* [April 16, 2024] [#366](https://github.com/ShishirPatil/gorilla/pull/366): Switch to use Anthropic's new Tool Use Beta `tools-2024-04-04` when generating Claude 3 FC series data. `gpt-4-turbo-2024-04-09` and `gpt-4-turbo-2024-04-09-FC` are also added to the leaderboard. +* [April 11, 2024] [#347](https://github.com/ShishirPatil/gorilla/pull/347): Add the 95th percentile latency to the leaderboard statistics. This metric is useful for understanding the latency distribution of the models, especially the worst-case scenario. +* [April 10, 2024] [#339](https://github.com/ShishirPatil/gorilla/pull/339): Introduce REST API sanity check for the REST executable test category. It ensures that all the API endpoints involved during the execution evaluation process are working properly. If any of them are not behaving as expected, the evaluation process will be stopped by default as the result will be inaccurate. Users can choose to bypass this check by setting the `--skip-api-sanity-check` flag or `-s` for short. +* [April 9, 2024] [#338](https://github.com/ShishirPatil/gorilla/pull/338): Bug fix in the evaluation datasets (including both prompts and function docs). Bug fix for possible answers as well. +* [April 8, 2024] [#330](https://github.com/ShishirPatil/gorilla/pull/330): Fixed an oversight that was introduced in [#299](https://github.com/ShishirPatil/gorilla/pull/299). For function-calling (FC) models that cannot take `float` type in input, when the parameter type is a `float`, the evaluation procedure will convert that type to `number` in the model input and mention in the parameter description that `This is a float type value.`. An additional field `format: float` will also be included in the model input to make it clear about the type. Updated the model handler for Claude, Mistral, and OSS to better parse the model output. +* [April 8, 2024] [#327](https://github.com/ShishirPatil/gorilla/pull/327): Add new model `NousResearch/Hermes-2-Pro-Mistral-7B` to the leaderboard. +* [April 3, 2024] [#309](https://github.com/ShishirPatil/gorilla/pull/309): Bug fix for evaluation dataset possible answers. Implement **string standardization** for the AST evaluation pipeline, i.e. removing white spaces and a subset of punctuations (`,./-_*^`) to make the AST evaluation more robust and accurate. Fixed AST evaluation issue for type `tuple`. Add 2 new models `meetkai/functionary-small-v2.4 (FC)`, `meetkai/functionary-medium-v2.4 (FC)` to the leaderboard. 
+* [April 1, 2024] [#299](https://github.com/ShishirPatil/gorilla/pull/299): Leaderboard update with new models (`Claude-3-Haiku`, `Databrick-DBRX-Instruct`), more advanced AST evaluation procedure, and updated evaluation datasets. Cost and latency statistics during evaluation are also measured. We also released the manual that our evaluation procedure is based on, available [here](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#metrics). +* [Mar 11, 2024] [#254](https://github.com/ShishirPatil/gorilla/pull/254): Leaderboard update with 3 new models: `Claude-3-Opus-20240229 (Prompt)`, `Claude-3-Sonnet-20240229 (Prompt)`, and `meetkai/functionary-medium-v2.2 (FC)` +* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `gemma`. +* [Feb 29, 2024] [#223](https://github.com/ShishirPatil/gorilla/pull/223): modifications to REST evaluation. diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md index 2a3c78b7a..812d4f34b 100644 --- a/berkeley-function-call-leaderboard/README.md +++ b/berkeley-function-call-leaderboard/README.md @@ -12,9 +12,8 @@ We present Berkeley Function Leaderboard, the **first comprehensive and executab Read more about the technical details and interesting insights in our blog post! ![image](./architecture_diagram.png) -### Install Dependencies -Before generating the leaderboard statistics, you should install dependencies using the following command: +### Install Dependencies ```bash conda create -n BFCL python=3.10 @@ -22,25 +21,6 @@ conda activate BFCL pip install -r requirements.txt # Inside ./berkeley-function-call-leaderboard pip install vllm # If you have vLLM supported GPU(s) and want to run our evaluation data against self-hosted OSS models. ``` -If you plan to evaluate on OSS models, we are using vLLM for inference and refer to https://github.com/vllm-project/vllm for detail. We recommend to inference on at least V100s, A100s, and latest GPUs that are supported by vLLM. - -### Checker Setup (required for Java, JavaScript test categories) -We use `tree-sitter` to do the AST parsing for Java and JavaScript test categories. Thus, you need to install `tree-sitter`. - -The git clones need to be under the `/berkeley-function-call-leaderboard/eval_checker` folder. - -```bash -cd ./eval_checker -git clone https://github.com/tree-sitter/tree-sitter-java.git -git clone https://github.com/tree-sitter/tree-sitter-javascript.git -``` - -Now, move back to `/berkeley-function-call-leaderboard` by `cd ..`, and create two symbolic links to the `tree-sitter-java` and `tree-sitter-javascript` directories. This is required to run `openfunctions_evaluation.py`. - -``` -ln -s eval_checker/tree-sitter-java tree-sitter-java -ln -s eval_checker/tree-sitter-javascript tree-sitter-javascript -``` ## Prepare Evaluation Dataset @@ -50,14 +30,12 @@ To download the evaluation dataset from huggingface, from the current directory huggingface-cli download gorilla-llm/Berkeley-Function-Calling-Leaderboard --local-dir ./data --repo-type dataset ``` - This will download our dataset to `data` repository. ## Evaluation Dataset The evaluation datasets are now stored in the `./data` folder. The possible answers are stored in the `./data/possible_answer` folder. 
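
The files under `./data` are line-delimited JSON (one object per line), the same convention the handlers later in this series use when reading results. Below is a small inspection sketch, not part of the patch; the file name is taken from the renames in the first commit, and the relative path assumes you run it from inside `./berkeley-function-call-leaderboard`:

```python
# Quick inspection sketch -- not part of the patch.
import json
from pathlib import Path

data_file = Path("data") / "rest-eval-response_v5.jsonl"  # moved into ./data by this series

with open(data_file) as f:
    records = [json.loads(line) for line in f if line.strip()]

print(f"{len(records)} records; first record type: {type(records[0]).__name__}")
```
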
- ## Execution Evaluation Data Post-processing Input your API keys into `function_credential_config.json`, so that the original placeholder values in questions, params, and answers will be cleaned. @@ -233,50 +211,7 @@ For Mistral large and small models, we provide evaluation on both of their `Any` For inferencing `Gemini-1.0-pro`, you need to fill in `model_handler/gemini_handler.py` with your GCP project ID that has access to Vertex AI endpoint. -For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure workspace and setup an endpoint for inference. - - -## Changelog - -* [July 3, 2024] [#489](https://github.com/ShishirPatil/gorilla/pull/489): Add new model `nvidia/nemotron-4-340b-instruct` to the leaderboard. -* [June 18, 2024] [#470](https://github.com/ShishirPatil/gorilla/pull/470): Add new model `firefunction-v2-FC` to the leaderboard. -* [June 15, 2024] [#437](https://github.com/ShishirPatil/gorilla/pull/437): Fix prompting issues for `Nexusflow-Raven-v2 (FC)`. -* [June 7, 2024] [#407](https://github.com/ShishirPatil/gorilla/pull/407), [#462](https://github.com/ShishirPatil/gorilla/pull/462): Update the AST evaluation logic to allow the use of `int` values for Python parameters expecting `float` values. This is to accommodate the Python auto-conversion feature from `int` to `float`. -* [May 14, 2024] [#426](https://github.com/ShishirPatil/gorilla/pull/426): - - Add the following new models to the leaderboard: - + `gpt-4o-2024-05-13` - + `gpt-4o-2024-05-13-FC` - + `gemini-1.5-pro-preview-0514` - + `gemini-1.5-flash-preview-0514` - - Update price for the following models: - + All Gemini Series - + `Claude-2.1 (Prompt)` and `Claude-instant-1.2 (Prompt)` - + `Mistral-large` and `Mistral-Small` - + `GPT-3.5-Turbo-0125` -* [May 8, 2024] [#406](https://github.com/ShishirPatil/gorilla/pull/406) and [#421](https://github.com/ShishirPatil/gorilla/pull/421): Update the `gemini_handler.py` to better handle parallel function calls for Gemini models. -* [May 6, 2024] [#412](https://github.com/ShishirPatil/gorilla/pull/412): Bug fix in evaluation dataset for AST categories. This includes updates to both prompts and function docs. -* [May 2, 2024] [#405](https://github.com/ShishirPatil/gorilla/pull/405): Bug fix in the possible answers for the AST Simple evaluation dataset. Prompt and function docs are not affected. -* [April 28, 2024] [#397](https://github.com/ShishirPatil/gorilla/pull/397): Add new model `snowflake/arctic` to the leaderboard. Note that there are multiple ways to inference the model, and we choose to do it via Nvidia API catalog. -* [April 27, 2024] [#390](https://github.com/ShishirPatil/gorilla/pull/390): Bug fix in cost and latency calculation for open-source models, which are now all calculated when serving the model with [vLLM](https://github.com/vllm-project/vllm) using 8 V100 GPUs for consistency. $$\text{Cost} = \text{Latency per 1000 function call} * (\text{8xV100 azure-pay-as-you-go-price per hour / 3600})$$ -* [April 25, 2024] [#386](https://github.com/ShishirPatil/gorilla/pull/386): Add 5 new models to the leaderboard: `meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `gemini-1.5-pro-preview-0409`, `command-r-plus`, `command-r-plus-FC`. -* [April 19, 2024] [#377](https://github.com/ShishirPatil/gorilla/pull/377): - - Bug fix for the evaluation dataset in the executable test categories. This includes updates to both prompts and function docs. 
- - The `evaluation_result` field has been removed to accommodate the variability in API execution results across different evaluation runs. Instead, a human-verified `ground_truth` is now included for the executable test categories. During each evaluation run, `evaluation_result` is generated anew using the `ground_truth`, and then compared against the model output. - - A stricter metric has been adopted when using the `structural_match` (aka. type match) evaluation criteria ---- For `list` results, the lengths are compared; for `dict` results, the keys are matched. This is to account for the fast-changing nature of some of the real-time API results while ensuring the evaluation remains meaningful. - - Added another evaluation criteria `real_time_match` for the executable category, which is a looser form of `exact_match` specifically for numerical execution results. The execution result must be within a certain percentage threshold (20%) from the expected result to accommodate the live updates of API responses. User can change this threshold value in `eval_checker_constant.py`. -* [April 18, 2024] [#375](https://github.com/ShishirPatil/gorilla/pull/375): A more comprehensive API sanity check is included; the APIs that are invoked during the non-REST executable evaluation process will also be checked for their availability before running the evaluation. Also, add support for the shortcut `-s` for the `--skip-api-sanity-check` flag, based on the community feedback. -* [April 16, 2024] [#366](https://github.com/ShishirPatil/gorilla/pull/366): Switch to use Anthropic's new Tool Use Beta `tools-2024-04-04` when generating Claude 3 FC series data. `gpt-4-turbo-2024-04-09` and `gpt-4-turbo-2024-04-09-FC` are also added to the leaderboard. -* [April 11, 2024] [#347](https://github.com/ShishirPatil/gorilla/pull/347): Add the 95th percentile latency to the leaderboard statistics. This metric is useful for understanding the latency distribution of the models, especially the worst-case scenario. -* [April 10, 2024] [#339](https://github.com/ShishirPatil/gorilla/pull/339): Introduce REST API sanity check for the REST executable test category. It ensures that all the API endpoints involved during the execution evaluation process are working properly. If any of them are not behaving as expected, the evaluation process will be stopped by default as the result will be inaccurate. Users can choose to bypass this check by setting the `--skip-api-sanity-check` flag or `-s` for short. -* [April 9, 2024] [#338](https://github.com/ShishirPatil/gorilla/pull/338): Bug fix in the evaluation datasets (including both prompts and function docs). Bug fix for possible answers as well. -* [April 8, 2024] [#330](https://github.com/ShishirPatil/gorilla/pull/330): Fixed an oversight that was introduced in [#299](https://github.com/ShishirPatil/gorilla/pull/299). For function-calling (FC) models that cannot take `float` type in input, when the parameter type is a `float`, the evaluation procedure will convert that type to `number` in the model input and mention in the parameter description that `This is a float type value.`. An additional field `format: float` will also be included in the model input to make it clear about the type. Updated the model handler for Claude, Mistral, and OSS to better parse the model output. -* [April 8, 2024] [#327](https://github.com/ShishirPatil/gorilla/pull/327): Add new model `NousResearch/Hermes-2-Pro-Mistral-7B` to the leaderboard. 
-* [April 3, 2024] [#309](https://github.com/ShishirPatil/gorilla/pull/309): Bug fix for evaluation dataset possible answers. Implement **string standardization** for the AST evaluation pipeline, i.e. removing white spaces and a subset of punctuations (`,./-_*^`) to make the AST evaluation more robust and accurate. Fixed AST evaluation issue for type `tuple`. Add 2 new models `meetkai/functionary-small-v2.4 (FC)`, `meetkai/functionary-medium-v2.4 (FC)` to the leaderboard. -* [April 1, 2024] [#299](https://github.com/ShishirPatil/gorilla/pull/299): Leaderboard update with new models (`Claude-3-Haiku`, `Databrick-DBRX-Instruct`), more advanced AST evaluation procedure, and updated evaluation datasets. Cost and latency statistics during evaluation are also measured. We also released the manual that our evaluation procedure is based on, available [here](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#metrics). -* [Mar 11, 2024] [#254](https://github.com/ShishirPatil/gorilla/pull/254): Leaderboard update with 3 new models: `Claude-3-Opus-20240229 (Prompt)`, `Claude-3-Sonnet-20240229 (Prompt)`, and `meetkai/functionary-medium-v2.2 (FC)` -* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `gemma`. -* [Feb 29, 2024] [#223](https://github.com/ShishirPatil/gorilla/pull/223): modifications to REST evaluation. - +For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure workspace and setup an endpoint for inference. ## Contributing @@ -296,8 +231,6 @@ To add a new model to the Function Calling Leaderboard, here are a few things yo 5. Raise a [Pull Request](https://github.com/ShishirPatil/gorilla/pulls) with your new Model Handler. We will run the model handler if an endpoint is established. If self-hosting is required and the model size is large, we might not be able to accommodate model hosting therefore an OpenAI compatible endpoint for evaluation is desired. 6. Feel Free to join [Gorilla Discord](https://discord.gg/grXXvj9Whz) `#leaderboard` and reach out to us for any questions or concerns about adding new models. We are happy to help you! - All the leaderboard statistics, and data used to train the models are released under Apache 2.0. Gorilla is an open source effort from UC Berkeley and we welcome contributors. Please email us your comments, criticisms, and questions. 
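
For the handler-related contribution steps above, here is a minimal sketch of what a new entry could look like against the `BaseHandler` interface this patch series introduces in `bfcl/model_handler/base.py`. The class name, model name, and parsing details are placeholders rather than a real leaderboard submission:

```python
# Hypothetical handler sketch -- names and parsing are placeholders.
from bfcl.model_handler.base import BaseHandler, ModelStyle
from bfcl.model_handler.utils import ast_parse


class MyModelHandler(BaseHandler):
    model_style = ModelStyle.OSS_MODEL

    @classmethod
    def supported_models(cls):
        return ["my-org/my-function-calling-model"]

    def inference(self, prompt, functions, test_category):
        # Query your model or API here and return (raw_text, metadata).
        raise NotImplementedError

    def decode_ast(self, result, language="python"):
        # Convert raw output such as "[func(a=1)]" into the AST checker format.
        return ast_parse(result, language)

    def decode_execute(self, result):
        # Convert raw output into a list of executable call strings.
        execution_list = []
        for function_call in ast_parse(result):
            for name, args in function_call.items():
                execution_list.append(
                    f"{name}({','.join(f'{k}={repr(v)}' for k, v in args.items())})"
                )
        return execution_list
```
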
More information about the project can be found at [https://gorilla.cs.berkeley.edu/](https://gorilla.cs.berkeley.edu/) - From ebc21420be7b930da4e8573a4997e37fe43df854 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sun, 7 Jul 2024 18:55:43 -0400 Subject: [PATCH 06/35] Refactor `model_handler` - Move `model_handler` to `bfcl/model_handler` - Separate `oss` and `proprietary` model - Move java and javascript parsers to `bfcl/parser` - Standardize model handlers and remove duplicate methods --- .../bfcl/model_handler/__init__.py | 0 .../bfcl/model_handler/base.py | 75 ++++++++ .../model_handler/constants.py} | 41 ++-- .../bfcl/model_handler/oss_model/__init__.py | 22 +++ .../bfcl/model_handler/oss_model/base.py | 96 ++++++++++ .../bfcl/model_handler/oss_model/deepseek.py | 45 +++++ .../model_handler/oss_model/functionary.py | 25 +++ .../bfcl/model_handler/oss_model/gemma.py | 44 +++++ .../bfcl/model_handler/oss_model/glaive.py | 32 ++++ .../bfcl/model_handler/oss_model/hermes.py | 96 ++++++++++ .../bfcl/model_handler/oss_model/llama.py | 45 +++++ .../bfcl/model_handler/parser/__init__.py | 7 + .../model_handler/parser}/java_parser.py | 11 +- .../parser/javascript_parser.py} | 6 +- .../proprietary_model/__init__.py | 33 ++++ .../proprietary_model/anthropic/__init__.py | 7 + .../proprietary_model/anthropic/handler.py | 81 ++++++++ .../anthropic/prompt_handler.py} | 164 ++++++++-------- .../proprietary_model/cohere.py} | 106 +++++------ .../proprietary_model/databricks.py} | 61 +++--- .../proprietary_model/firework_ai.py} | 40 ++-- .../proprietary_model/gemini.py} | 125 +++++++------ .../proprietary_model/gorilla.py} | 86 +++++---- .../proprietary_model/mistral.py} | 79 ++++---- .../model_handler/proprietary_model/nexus.py} | 176 +++++++++--------- .../proprietary_model/nvidia.py} | 69 +++---- .../proprietary_model/openai.py} | 78 ++++---- .../proprietary_model/snowflake.py | 10 + .../{ => bfcl}/model_handler/utils.py | 98 +++++----- .../model_handler/arctic_handler.py | 41 ---- .../model_handler/claude_fc_handler.py | 88 --------- .../model_handler/deepseek_handler.py | 46 ----- .../model_handler/functionary_handler.py | 26 --- .../model_handler/gemma_handler.py | 55 ------ .../model_handler/glaive_handler.py | 45 ----- .../model_handler/handler.py | 50 ----- .../model_handler/handler_map.py | 79 -------- .../model_handler/hermes_handler.py | 92 --------- .../model_handler/llama_handler.py | 48 ----- .../model_handler/model_style.py | 14 -- .../model_handler/oss_handler.py | 152 --------------- 41 files changed, 1169 insertions(+), 1325 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/__init__.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/base.py rename berkeley-function-call-leaderboard/{model_handler/constant.py => bfcl/model_handler/constants.py} (63%) create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/deepseek.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/functionary.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/gemma.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/glaive.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hermes.py create mode 100644 
berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/parser/__init__.py rename berkeley-function-call-leaderboard/{model_handler => bfcl/model_handler/parser}/java_parser.py (95%) rename berkeley-function-call-leaderboard/{model_handler/js_parser.py => bfcl/model_handler/parser/javascript_parser.py} (92%) create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/__init__.py create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/handler.py rename berkeley-function-call-leaderboard/{model_handler/claude_prompt_handler.py => bfcl/model_handler/proprietary_model/anthropic/prompt_handler.py} (71%) rename berkeley-function-call-leaderboard/{model_handler/cohere_handler.py => bfcl/model_handler/proprietary_model/cohere.py} (63%) rename berkeley-function-call-leaderboard/{model_handler/databricks_handler.py => bfcl/model_handler/proprietary_model/databricks.py} (57%) rename berkeley-function-call-leaderboard/{model_handler/firework_ai_handler.py => bfcl/model_handler/proprietary_model/firework_ai.py} (67%) rename berkeley-function-call-leaderboard/{model_handler/gemini_handler.py => bfcl/model_handler/proprietary_model/gemini.py} (61%) rename berkeley-function-call-leaderboard/{model_handler/gorilla_handler.py => bfcl/model_handler/proprietary_model/gorilla.py} (73%) rename berkeley-function-call-leaderboard/{model_handler/mistral_handler.py => bfcl/model_handler/proprietary_model/mistral.py} (63%) rename berkeley-function-call-leaderboard/{model_handler/nexus_handler.py => bfcl/model_handler/proprietary_model/nexus.py} (67%) rename berkeley-function-call-leaderboard/{model_handler/nvidia_handler.py => bfcl/model_handler/proprietary_model/nvidia.py} (56%) rename berkeley-function-call-leaderboard/{model_handler/gpt_handler.py => bfcl/model_handler/proprietary_model/openai.py} (65%) create mode 100644 berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/snowflake.py rename berkeley-function-call-leaderboard/{ => bfcl}/model_handler/utils.py (91%) delete mode 100644 berkeley-function-call-leaderboard/model_handler/arctic_handler.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/deepseek_handler.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/functionary_handler.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/gemma_handler.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/glaive_handler.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/handler.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/handler_map.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/hermes_handler.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/llama_handler.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/model_style.py delete mode 100644 berkeley-function-call-leaderboard/model_handler/oss_handler.py diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/berkeley-function-call-leaderboard/bfcl/model_handler/base.py b/berkeley-function-call-leaderboard/bfcl/model_handler/base.py new file mode 100644 index 000000000..0af6a099b --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/base.py @@ -0,0 +1,75 @@ +import json +from typing import List, Dict +from pathlib import Path +from enum import Enum +from abc import ABC, abstractmethod + + +class ModelStyle(str, Enum): + GORILLA = "gorilla" + OPENAI = "openai" + ANTHROPIC_FC = "claude" + ANTHROPIC_PROMPT = "claude" + MISTRAL = "mistral" + GOOGLE = "google" + COHERE = "cohere" + FIREWORK_AI = "firework_ai" + NEXUS = "nexus" + OSS_MODEL = "oss_model" + + +class BaseHandler(ABC): + model_style: str + + def __init__( + self, + model_name: str, + temperature: float = 0.7, + top_p: int = 1, + max_tokens: int = 1000, + ) -> None: + self.model_name = model_name + self.temperature = temperature + self.top_p = top_p + self.max_tokens = max_tokens + + self.result_dir = Path.cwd() / 'result' + self.result_dir.mkdir(exist_ok=True) + + @classmethod + @abstractmethod + def supported_models(cls) -> List[str]: + pass + + @abstractmethod + def inference(self): + """Fetch response from the model.""" + pass + + @abstractmethod + def decode_ast(self, result, language): + """Takes raw model output and converts it to the standard AST checker input.""" + pass + + @abstractmethod + def decode_execute(self, result): + """Takes raw model output and converts it to the standard execute checker input.""" + pass + + def write(self, responses: List[Dict], file_name): + """Write the model responses to the file.""" + + model_dir = self.result_dir / self.model_name.replace('/', '--') + model_dir.mkdir(exist_ok=True, parents=True) + file_path = model_dir / file_name + with open(file_path, 'w') as file: + for response in responses: + file.write(json.dumps(response) + '\n') + print(f'Saved model responses at "{file_path}".') + + def load_result(self, file_path): + """Load the result from the file.""" + + with open(file_path, 'r') as f: + result = [json.loads(line) for line in f] + return result diff --git a/berkeley-function-call-leaderboard/model_handler/constant.py b/berkeley-function-call-leaderboard/bfcl/model_handler/constants.py similarity index 63% rename from berkeley-function-call-leaderboard/model_handler/constant.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/constants.py index ee34d8cff..d57d3e67f 100644 --- a/berkeley-function-call-leaderboard/model_handler/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/constants.py @@ -1,17 +1,18 @@ -USE_COHERE_OPTIMIZATION = False +import os -SYSTEM_PROMPT_FOR_CHAT_MODEL = """ - You are an expert in composing functions. You are given a question and a set of possible functions. - Based on the question, you will need to make one or more function/tool calls to achieve the purpose. - If none of the function can be used, point it out. If the given question lacks the parameters required by the function, - also point it out. You should only return the function call in tools call sections. - """ +USE_COHERE_OPTIMIZATION = os.getenv('USE_COHERE_OPTIMIZATION', False) + +SYSTEM_PROMPT_FOR_CHAT_MODEL = """\ +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the function can be used, point it out. 
If the given question lacks the parameters required by the function, +also point it out. You should only return the function call in tools call sections.""" + +USER_PROMPT_FOR_CHAT_MODEL = """\ +Questions:{user_input}\nHere is a list of functions in JSON format that you can invoke:\n{functions}. +Should you decide to return the function call(s),Put it in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)]\n +NO other text MUST be included.""" -USER_PROMPT_FOR_CHAT_MODEL = """ - Questions:{user_prompt}\nHere is a list of functions in JSON format that you can invoke:\n{functions}. - Should you decide to return the function call(s),Put it in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)]\n - NO other text MUST be included. -""" GORILLA_TO_OPENAPI = { "integer": "integer", "number": "number", @@ -144,19 +145,3 @@ "command-r-plus-FC", "command-r-plus-FC-optimized", ] - -TEST_CATEGORIES = { - "executable_simple": "gorilla_openfunctions_v1_test_executable_simple.json", - "executable_parallel_function": "gorilla_openfunctions_v1_test_executable_parallel_function.json", - "executable_multiple_function": "gorilla_openfunctions_v1_test_executable_multiple_function.json", - "executable_parallel_multiple_function": "gorilla_openfunctions_v1_test_executable_parallel_multiple_function.json", - "simple": "gorilla_openfunctions_v1_test_simple.json", - "relevance": "gorilla_openfunctions_v1_test_relevance.json", - "parallel_function": "gorilla_openfunctions_v1_test_parallel_function.json", - "multiple_function": "gorilla_openfunctions_v1_test_multiple_function.json", - "parallel_multiple_function": "gorilla_openfunctions_v1_test_parallel_multiple_function.json", - "java": "gorilla_openfunctions_v1_test_java.json", - "javascript": "gorilla_openfunctions_v1_test_javascript.json", - "rest": "gorilla_openfunctions_v1_test_rest.json", - "sql": "gorilla_openfunctions_v1_test_sql.json", -} diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py new file mode 100644 index 000000000..322eef851 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py @@ -0,0 +1,22 @@ +from .deepseek import DeepseekHandler +from .functionary import FunctionaryHandler +from .gemma import GemmaHandler +from .glaive import GlaiveHandler +from .hermes import HermesHandler +from .llama import LlamaHandler + +__all__ = [ + 'DeepseekHandler', + 'FunctionaryHandler', + 'GemmaHandler', + 'GlaiveHandler', + 'HermesHandler', + 'LlamaHandler', +] + +MODEL_TO_HANDLER_CLS = {} +for handler_name in __all__: + module = globals()[handler_name] + handler_class = getattr(module, handler_name) + for model in handler_class.supported_models(): + MODEL_TO_HANDLER_CLS[model] = handler_class \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py new file mode 100644 index 000000000..63703163f --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py @@ -0,0 +1,96 @@ +import json + +import ray +import torch +from vllm import LLM, SamplingParams + +from bfcl.model_handler import utils +from bfcl.model_handler.base import BaseHandler, ModelStyle + + +class OssModelHandler(BaseHandler): + model_style = ModelStyle.OSS_MODEL + system_message = 'You are a helpful 
assistant with access to the following functions. Use them if required -' + prompt_template = 'SYSTEM: {system_message}\n{functions}\nUSER: {user_input}\nASSISTANT: ' + + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: + super().__init__(model_name, temperature, top_p, max_tokens) + self.sampling_params = SamplingParams( + temperature=self.temperature, + max_tokens=self.max_tokens, + top_p=self.top_p + ) + self._init_model() + + @classmethod + def supported_models(cls): + raise NotImplementedError + + def _init_model(self) -> None: + ray.init(ignore_reinit_error=True, num_cpus=8) + + def get_prompt(self, user_input, functions, test_category) -> str: + if isinstance(functions, list): + functions = json.dumps(functions) + return self.prompt_template.format( + system_message=self.system_message, + functions=functions, + user_input=user_input, + ) + + def inference(self, inputs, test_category, num_gpus): + chunk_size = len(inputs) // num_gpus + futures = [] + for i in range(0, len(inputs), chunk_size): + futures.append( + self._batch_generate.remote( + inputs[i: i + chunk_size], + test_category, + self.model_name, + self.sampling_params, + get_prompt_func=self.get_prompt, + ) + ) + responses = [] + for future in futures: + responses.extend(ray.get(future)) + return responses + + def decode_ast(self, result, language="python"): + func = result + if " " == func[0]: + func = func[1:] + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decode_output = utils.ast_parse(func, language) + return decode_output + + def decode_execute(self, result): + return result + + @ray.remote(num_gpus=1) + @torch.inference_mode() + def _batch_generate( + inputs, + model_path, + sampling_params: SamplingParams, + get_prompt_func + ): + prompts = [] + for line in inputs: + test_category = _input["test_category"] + _input = line + prompt = utils.augment_prompt_by_languge(_input["question"], test_category) + functions = utils.language_specific_pre_processing(_input["function"], test_category, False) + prompts.append(get_prompt_func(prompt, functions, test_category)) + + print(f'Getting responses for {len(prompts)} samples...') + llm = LLM(model=model_path, dtype="float16", trust_remote_code=True) + outputs = llm.generate(prompts, sampling_params) + responses = [ + dict(id=_input['id'], response=output.outputs[0].text) + for output, _input in zip(outputs, inputs) + ] + return responses diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/deepseek.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/deepseek.py new file mode 100644 index 000000000..32f93b081 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/deepseek.py @@ -0,0 +1,45 @@ +import re + +from bfcl.model_handler.utils import ast_parse +from bfcl.model_handler.oss_model.base import OssModelHandler + + +class DeepseekHandler(OssModelHandler): + system_message = ( + 'You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n' + '### Instruction:\n' + 'You are a helpful assistant with access to the following functions. 
Use them if required -' + ) + prompt_template = ( + '{system_message}\n' + '{functions}\n' + 'Here is the question you need to answer:\n' + '{user_input}\n' + 'Your job is to solve the above question using ONLY and strictly ONE line of python code given the above functions. If you think no function should be invoked return "[]".\n' + 'If you think one or more function should be invoked, return the function call in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)] wrapped in python code' + '### Response:\n' + ) + + @classmethod + def supported_models(cls): + return [ + 'deepseek-ai/deepseek-coder-6.7b-instruct', + ] + + def decode_ast(self, result, language="python"): + function_call = result.split("```")[1] + matches = re.findall(r"\[[^\]]*\]", function_call) + decoded_output = ast_parse(matches[0], language) + return decoded_output + + def decode_execute(self, result): + function_call = result.split("```")[1] + matches = re.findall(r"\[[^\]]*\]", function_call) + decoded_output = ast_parse(matches[0]) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/functionary.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/functionary.py new file mode 100644 index 000000000..7f34a8566 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/functionary.py @@ -0,0 +1,25 @@ +from openai import OpenAI + +from bfcl.model_handler.proprietary_model.openai import OpenAIHandler + + +# For setup instructions, please refer to https://github.com/MeetKai/functionary +class FunctionaryHandler(OpenAIHandler): + def __init__( + self, + model_name: str, + temperature: float = 0.7, + top_p: int = 1, + max_tokens: int = 1000, + ) -> None: + super().__init__(model_name, temperature, top_p, max_tokens) + self.client = OpenAI(base_url="http://localhost:8000/v1", api_key="functionary") + + @classmethod + def supported_models(cls): + return [ + 'meetkai/functionary-small-v2.2-FC', + 'meetkai/functionary-medium-v2.2-FC', + 'meetkai/functionary-small-v2.4-FC', + 'meetkai/functionary-medium-v2.4-FC', + ] diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/gemma.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/gemma.py new file mode 100644 index 000000000..95a698c25 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/gemma.py @@ -0,0 +1,44 @@ +import re + +from bfcl.model_handler.utils import ast_parse +from bfcl.model_handler.oss_model.base import OssModelHandler + + +class GemmaHandler(OssModelHandler): + prompt_template = ( + 'user\n' + '{system_message}\n' + '{functions}\n' + 'Here is the question you need to answer:\n' + '{user_input}\n' + 'Your job is to solve the above question using ONLY and strictly ONE line of python code given the above functions. 
If you think no function should be invoked return "[]".\n' + 'If you think one or more function should be invoked, return the function call in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)] wrapped in python code' + '\n' + 'model\n' + ) + + @classmethod + def supported_models(cls): + return [ + 'google/gemma-7b-it', + ] + + def decode_ast(self, result, language="python"): + match = re.search(r"\[(.*)\]", result, re.DOTALL) + raw_input = match.group(1) + func = "[" + raw_input + "]" + decoded_output = ast_parse(func, language=language) + return decoded_output + + def decode_execute(self, result): + match = re.search(r"\[(.*)\]", result, re.DOTALL) + raw_input = match.group(1) + func = "[" + raw_input + "]" + decoded_output = ast_parse(func) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/glaive.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/glaive.py new file mode 100644 index 000000000..08c28915b --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/glaive.py @@ -0,0 +1,32 @@ +import json + +from bfcl.model_handler.utils import convert_to_function_call +from bfcl.model_handler.oss_model.base import OssModelHandler + + +class GlaiveHandler(OssModelHandler): + prompt_template = 'SYSTEM: {system_message}\n{functions}\nUSER: {user_input}\n' + + @classmethod + def supported_models(cls): + return [ + 'glaiveai/glaive-function-calling-v1', + ] + + def decode_ast(self, result, language="python"): + function_call = result.split("")[-1] + function_call = function_call.replace("'", "") + decoded_function = json.loads(function_call) + for key, value in decoded_function["arguments"].items(): + if language.lower() != "python": + # all values of the json are casted to string for java and javascript + decoded_function["arguments"][key] = str(decoded_function["arguments"][key]) + decoded_result = [{decoded_function["name"]: decoded_function["arguments"]}] + return decoded_result + + def decode_execute(self, result): + function_call = result.split("")[-1] + function_call = function_call.replace("'", "") + decoded_function = json.loads(function_call) + decoded_result = [{decoded_function["name"]: decoded_function["arguments"]}] + return convert_to_function_call(decoded_result) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hermes.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hermes.py new file mode 100644 index 000000000..cbb316646 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/hermes.py @@ -0,0 +1,96 @@ +import json + +from bfcl.model_handler.constants import GORILLA_TO_OPENAPI +from bfcl.model_handler.utils import convert_to_tool +from bfcl.model_handler.oss_model.base import OssModelHandler + + +class HermesHandler(OssModelHandler): + prompt_template = ( + '<|im_start|>system\n' + 'You are a function calling AI model. You are provided with function signatures within XML tags. ' + "You may call one or more functions to assist with the user query. Don't make assumptions about what values to " + 'plug into functions. Here are the available tools: {functions} Use the following pydantic model ' + 'json schema for each tool call you will make: {pydantic_func_schema}. 
' + 'For each function call return a json object with function name and arguments within XML tags as follows:\n' + '{{"arguments": , "name": }}<|im_end|>' + '<|im_start|>user\n{user_input}<|im_end|>' + ) + + @classmethod + def supported_models(cls): + return [ + 'NousResearch/Hermes-2-Pro-Mistral-7B', + ] + + def get_prompt(self, user_input, functions, test_category) -> str: + # Hermes use Langchain to OpenAI conversion. It does not use tool call but function call. + function = convert_to_tool(function, GORILLA_TO_OPENAPI, self.model_style, test_category, True) + pydantic_func_schema = { + "properties": { + "arguments": { + "title": "Arguments", + "type": "object" + }, + "name": { + "title": "Name", + "type": "string" + } + }, + "required": ["arguments", "name"], + "title": "FunctionCall", + "type": "object" + } + return self.prompt_template.format( + pydantic_func_schema=pydantic_func_schema, + functions=functions, + user_input=user_input, + ) + + def decode_ast(self, result, language="python"): + lines = result.split("\n") + flag = False + func_call = [] + for line in lines: + if "" == line: + flag = True + elif "" == line: + flag = False + else: + if flag: + line = line.replace("'", '"') + tool_result = json.loads(line) + if language.lower() != "python": + # all values of the json are casted to string for java and javascript + for key in tool_result["arguments"]: + tool_result["arguments"][key] = str( + tool_result["arguments"][key] + ) + func_call.append({tool_result["name"]: tool_result["arguments"]}) + flag = False + return func_call + + def decode_execute(self, result): + lines = result.split("\n") + flag = False + function_call_list = [] + for line in lines: + if "" == line: + flag = True + elif "" == line: + flag = False + else: + if flag: + line = line.replace("'", '"') + tool_result = json.loads(line) + function_call_list.append( + {tool_result["name"]: tool_result["arguments"]} + ) + flag = False + execution_list = [] + for function_call in function_call_list: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k,v in value.items()])})" + ) + return execution_list diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py new file mode 100644 index 000000000..26a1d3cb3 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py @@ -0,0 +1,45 @@ +from bfcl.model_handler import constants +from bfcl.model_handler.utils import ast_parse +from bfcl.model_handler.oss_model.base import OssModelHandler + + +class LlamaHandler(OssModelHandler): + system_message = constants.SYSTEM_PROMPT_FOR_CHAT_MODEL + prompt_template = ( + '<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_message}<|eot_id|><|start_header_id|>' + f'user<|end_header_id|>{constants.USER_PROMPT_FOR_CHAT_MODEL}' + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>' + ) + + @classmethod + def supported_models(cls): + return [ + 'meta-llama/Meta-Llama-3-8B-Instruct', + 'meta-llama/Meta-Llama-3-70B-Instruct', + ] + + def decode_ast(self, result, language="python"): + func = result + func = func.replace("\n", "") # remove new line characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decoded_output = ast_parse(func, language) + return decoded_output + + def decode_execute(self, result): + func = result + func = func.replace("\n", "") # remove new line 
characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decode_output = ast_parse(func) + execution_list = [] + for function_call in decode_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/parser/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/__init__.py new file mode 100644 index 000000000..6f4d4295c --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/__init__.py @@ -0,0 +1,7 @@ +from .java_parser import parse_java_function_call +from .javascript_parser import parse_javascript_function_call + +__all__ = [ + 'parse_java_function_call', + 'parse_javascript_function_call', +] diff --git a/berkeley-function-call-leaderboard/model_handler/java_parser.py b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/java_parser.py similarity index 95% rename from berkeley-function-call-leaderboard/model_handler/java_parser.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/parser/java_parser.py index 71118c0a9..40a9ebd09 100644 --- a/berkeley-function-call-leaderboard/model_handler/java_parser.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/java_parser.py @@ -1,13 +1,8 @@ -import json from tree_sitter import Language, Parser +import tree_sitter_java -Language.build_library( - # Store the library in the `build` directory - "build/tree_sitter.so", - # Include one or more languages - ["./tree-sitter-java"], -) -JAVA_LANGUAGE = Language("build/tree_sitter.so", "java") + +JAVA_LANGUAGE = Language(tree_sitter_java.language(), "java") parser = Parser() parser.set_language(JAVA_LANGUAGE) diff --git a/berkeley-function-call-leaderboard/model_handler/js_parser.py b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/javascript_parser.py similarity index 92% rename from berkeley-function-call-leaderboard/model_handler/js_parser.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/parser/javascript_parser.py index a3b60130a..2e1f83142 100644 --- a/berkeley-function-call-leaderboard/model_handler/js_parser.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/parser/javascript_parser.py @@ -1,10 +1,8 @@ -import json from tree_sitter import Language, Parser +import tree_sitter_javascript -# Load your language grammar and create a parser -Language.build_library("build/tree_sitter_js.so", ["./tree-sitter-javascript"]) -JS_LANGUAGE = Language("build/tree_sitter_js.so", "javascript") +JS_LANGUAGE = Language(tree_sitter_javascript.language(), "javascript") parser = Parser() parser.set_language(JS_LANGUAGE) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py new file mode 100644 index 000000000..70f7cf387 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py @@ -0,0 +1,33 @@ +from .anthropic import AnthropicFCHandler, AnthropicPromptHandler +from .cohere import CohereHandler +from .databricks import DatabricksHandler +from .firework_ai import FireworkAIHandler +from .gemini import GeminiHandler +from .gorilla import GorillaHandler +from .mistral import MistralHandler +from .nexus import NexusHandler +from .nvidia import NvidiaHandler +from .openai import OpenAIHandler 
+from .snowflake import SnowflakeHandler + +__all__ = [ + 'AnthropicFCHandler', + 'AnthropicPromptHandler', + 'CohereHandler', + 'DatabricksHandler', + 'FireworkAIHandler', + 'GeminiHandler', + 'GorillaHandler', + 'MistralHandler', + 'NexusHandler', + 'NvidiaHandler', + 'OpenAIHandler', + 'SnowflakeHandler', +] + +MODEL_TO_HANDLER_CLS = {} +for handler_name in __all__: + module = globals()[handler_name] + handler_class = getattr(module, handler_name) + for model in handler_class.supported_models(): + MODEL_TO_HANDLER_CLS[model] = handler_class \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/__init__.py new file mode 100644 index 000000000..a0c393bde --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/__init__.py @@ -0,0 +1,7 @@ +from .handler import AnthropicFCHandler +from .prompt_handler import AnthropicPromptHandler + +__all__ = [ + 'AnthropicFCHandler', + 'AnthropicPromptHandler', +] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/handler.py new file mode 100644 index 000000000..09762720d --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/handler.py @@ -0,0 +1,81 @@ +import json +import time + +from anthropic.types import TextBlock, ToolUseBlock + +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import ModelStyle +from bfcl.model_handler.proprietary_model.anthropic.prompt_handler import AnthropicPromptHandler + + +class AnthropicFCHandler(AnthropicPromptHandler): + model_style = ModelStyle.ANTHROPIC_FC + + @classmethod + def supported_models(cls): + return [ + 'claude-3-opus-20240229-FC', + 'claude-3-sonnet-20240229-FC', + 'claude-3-5-sonnet-20240620-FC', + 'claude-3-haiku-20240307-FC', + ] + + def inference(self, prompt, functions, test_category): + if "FC" not in self.model_name: + return super().inference(prompt, functions, test_category) + + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, True) + if type(functions) is not list: + functions = [functions] + claude_tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_OPENAPI, self.model_style, test_category, True + ) + message = [{"role": "user", "content": prompt}] + start_time = time.time() + + response = self.client.messages.create( + model=self.model_name.strip("-FC"), + max_tokens=self.max_tokens, + tools=claude_tool, + messages=message, + ) + latency = time.time() - start_time + text_outputs = [] + tool_call_outputs = [] + for content in response.content: + if isinstance(content, TextBlock): + text_outputs.append(content.text) + elif isinstance(content, ToolUseBlock): + tool_call_outputs.append({content.name: json.dumps(content.input)}) + result = tool_call_outputs if tool_call_outputs else text_outputs[0] + return result, {"input_tokens": response.usage.input_tokens, "output_tokens": response.usage.output_tokens, "latency": latency} + + def decode_ast(self, result, language="python"): + if "FC" not in self.model_name: + decoded_output = utils.ast_parse(result,language) + else: + decoded_output = [] + for invoked_function in 
result: + name = list(invoked_function.keys())[0] + params = json.loads(invoked_function[name]) + if language.lower() != "python": + for key in params: + params[key] = str(params[key]) + decoded_output.append({name: params}) + return decoded_output + + def decode_execute(self, result): + if "FC" not in self.model_name: + decoded_output = utils.ast_parse(result) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list + else: + function_call = utils.convert_to_function_call(result) + return function_call diff --git a/berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/prompt_handler.py similarity index 71% rename from berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/prompt_handler.py index 04ab78ef2..5d4934e6c 100644 --- a/berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/anthropic/prompt_handler.py @@ -1,109 +1,53 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - convert_to_tool, - ast_parse, - augment_prompt_by_languge, - language_specific_pre_processing, - construct_tool_use_system_prompt, - _function_calls_valid_format_and_invoke_extraction, - _convert_value, -) -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, - GORILLA_TO_PYTHON, -) -import os, time +import os +import time + from anthropic import Anthropic +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle + -class ClaudePromptingHandler(BaseHandler): +class AnthropicPromptHandler(BaseHandler): + model_style = ModelStyle.ANTHROPIC_PROMPT + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Anthropic_Prompt - self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) - def _get_claude_function_calling_response(self, prompt, functions, test_category): - input_tool = convert_to_tool( - functions, GORILLA_TO_PYTHON, self.model_style, test_category, True - ) - system_prompt = construct_tool_use_system_prompt(input_tool) - start = time.time() - response = self.client.messages.create( - model=self.model_name.strip("-FC"), - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, - system=system_prompt, - messages=[{"role": "user", "content": prompt}], - ) - latency = time.time() - start - result = [] - if ( - "invokes" - not in _function_calls_valid_format_and_invoke_extraction( - response.content[0].text - ).keys() - ): - return "Error", {"input_tokens": 0, "output_tokens": 0, "latency": latency} - for invoked_function in _function_calls_valid_format_and_invoke_extraction( - response.content[0].text - )["invokes"]: - name = invoked_function["tool_name"] - select_func = None - for func in input_tool: - if func["name"] == name: - select_func = func - break - if select_func is None: - result.append({}) - continue - param_dict = {} - for param in 
invoked_function["parameters_with_values"]: - param_name = param[0] - param_value = param[1] - try: - param_type = select_func["parameters"]["properties"][param_name][ - "type" - ] - except: - param_type = "str" - param_value = _convert_value(param_value, param_type) - param_dict[param_name] = param_value - result.append({name: param_dict}) - metadata = {} - metadata["input_tokens"] = response.usage.input_tokens - metadata["output_tokens"] = response.usage.output_tokens - metadata["latency"] = latency - return result, metadata + @classmethod + def supported_models(cls): + return [ + 'claude-instant-1.2', + 'claude-2.1', + 'claude-3-opus-20240229', + 'claude-3-sonnet-20240229', + 'claude-3-5-sonnet-20240620', + 'claude-3-haiku-20240307', + ] def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) + prompt = utils.augment_prompt_by_languge(prompt, test_category) if "FC" in self.model_name: - functions = language_specific_pre_processing(functions, test_category, True) + functions = utils.language_specific_pre_processing(functions, test_category, True) result, metadata = self._get_claude_function_calling_response( prompt, functions, test_category ) return result, metadata else: start = time.time() - functions = language_specific_pre_processing( - functions, test_category, False - ) + functions = utils.language_specific_pre_processing(functions, test_category, False) response = self.client.messages.create( model=self.model_name, max_tokens=self.max_tokens, temperature=self.temperature, top_p=self.top_p, - system=SYSTEM_PROMPT_FOR_CHAT_MODEL, + system=constants.SYSTEM_PROMPT_FOR_CHAT_MODEL, messages=[ { "role": "user", - "content": USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + "content": constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), } ], ) @@ -115,9 +59,9 @@ def inference(self, prompt, functions, test_category): result = response.content[0].text return result, metadata - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): if "FC" in self.model_name: - if language == "Python": + if language.lower() == "python": return result else: # result is a list of dictionaries, make sure each value of dictionary is string @@ -134,7 +78,7 @@ def decode_ast(self, result, language="Python"): func = "[" + func if not func.endswith("]"): func = func + "]" - decode_output = ast_parse(func, language) + decode_output = utils.ast_parse(func, language) return decode_output def decode_execute(self, result): @@ -158,7 +102,7 @@ def decode_execute(self, result): func = "[" + func if not func.endswith("]"): func = func + "]" - decode_output = ast_parse(func) + decode_output = utils.ast_parse(func) execution_list = [] for function_call in decode_output: for key, value in function_call.items(): @@ -166,3 +110,51 @@ def decode_execute(self, result): f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" ) return execution_list + + def _get_claude_function_calling_response(self, prompt, functions, test_category): + input_tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_PYTHON, self.model_style, test_category, True + ) + system_prompt = utils.construct_tool_use_system_prompt(input_tool) + start = time.time() + response = self.client.messages.create( + model=self.model_name.strip("-FC"), + max_tokens=self.max_tokens, + temperature=self.temperature, + top_p=self.top_p, + system=system_prompt, + 
messages=[{"role": "user", "content": prompt}], + ) + latency = time.time() - start + result = [] + out = utils.function_calls_valid_format_and_invoke_extraction(response.content[0].text) + if "invokes" not in out.keys(): + return "Error", {"input_tokens": 0, "output_tokens": 0, "latency": latency} + for invoked_function in out["invokes"]: + name = invoked_function["tool_name"] + select_func = None + for func in input_tool: + if func["name"] == name: + select_func = func + break + if select_func is None: + result.append({}) + continue + param_dict = {} + for param in invoked_function["parameters_with_values"]: + param_name = param[0] + param_value = param[1] + try: + param_type = select_func["parameters"]["properties"][param_name][ + "type" + ] + except: + param_type = "str" + param_value = utils.convert_value(param_value, param_type) + param_dict[param_name] = param_value + result.append({name: param_dict}) + metadata = {} + metadata["input_tokens"] = response.usage.input_tokens + metadata["output_tokens"] = response.usage.output_tokens + metadata["latency"] = latency + return result, metadata diff --git a/berkeley-function-call-leaderboard/model_handler/cohere_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/cohere.py similarity index 63% rename from berkeley-function-call-leaderboard/model_handler/cohere_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/cohere.py index def3be47e..74038ce02 100644 --- a/berkeley-function-call-leaderboard/model_handler/cohere_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/cohere.py @@ -1,86 +1,88 @@ import os - -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - augment_prompt_by_languge, - language_specific_pre_processing, - convert_to_tool, - ast_parse, - convert_to_function_call, -) -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, - GORILLA_TO_PYTHON, -) import time + import cohere -from model_handler.constant import USE_COHERE_OPTIMIZATION +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle -class CohereHandler(BaseHandler): - client: cohere.Client +OPTIMIZED_PREAMBLE = """## Task & Context +You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you can use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.COHERE +When a question is irrelevant or unrelated to the available tools you should choose to directly answer. This is especially important when the question or available tools are about specialist subject like math or biology or physics: DO NOT ANSWER UNRELATED QUESTIONS. - self.client = cohere.Client(api_key=os.getenv("COHERE_API_KEY")) +## Style Guide +Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. +""" - # System prompt for function calling. 
- if USE_COHERE_OPTIMIZATION: - self.preamble = """## Task & Context - You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you can use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. +PREAMBLE = """## Task & Context +You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. - When a question is irrelevant or unrelated to the available tools you should choose to directly answer. This is especially important when the question or available tools are about specialist subject like math or biology or physics: DO NOT ANSWER UNRELATED QUESTIONS. +## Style Guide +Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. +""" - ## Style Guide - Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. - """ - else: - self.preamble = """ - ## Task & Context - You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. - ## Style Guide - Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. 
- """ +class CohereHandler(BaseHandler): + model_style = ModelStyle.COHERE + + def __init__( + self, + model_name, + temperature=0.7, + top_p=1, + max_tokens=1000, + use_cohere_optimization: bool = constants.USE_COHERE_OPTIMIZATION + ) -> None: + + super().__init__(model_name, temperature, top_p, max_tokens) + self.use_cohere_optimization = use_cohere_optimization + self.client = cohere.Client(api_key=os.getenv("COHERE_API_KEY")) + self.preamble = OPTIMIZED_PREAMBLE if use_cohere_optimization else PREAMBLE + + @classmethod + def supported_models(cls): + return [ + 'command-r-plus', + 'command-r-plus-FC', + 'command-r-plus-optimized', + 'command-r-plus-FC-optimized', + ] def inference(self, prompt, functions, test_category): if "FC" not in self.model_name: - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing( + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing( functions, test_category, False ) - message = USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ) + message = constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)) start_time = time.time() response = self.client.chat( message=message, model=self.model_name, temperature=self.temperature, max_tokens=self.max_tokens, - preamble=SYSTEM_PROMPT_FOR_CHAT_MODEL, + preamble=constants.SYSTEM_PROMPT_FOR_CHAT_MODEL, ) latency = time.time() - start_time result = response.text else: - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, True) + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, True) if type(functions) is not list: functions = [functions] message = prompt # Convert JSON schema into R+ compatible function calls. 
- cohere_tool = convert_to_tool( - functions, GORILLA_TO_PYTHON, self.model_style, test_category, True + cohere_tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_PYTHON, self.model_style, test_category, True ) start_time = time.time() if len(cohere_tool) > 0: try: - if USE_COHERE_OPTIMIZATION: + if self.use_cohere_optimization: response = self.client.chat( message=message, model=self.model_name.replace("-FC", ""), @@ -129,13 +131,13 @@ def inference(self, prompt, functions, test_category): metadata["latency"] = latency return result, metadata - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): if "FC" not in self.model_name: if not result.startswith("["): result = "[" + result if not result.endswith("]"): result = result + "]" - decoded_output = ast_parse(result, language) + decoded_output = utils.ast_parse(result, language) else: decoded_output = [] for invoked_function in result: @@ -144,7 +146,7 @@ def decode_ast(self, result, language="Python"): if language == "Python": pass else: - if USE_COHERE_OPTIMIZATION: + if self.use_cohere_optimization: # all values of the json are cast to string for java and javascript for key, value in params.items(): value = str(value) @@ -165,7 +167,7 @@ def decode_execute(self, result): result = "[" + result if not result.endswith("]"): result = result + "]" - decoded_output = ast_parse(result) + decoded_output = utils.ast_parse(result) execution_list = [] for function_call in decoded_output: for key, value in function_call.items(): diff --git a/berkeley-function-call-leaderboard/model_handler/databricks_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/databricks.py similarity index 57% rename from berkeley-function-call-leaderboard/model_handler/databricks_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/databricks.py index fa1201c6a..5b53c5765 100644 --- a/berkeley-function-call-leaderboard/model_handler/databricks_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/databricks.py @@ -1,42 +1,41 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import language_specific_pre_processing, ast_parse -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, - GORILLA_TO_OPENAPI, -) +import os +import re import time + from openai import OpenAI -import re + +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle class DatabricksHandler(BaseHandler): + model_style = ModelStyle.OPENAI + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - self.model_name = model_name - self.model_style = ModelStyle.OpenAI - self.temperature = temperature - self.top_p = top_p - self.max_tokens = max_tokens + super().__init__(model_name, temperature, top_p, max_tokens) + assert (api_key := os.getenv('DATABRICKS_API_KEY')), \ + 'Please provide your `DATABRICKS_API_KEY` in the .env file.' + assert (base_url := os.getenv('DATABRICKS_AZURE_ENDPOINT_URL')), \ + 'Please provide your `DATABRICKS_AZURE_ENDPOINT_URL` in the .env file.' + self.client = OpenAI(api_key=api_key, base_url=base_url) - # NOTE: To run the Databricks model, you need to provide your own Databricks API key and your own Azure endpoint URL. 
- self.client = OpenAI( - api_key="{YOUR_DATABRICKS_API_KEY}", - base_url="{YOUR_DATABRICKS_AZURE_ENDPOINT_URL}", - ) + @classmethod + def supported_models(cls): + return [ + 'databricks-dbrx-instruct', + ] def inference(self, prompt, functions, test_category): - functions = language_specific_pre_processing(functions, test_category, False) + functions = utils.language_specific_pre_processing(functions, test_category, False) if type(functions) is not list: functions = [functions] message = [ - {"role": "system", "content": SYSTEM_PROMPT_FOR_CHAT_MODEL}, + {"role": "system", "content": constants.SYSTEM_PROMPT_FOR_CHAT_MODEL}, { "role": "user", - "content": "Questions:" - + USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + "content": "Questions:" + constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), }, ] start_time = time.time() @@ -55,7 +54,7 @@ def inference(self, prompt, functions, test_category): metadata["latency"] = latency return result, metadata - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): func = re.sub(r"'([^']*)'", r"\1", result) func = func.replace("\n ", "") if not func.startswith("["): @@ -65,12 +64,12 @@ def decode_ast(self, result, language="Python"): if func.startswith("['"): func = func.replace("['", "[") try: - decode_output = ast_parse(func, language) + decode_output = utils.ast_parse(func, language) except: - decode_output = ast_parse(result, language) + decode_output = utils.ast_parse(result, language) return decode_output - def decode_execute(self, result, language="Python"): + def decode_execute(self, result, language="python"): func = re.sub(r"'([^']*)'", r"\1", result) func = func.replace("\n ", "") if not func.startswith("["): @@ -80,9 +79,9 @@ def decode_execute(self, result, language="Python"): if func.startswith("['"): func = func.replace("['", "[") try: - decode_output = ast_parse(func, language) + decode_output = utils.ast_parse(func, language) except: - decode_output = ast_parse(result, language) + decode_output = utils.ast_parse(result, language) execution_list = [] for function_call in decode_output: for key, value in function_call.items(): diff --git a/berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/firework_ai.py similarity index 67% rename from berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/firework_ai.py index 74895ef73..e4643c165 100644 --- a/berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/firework_ai.py @@ -1,38 +1,32 @@ -import json import os import time -from model_handler.constant import GORILLA_TO_OPENAPI -from model_handler.gpt_handler import OpenAIHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import convert_to_tool, language_specific_pre_processing from openai import OpenAI +from bfcl.model_handler.constants import GORILLA_TO_OPENAPI +from bfcl.model_handler.base import ModelStyle +from bfcl.model_handler.proprietary_model.openai import OpenAIHandler +from bfcl.model_handler.utils import convert_to_tool, language_specific_pre_processing + + class FireworkAIHandler(OpenAIHandler): - def __init__(self, model_name, temperature=0.0, top_p=1, max_tokens=1000) -> None: - 
super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.FIREWORK_AI - self.temperature = 0.0 + model_style = ModelStyle.FIREWORK_AI + def __init__(self, model_name, temperature=0.0, top_p=1, max_tokens=1000) -> None: + super().__init__(model_name=model_name, temperature=0.0, top_p=top_p, max_tokens=max_tokens) self.client = OpenAI( base_url="https://api.fireworks.ai/inference/v1", - api_key=os.getenv("FIRE_WORKS_API_KEY"), + api_key=os.getenv("FIREWORKS_API_KEY"), ) - def write(self, result, file_to_open): - # This method is used to write the result to the file. - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists(f"./result/{self.model_name}"): - os.mkdir(f"./result/{self.model_name}") - with open( - f"./result/{self.model_name}/" - + file_to_open.replace(".json", "_result.json"), - "a+", - ) as f: - f.write(json.dumps(result) + "\n") - + @classmethod + def supported_models(cls): + return [ + 'firefunction-v1-FC', + 'firefunction-v2-FC', + ] + def inference(self, prompt, functions, test_category): functions = language_specific_pre_processing(functions, test_category, True) if type(functions) is not list: diff --git a/berkeley-function-call-leaderboard/model_handler/gemini_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py similarity index 61% rename from berkeley-function-call-leaderboard/model_handler/gemini_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py index 328ba399b..0707b21e6 100644 --- a/berkeley-function-call-leaderboard/model_handler/gemini_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py @@ -1,25 +1,74 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - convert_to_tool, - convert_to_function_call, - augment_prompt_by_languge, - language_specific_pre_processing, -) -from model_handler.constant import GORILLA_TO_OPENAPI -import subprocess, requests, json, time +import subprocess +import time +import json +import os + +import requests + +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle class GeminiHandler(BaseHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: + model_style = ModelStyle.GOOGLE + + def __init__( + self, + model_name, + temperature=0.7, + top_p=1, + max_tokens=1000, + gcp_project_id: str | None = None + ) -> None: + super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Google + if gcp_project_id is None: + gcp_project_id = os.getenv('GEMINI_GCP_PROJECT_ID') + + assert gcp_project_id, ( + '`gcp_project_id` cannot be empty! To run the gemini model, you need to provide ' + 'your own GCP project ID, which can be found in the GCP console.' + ) + self.api_url = ( + f'https://us-central1-aiplatform.googleapis.com/v1beta1/projects/{gcp_project_id}/locations/us-central1/publishers/google/models/' + + self.model_name + + ":generateContent" + ) - def _query_gemini(self, user_query, functions): - """ - Query Gemini Pro model. 
- """ + @classmethod + def supported_models(cls): + return [ + 'gemini-1.0-pro', + 'gemini-1.5-pro-preview-0409', + 'gemini-1.5-pro-preview-0514', + 'gemini-1.5-flash-preview-0514', + ] + + def inference(self, prompt, functions, test_category): + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, True) + gemini_tool = utils.convert_to_tool(functions, constants.GORILLA_TO_OPENAPI, self.model_style, test_category, True) + result, metadata = self._query_gemini(prompt, gemini_tool) + return result, metadata + + def decode_ast(self, result, language="python"): + if type(result) is not list: + result = [result] + decoded_output = [] + for invoked_function in result: + name = list(invoked_function.keys())[0] + params = json.loads(invoked_function[name]) + if language != "Python": + for key in params: + params[key] = str(params[key]) + decoded_output.append({name: params}) + return decoded_output + + def decode_execute(self, result): + return utils.convert_to_function_call(result) + def _query_gemini(self, user_query, functions): token = subprocess.run( "gcloud auth print-access-token", check=False, @@ -41,19 +90,12 @@ def _query_gemini(self, user_query, functions): }, "tools": {"function_declarations": functions}, } - - # NOTE: To run the gemini model, you need to provide your own GCP project ID, which can be found in the GCP console. - API_URL = "https://us-central1-aiplatform.googleapis.com/v1beta1/projects/{YOUR_GCP_PROJECT_ID_HERE}/locations/us-central1/publishers/google/models/" + self.model_name + ":generateContent" headers = { "Authorization": "Bearer " + token, "Content-Type": "application/json", } start = time.time() - response = requests.post( - API_URL, - headers=headers, - data=json.dumps(json_data), - ) + response = requests.post(self.api_url, headers=headers, data=json.dumps(json_data)) latency = time.time() - start result = json.loads(response.content) if "error" in result: @@ -77,14 +119,10 @@ def _query_gemini(self, user_query, functions): parts.append(part["text"]) result = parts metatdata = {} - metatdata["input_tokens"] = json.loads(response.content)["usageMetadata"][ - "promptTokenCount" - ] - metatdata["output_tokens"] = json.loads(response.content)["usageMetadata"][ - "candidatesTokenCount" - ] + metatdata["input_tokens"] = json.loads(response.content)["usageMetadata"]["promptTokenCount"] + metatdata["output_tokens"] = json.loads(response.content)["usageMetadata"]["candidatesTokenCount"] metatdata["latency"] = latency - except Exception as e: + except Exception: result = "Parsing error: " + json.dumps(result) metatdata = { "input_tokens": 0, @@ -92,28 +130,3 @@ def _query_gemini(self, user_query, functions): "latency": latency, } return result, metatdata - - def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, True) - gemini_tool = convert_to_tool( - functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True - ) - result, metadata = self._query_gemini(prompt, gemini_tool) - return result, metadata - - def decode_ast(self, result, language="Python"): - if type(result) is not list: - result = [result] - decoded_output = [] - for invoked_function in result: - name = list(invoked_function.keys())[0] - params = json.loads(invoked_function[name]) - if language != "Python": - for key in params: - params[key] = str(params[key]) - 
decoded_output.append({name: params}) - return decoded_output - - def decode_execute(self, result): - return convert_to_function_call(result) diff --git a/berkeley-function-call-leaderboard/model_handler/gorilla_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gorilla.py similarity index 73% rename from berkeley-function-call-leaderboard/model_handler/gorilla_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gorilla.py index 70fe0e54a..1585cfd4c 100644 --- a/berkeley-function-call-leaderboard/model_handler/gorilla_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gorilla.py @@ -1,17 +1,51 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - ast_parse, - augment_prompt_by_languge, - language_specific_pre_processing, -) -import requests, json, re, time +import json +import time + +import requests + +from bfcl.model_handler import utils +from bfcl.model_handler.base import BaseHandler, ModelStyle class GorillaHandler(BaseHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Gorilla + model_style = ModelStyle.GORILLA + + @classmethod + def supported_models(cls): + return [ + 'gorilla-openfunctions-v0', + 'gorilla-openfunctions-v2', + ] + + def inference(self, prompt, functions, test_category): + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, False) + if type(functions) is not list: + functions = [functions] + try: + result, metadata = self._get_gorilla_response(prompt, functions) + except KeyboardInterrupt: + raise KeyboardInterrupt + except: + result = "Error" + metadata = {"input_tokens": 0, "output_tokens": 0, "latency": 0} + return result, metadata + + def decode_ast(self, result, language="python"): + func = "[" + result + "]" + decoded_output = utils.ast_parse(func, language) + return decoded_output + + def decode_execute(self, result): + func = "[" + result + "]" + decoded_output = utils.ast_parse(func) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list def _get_gorilla_response(self, prompt, functions): requestData = { @@ -40,33 +74,3 @@ def _get_gorilla_response(self, prompt, functions): metadata["latency"] = latency directCode = jsonResponse["choices"][0]["message"]["content"] return directCode, metadata - - def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, False) - if type(functions) is not list: - functions = [functions] - try: - result, metadata = self._get_gorilla_response(prompt, functions) - except KeyboardInterrupt: - raise KeyboardInterrupt - except: - result = "Error" - metadata = {"input_tokens": 0, "output_tokens": 0, "latency": 0} - return result, metadata - - def decode_ast(self, result, language="Python"): - func = "[" + result + "]" - decoded_output = ast_parse(func, language) - return decoded_output - - def decode_execute(self, result): - func = "[" + result + "]" - decoded_output = ast_parse(func) - execution_list = [] - for 
function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/mistral_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/mistral.py similarity index 63% rename from berkeley-function-call-leaderboard/model_handler/mistral_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/mistral.py index cb43a93fd..b2f5a94f5 100644 --- a/berkeley-function-call-leaderboard/model_handler/mistral_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/mistral.py @@ -1,44 +1,44 @@ -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, - GORILLA_TO_OPENAPI, -) -from model_handler.utils import ( - convert_to_tool, - ast_parse, - convert_to_function_call, - augment_prompt_by_languge, - language_specific_pre_processing, -) +import time +import os +import json + from mistralai.client import MistralClient from mistralai.models.chat_completion import ChatMessage -import os, time, json + +from bfcl.model_handler import utils, constants +from bfcl.model_handler.base import BaseHandler, ModelStyle class MistralHandler(BaseHandler): + model_style = ModelStyle.MISTRAL + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Mistral - self.client = MistralClient(api_key=os.getenv("MISTRAL_API_KEY")) + @classmethod + def supported_models(cls): + return [ + 'mistral-tiny-2312', + 'mistral-small-2402', + 'mistral-small-2402-FC-Any', + 'mistral-small-2402-FC-Auto', + 'mistral-medium-2312', + 'mistral-large-2402', + 'mistral-large-2402-FC-Any', + 'mistral-large-2402-FC-Auto', + ] + def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) + prompt = utils.augment_prompt_by_languge(prompt, test_category) if "FC" in self.model_name: - functions = language_specific_pre_processing(functions, test_category, True) - tool = convert_to_tool( - functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True + functions = utils.language_specific_pre_processing(functions, test_category, True) + tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_OPENAPI, self.model_style, test_category, True ) - message = [ - ChatMessage(role="user", content=prompt), - ] + message = [ChatMessage(role="user", content=prompt)] start = time.time() - if "Any" in self.model_name: - tool_choice = "any" - else: - tool_choice = "auto" + tool_choice = "any" if "Any" in self.model_name else "auto" chat_response = self.client.chat( model=self.model_name.replace("-FC-Any", "").replace("-FC-Auto", ""), messages=message, @@ -56,16 +56,13 @@ def inference(self, prompt, functions, test_category): except: result = chat_response.choices[0].message.content else: - functions = language_specific_pre_processing( - functions, test_category, False - ) + functions = utils.language_specific_pre_processing(functions, test_category, False) message = [ - ChatMessage(role="system", content=SYSTEM_PROMPT_FOR_CHAT_MODEL), + ChatMessage(role="system", content=constants.SYSTEM_PROMPT_FOR_CHAT_MODEL), ChatMessage( role="user", - 
content=USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + content=constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), ), ] start = time.time() @@ -84,7 +81,7 @@ def inference(self, prompt, functions, test_category): } return result, metadata - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): if "FC" in self.model_name: decoded_output = [] for invoked_function in result: @@ -102,21 +99,19 @@ def decode_ast(self, result, language="Python"): func = "[" + func if not func.endswith("]"): func = func + "]" - decoded_output = ast_parse(func, language) + decoded_output = utils.ast_parse(func, language) return decoded_output def decode_execute(self, result): if "FC" in self.model_name: - function_call = convert_to_function_call(result) + function_call = utils.convert_to_function_call(result) return function_call else: func = result func = func.replace("\\_", "_") - decode_output = ast_parse(func) + decode_output = utils.ast_parse(func) execution_list = [] for function_call in decode_output: for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) + execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})") return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/nexus_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nexus.py similarity index 67% rename from berkeley-function-call-leaderboard/model_handler/nexus_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nexus.py index 5dfa8ecdb..6a6591a29 100644 --- a/berkeley-function-call-leaderboard/model_handler/nexus_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nexus.py @@ -1,32 +1,89 @@ -from model_handler.model_style import ModelStyle -from model_handler.handler import BaseHandler -from model_handler.utils import ( - ast_parse, - augment_prompt_by_languge, - language_specific_pre_processing, -) -import requests, time +import time +import requests + +from bfcl.model_handler import utils +from bfcl.model_handler.base import BaseHandler, ModelStyle + + +FUNCTION_TEMPLATE = '''Function: +def {func_name}({func_args}) -> None: + """ + {description} + + Parameters: + {param_descriptions} + """ + +''' + +OUT_OF_DOMAIN_FUNCTION = '''Function: +def out_of_domain(user_query: str) -> str: + """ + This function is designed to handle out-of-domain queries from the user. + If the user provides any input user query that is out of the domain of the other APIs provided above, + this function should be used with the input user query as the string. + + - user_query (str): The input string that is out of domain. + + Returns nothing. 
+ """ + +''' class NexusHandler(BaseHandler): + model_style = ModelStyle.NEXUS + def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None: - temperature = 0.001 - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.NEXUS + super().__init__(model_name=model_name, temperature=0.001, top_p=top_p, max_tokens=max_tokens) - def generate_functions_from_dict(self, func_dicts): - func_template = """ - Function: - def {func_name}({func_args}) -> None: - \"\"\" - {description} + @classmethod + def supported_models(cls): + return [ + 'Nexusflow-Raven-v2', + ] - Parameters: - {param_descriptions} - \"\"\" + def inference(self, prompt, functions, test_category): + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, False) + raven_prompt = self._format_raven_function(prompt, functions) + result, metadata = self._query_raven(raven_prompt) + return result, metadata - """ + def decode_ast(self, result, language="python"): + if result.endswith(";"): + result = result[:-1] + result = result.replace(";", ",") + func = "[" + result + "]" + decoded_output = utils.ast_parse(func, language) + if "out_of_domain" in result: + return "irrelevant" + return decoded_output + + def decode_execute(self, result): + if result.endswith(";"): + result = result[:-1] + result = result.replace(";", ",") + func = "[" + result + "]" + decoded_output = utils.ast_parse(func) + execution_list = [] + for function_call in decoded_output: + for key, value in function_call.items(): + execution_list.append( + f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" + ) + return execution_list + + def _format_raven_function(self, user_prompt, functions): + """Nexus-Raven requires a specific format for the function description. This + function formats the function description in the required format.""" + + raven_prompt = "\n".join(self._generate_functions_from_dict(functions)) + "\n\n" + raven_prompt += "Setting: Allowed to issue multiple calls with semicolon\n" + raven_prompt += "User Query:" + user_prompt.replace("\n", "") + "" + return raven_prompt + def _generate_functions_from_dict(self, func_dicts): functions = [] for func_dict in func_dicts: func_name = func_dict['name'] @@ -43,7 +100,6 @@ def {func_name}({func_args}) -> None: param_type = f"""String[{', '.join(f"'{e}'" for e in details['enum'])}]""" param_type = param_type.replace("string", "str").replace("number", "float").replace("integer", "int").replace("object", "dict").replace("array", "list").replace("boolean", "bool") - type_hint = param_type if param in required_params: @@ -63,7 +119,7 @@ def {func_name}({func_args}) -> None: func_args = ', '.join(func_args_list) param_descriptions_str = '\n '.join(param_descriptions) - function_str = func_template.format( + function_str = FUNCTION_TEMPLATE.format( func_name=func_name, func_args=func_args, description=description, @@ -72,50 +128,16 @@ def {func_name}({func_args}) -> None: functions.append(function_str) - functions.append( - ''' - Function: - def out_of_domain(user_query: str) -> str: - """ - This function is designed to handle out-of-domain queries from the user. - If the user provides any input user query that is out of the domain of the other APIs provided above, - this function should be used with the input user query as the string. - - - user_query (str): The input string that is out of domain. - - Returns nothing. 
- """ - - ''') - + functions.append(OUT_OF_DOMAIN_FUNCTION) return functions - - def _format_raven_function(self, user_prompt, functions): - """ - Nexus-Raven requires a specific format for the function description. - This function formats the function description in the required format. - """ - raven_prompt = "\n".join(self.generate_functions_from_dict(functions)) + "\n\n" - raven_prompt += "Setting: Allowed to issue multiple calls with semicolon\n" - raven_prompt += "User Query:" + user_prompt.replace("\n", "") + "" - return raven_prompt - - - def _query_raven(self, prompt): - """ - Query Nexus-Raven. - """ - - API_URL = "http://nexusraven.nexusflow.ai" + api_url = "http://nexusraven.nexusflow.ai" headers = {"Content-Type": "application/json"} def query(payload): - """ - Sends a payload to a TGI endpoint. - """ - response = requests.post(API_URL, headers=headers, json=payload) + """Sends a payload to a TGI endpoint.""" + response = requests.post(api_url, headers=headers, json=payload) return response.json() start = time.time() @@ -135,34 +157,4 @@ def query(payload): call = output[0]["generated_text"].replace("Call:", "").strip() return call, {"input_tokens": 0, "output_tokens": 0, "latency": latency} - def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, False) - raven_prompt = self._format_raven_function(prompt, functions) - result, metadata = self._query_raven(raven_prompt) - return result, metadata - - def decode_ast(self, result, language="Python"): - if result.endswith(";"): - result = result[:-1] - result = result.replace(";", ",") - func = "[" + result + "]" - decoded_output = ast_parse(func, language) - if "out_of_domain" in result: - return "irrelevant" - - return decoded_output - - def decode_execute(self, result): - if result.endswith(";"): - result = result[:-1] - result = result.replace(";", ",") - func = "[" + result + "]" - decoded_output = ast_parse(func) - execution_list = [] - for function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list + diff --git a/berkeley-function-call-leaderboard/model_handler/nvidia_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nvidia.py similarity index 56% rename from berkeley-function-call-leaderboard/model_handler/nvidia_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nvidia.py index dc49b794b..02b29418d 100644 --- a/berkeley-function-call-leaderboard/model_handler/nvidia_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/nvidia.py @@ -1,42 +1,41 @@ -import time,os,json +import time +import os + from openai import OpenAI -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ast_parse -from model_handler.utils import ( - augment_prompt_by_languge, - language_specific_pre_processing, -) -from model_handler.constant import ( - USER_PROMPT_FOR_CHAT_MODEL, - SYSTEM_PROMPT_FOR_CHAT_MODEL, -) + +from bfcl.model_handler import utils +from bfcl.model_handler import constants +from bfcl.model_handler.base import BaseHandler, ModelStyle + class NvidiaHandler(BaseHandler): + model_style = ModelStyle.OPENAI + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> 
None: - self.model_name = model_name - self.temperature = temperature - self.top_p = top_p - self.max_tokens = max_tokens - self.model_style = ModelStyle.OpenAI + super().__init__(model_name, temperature, top_p, max_tokens) self.client = OpenAI( - base_url = "https://integrate.api.nvidia.com/v1", - api_key = os.getenv("NVIDIA_API_KEY") + base_url="https://integrate.api.nvidia.com/v1", + api_key=os.getenv("NVIDIA_API_KEY") ) + + @classmethod + def supported_models(cls): + return [ + 'nvidia/nemotron-4-340b-instruct', + ] + def inference(self, prompt, functions, test_category): - prompt = augment_prompt_by_languge(prompt,test_category) - functions = language_specific_pre_processing(functions,test_category,False) + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, False) message = [ { "role": "system", - "content": SYSTEM_PROMPT_FOR_CHAT_MODEL, + "content": constants.SYSTEM_PROMPT_FOR_CHAT_MODEL, }, { "role": "user", - "content": "Questions:" - + USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + "content": "Questions:" + constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), }, ] start_time = time.time() @@ -53,18 +52,8 @@ def inference(self, prompt, functions, test_category): output_token = response.usage.completion_tokens metadata = {"input_tokens": input_token, "output_tokens": output_token, "latency": latency} return result, metadata - - def write(self, result, file_to_open): - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists("./result/" + self.model_name.replace("/", "_")): - os.mkdir("./result/" + self.model_name.replace("/", "_")) - with open( - "./result/" + self.model_name.replace("/", "_") + "/" + file_to_open.replace(".json", "_result.json"), "a+" - ) as f: - f.write(json.dumps(result) + "\n") - def decode_ast(self, result, language="Python"): + def decode_ast(self, result, language="python"): result = result.replace("\n", "") if not result.startswith("["): result = "[ " + result @@ -76,10 +65,10 @@ def decode_ast(self, result, language="Python"): result = result.replace("','", ", ") if result.endswith("']"): result = result.replace("']", "]") - decode_output = ast_parse(result, language) + decode_output = utils.ast_parse(result, language) return decode_output - def decode_execute(self, result, language="Python"): + def decode_execute(self, result, language="python"): result = result.replace("\n", "") if not result.startswith("["): result = "[ " + result @@ -91,7 +80,7 @@ def decode_execute(self, result, language="Python"): result = result.replace("','", ", ") if result.endswith("']"): result = result.replace("']", "]") - decode_output = ast_parse(result, language) + decode_output = utils.ast_parse(result, language) execution_list = [] for function_call in decode_output: for key, value in function_call.items(): diff --git a/berkeley-function-call-leaderboard/model_handler/gpt_handler.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py similarity index 65% rename from berkeley-function-call-leaderboard/model_handler/gpt_handler.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py index f8e4de938..354f4fe98 100644 --- a/berkeley-function-call-leaderboard/model_handler/gpt_handler.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/openai.py @@ -1,42 +1,50 @@ -from 
model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - convert_to_tool, - convert_to_function_call, - augment_prompt_by_languge, - language_specific_pre_processing, - ast_parse, -) -from model_handler.constant import ( - GORILLA_TO_OPENAPI, - GORILLA_TO_PYTHON, - USER_PROMPT_FOR_CHAT_MODEL, - SYSTEM_PROMPT_FOR_CHAT_MODEL, -) +import time +import os +import json + from openai import OpenAI -import os, time, json + +from bfcl.model_handler import utils, constants +from bfcl.model_handler.base import BaseHandler, ModelStyle class OpenAIHandler(BaseHandler): + model_style = ModelStyle.OPENAI + def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.OpenAI self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) - def inference(self, prompt,functions,test_category): + @classmethod + def supported_models(cls): + return [ + 'gpt-4o-2024-05-13', + 'gpt-4o-2024-05-13-FC', + 'gpt-4-turbo-2024-04-09', + 'gpt-4-turbo-2024-04-09-FC', + 'gpt-4-1106-preview', + 'gpt-4-1106-preview-FC', + 'gpt-4-0125-preview', + 'gpt-4-0125-preview-FC', + 'gpt-4-0613', + 'gpt-4-0613-FC', + 'gpt-3.5-turbo-0125', + 'gpt-3.5-turbo-0125-FC', + ] + + def inference(self, prompt, functions, test_category): if "FC" not in self.model_name: - prompt = augment_prompt_by_languge(prompt,test_category) - functions = language_specific_pre_processing(functions,test_category,False) + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, False) message = [ { "role": "system", - "content": SYSTEM_PROMPT_FOR_CHAT_MODEL, + "content": constants.SYSTEM_PROMPT_FOR_CHAT_MODEL, }, { "role": "user", - "content": USER_PROMPT_FOR_CHAT_MODEL.format( - user_prompt=prompt, functions=str(functions) - ), + "content": constants.USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, + functions=str(functions)), }, ] start_time = time.time() @@ -50,13 +58,13 @@ def inference(self, prompt,functions,test_category): latency = time.time() - start_time result = response.choices[0].message.content else: - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, True) + prompt = utils.augment_prompt_by_languge(prompt, test_category) + functions = utils.language_specific_pre_processing(functions, test_category, True) if type(functions) is not list: functions = [functions] message = [{"role": "user", "content": prompt}] - oai_tool = convert_to_tool( - functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True + oai_tool = utils.convert_to_tool( + functions, constants.GORILLA_TO_OPENAPI, self.model_style, test_category, True ) start_time = time.time() if len(oai_tool) > 0: @@ -90,26 +98,24 @@ def inference(self, prompt,functions,test_category): metadata["latency"] = latency return result,metadata - def decode_ast(self,result,language="Python"): + def decode_ast(self, result, language="python"): if "FC" not in self.model_name: - decoded_output = ast_parse(result,language) + decoded_output = utils.ast_parse(result,language) else: decoded_output = [] for invoked_function in result: name = list(invoked_function.keys())[0] params = json.loads(invoked_function[name]) - if language == "Python": - pass - else: + if language.lower() != "python": # all values of the json are casted to string for java and javascript for 
key in params: params[key] = str(params[key]) decoded_output.append({name: params}) return decoded_output - def decode_execute(self,result): + def decode_execute(self, result): if "FC" not in self.model_name: - decoded_output = ast_parse(result) + decoded_output = utils.ast_parse(result) execution_list = [] for function_call in decoded_output: for key, value in function_call.items(): @@ -118,5 +124,5 @@ def decode_execute(self,result): ) return execution_list else: - function_call = convert_to_function_call(result) + function_call = utils.convert_to_function_call(result) return function_call diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/snowflake.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/snowflake.py new file mode 100644 index 000000000..5c93b5fd5 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/snowflake.py @@ -0,0 +1,10 @@ +from bfcl.model_handler.proprietary_model.nvidia import NvidiaHandler + + +class SnowflakeHandler(NvidiaHandler): + + @classmethod + def supported_models(cls): + return [ + 'snowflake/arctic', + ] diff --git a/berkeley-function-call-leaderboard/model_handler/utils.py b/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py similarity index 91% rename from berkeley-function-call-leaderboard/model_handler/utils.py rename to berkeley-function-call-leaderboard/bfcl/model_handler/utils.py index 4844f9fcc..a7d977c00 100644 --- a/berkeley-function-call-leaderboard/model_handler/utils.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py @@ -1,9 +1,11 @@ -import re, ast, builtins, ast, json -from model_handler.model_style import ModelStyle -from model_handler.constant import JAVA_TYPE_CONVERSION, JS_TYPE_CONVERSION -from model_handler.java_parser import parse_java_function_call -from model_handler.js_parser import parse_javascript_function_call -from model_handler.constant import GORILLA_TO_OPENAPI, USE_COHERE_OPTIMIZATION +import re +import ast +import builtins +import json + +from bfcl.model_handler import parser +from bfcl.model_handler.base import ModelStyle +from bfcl.model_handler import constants def _cast_to_openai_type(properties, mapping, test_category): @@ -12,7 +14,7 @@ def _cast_to_openai_type(properties, mapping, test_category): properties[key]["type"] = "string" else: var_type = value["type"] - if mapping == GORILLA_TO_OPENAPI and var_type == "float": + if mapping == constants.GORILLA_TO_OPENAPI and var_type == "float": properties[key]["format"] = "float" properties[key]["description"] += " This is a float type value." if var_type in mapping: @@ -58,13 +60,13 @@ def convert_to_tool( ): oai_tool = [] for item in functions: - if "." in item["name"] and ( - model_style == ModelStyle.OpenAI - or model_style == ModelStyle.Mistral - or model_style == ModelStyle.Google - or model_style == ModelStyle.OSSMODEL - or model_style == ModelStyle.Anthropic_FC - or model_style == ModelStyle.COHERE + if "." in item["name"] and model_style in ( + ModelStyle.OPENAI, + ModelStyle.MISTRAL, + ModelStyle.GOOGLE, + ModelStyle.ANTHROPIC_FC, + ModelStyle.COHERE, + ModelStyle.OSS_MODEL, ): # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name. item["name"] = re.sub(r"\.", "_", item["name"]) @@ -73,33 +75,29 @@ def convert_to_tool( item["parameters"]["properties"], mapping, test_category ) # When Java and Javascript, for OpenAPI compatible models, let it become string. 
- if ( - model_style - in [ - ModelStyle.OpenAI, - ModelStyle.Mistral, - ModelStyle.Google, - ModelStyle.Anthropic_Prompt, - ModelStyle.Anthropic_FC, - ModelStyle.FIREWORK_AI, - ModelStyle.OSSMODEL, - ModelStyle.COHERE, - ] - and stringify_parameters + if stringify_parameters and model_style in ( + ModelStyle.OPENAI, + ModelStyle.MISTRAL, + ModelStyle.GOOGLE, + ModelStyle.ANTHROPIC_FC, + ModelStyle.ANTHROPIC_PROMPT, + ModelStyle.FIREWORK_AI, + ModelStyle.COHERE, + ModelStyle.OSS_MODEL, ): properties = item["parameters"]["properties"] if test_category == "java": for key, value in properties.items(): - if value["type"] in JAVA_TYPE_CONVERSION: + if value["type"] in constants.JAVA_TYPE_CONVERSION: properties[key]["type"] = "string" elif test_category == "javascript": for key, value in properties.items(): - if value["type"] in JS_TYPE_CONVERSION: + if value["type"] in constants.JS_TYPE_CONVERSION: properties[key]["type"] = "string" - if model_style == ModelStyle.Anthropic_FC: + if model_style == ModelStyle.ANTHROPIC_FC: item["input_schema"] = item["parameters"] del item["parameters"] - if model_style == ModelStyle.Google: + if model_style == ModelStyle.GOOGLE: # Remove fields that are not supported by Gemini today. for params in item["parameters"]["properties"].values(): if "default" in params: @@ -113,7 +111,7 @@ def convert_to_tool( params["description"] += "The additional properties:" +str(params["additionalProperties"]) del params["additionalProperties"] if model_style == ModelStyle.COHERE: - if USE_COHERE_OPTIMIZATION: + if constants.USE_COHERE_OPTIMIZATION: if "required" not in item["parameters"]: item["parameters"]["required"] = [] for param_name, params in item["parameters"]["properties"].items(): @@ -181,11 +179,11 @@ def convert_to_tool( if "properties" in params: params["description"] += " Dictionary properties: " + str(params["properties"]) del params["properties"] - if model_style in [ - ModelStyle.Anthropic_Prompt, - ModelStyle.Google, - ModelStyle.OSSMODEL, - ]: + if model_style in ( + ModelStyle.ANTHROPIC_PROMPT, + ModelStyle.GOOGLE, + ModelStyle.OSS_MODEL, + ): oai_tool.append(item) elif model_style == ModelStyle.COHERE: parameter = item["parameters"]["properties"] @@ -204,11 +202,11 @@ def convert_to_tool( "parameter_definitions": parameter_definitions, } ) - elif model_style in [ - ModelStyle.OpenAI, - ModelStyle.Mistral, + elif model_style in ( + ModelStyle.OPENAI, + ModelStyle.MISTRAL, ModelStyle.FIREWORK_AI, - ]: + ): oai_tool.append({"type": "function", "function": item}) return oai_tool @@ -250,20 +248,20 @@ def convert_value(value, type_str): return value -def ast_parse(input_str, language="Python"): - if language == "Python": +def ast_parse(input_str, language="python"): + if language.lower() == "python": parsed = ast.parse(input_str, mode="eval") extracted = [] for elem in parsed.body.elts: assert isinstance(elem, ast.Call) extracted.append(resolve_ast_by_type(elem)) return extracted - elif language == "Java": - return parse_java_function_call( + elif language.lower() == "java": + return parser.parse_java_function_call( input_str[1:-1] ) # Remove the [ and ] from the string - elif language == "JavaScript": - return parse_javascript_function_call(input_str[1:-1]) + elif language.lower() == "javascript": + return parser.parse_javascript_function_call(input_str[1:-1]) else: raise NotImplementedError(f"Unsupported language: {language}") @@ -311,7 +309,7 @@ def resolve_ast_by_type(value): elif isinstance(value, ast.Name): output = value.id elif isinstance(value, 
ast.Call): - if len(value.keywords)==0: + if len(value.keywords) == 0: output = ast.unparse(value) else: output = resolve_ast_call(value) @@ -446,7 +444,7 @@ def construct_format_parameters_prompt(parameters): return constructed_prompt -def _function_calls_valid_format_and_invoke_extraction(last_completion): +def function_calls_valid_format_and_invoke_extraction(last_completion): """Check if the function call follows a valid format and extract the attempted function calls if so. Does not check if the tools actually exist or if they are called with the requisite params.""" # Check if there are any of the relevant XML tags present that would indicate an attempted function call. @@ -562,7 +560,7 @@ def _function_calls_valid_format_and_invoke_extraction(last_completion): } -def _convert_value(value, type_str): +def convert_value(value, type_str): """Convert a string value into its appropriate Python data type based on the provided type string. Arg: diff --git a/berkeley-function-call-leaderboard/model_handler/arctic_handler.py b/berkeley-function-call-leaderboard/model_handler/arctic_handler.py deleted file mode 100644 index fdfd9d219..000000000 --- a/berkeley-function-call-leaderboard/model_handler/arctic_handler.py +++ /dev/null @@ -1,41 +0,0 @@ -from model_handler.nvidia_handler import NvidiaHandler -from model_handler.utils import ast_parse - -class ArcticHandler(NvidiaHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - def decode_ast(self, result, language="Python"): - result = result.replace("\n", "") - if not result.startswith("["): - result = "[ " + result - if not result.endswith("]"): - result = result + " ]" - if result.startswith("['"): - result = result.replace("['", "[") - result = result.replace("', '", ", ") - result = result.replace("','", ", ") - if result.endswith("']"): - result = result.replace("']", "]") - decode_output = ast_parse(result, language) - return decode_output - - def decode_execute(self, result, language="Python"): - result = result.replace("\n", "") - if not result.startswith("["): - result = "[ " + result - if not result.endswith("]"): - result = result + " ]" - if result.startswith("['"): - result = result.replace("['", "[") - result = result.replace("', '", ", ") - result = result.replace("','", ", ") - if result.endswith("']"): - result = result.replace("']", "]") - decode_output = ast_parse(result, language) - execution_list = [] - for function_call in decode_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py b/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py deleted file mode 100644 index be108408b..000000000 --- a/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py +++ /dev/null @@ -1,88 +0,0 @@ -import json -import os -import time - -from anthropic import Anthropic -from anthropic.types import TextBlock, ToolUseBlock -from model_handler.claude_prompt_handler import ClaudePromptingHandler -from model_handler.constant import GORILLA_TO_OPENAPI -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - ast_parse, - augment_prompt_by_languge, - convert_to_function_call, - convert_to_tool, - 
language_specific_pre_processing, -) - - -class ClaudeFCHandler(BaseHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.Anthropic_Prompt - - self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) - - def inference(self, prompt, functions, test_category): - if "FC" not in self.model_name: - handler = ClaudePromptingHandler(self.model_name, self.temperature, self.top_p, self.max_tokens) - return handler.inference(prompt, functions, test_category) - else: - prompt = augment_prompt_by_languge(prompt, test_category) - functions = language_specific_pre_processing(functions, test_category, True) - if type(functions) is not list: - functions = [functions] - claude_tool = convert_to_tool( - functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True - ) - message = [{"role": "user", "content": prompt}] - start_time = time.time() - - response = self.client.messages.create( - model=self.model_name.strip("-FC"), - max_tokens=self.max_tokens, - tools=claude_tool, - messages=message, - ) - latency = time.time() - start_time - text_outputs = [] - tool_call_outputs = [] - for content in response.content: - if isinstance(content, TextBlock): - text_outputs.append(content.text) - elif isinstance(content, ToolUseBlock): - tool_call_outputs.append({content.name: json.dumps(content.input)}) - result = tool_call_outputs if tool_call_outputs else text_outputs[0] - return result, {"input_tokens": response.usage.input_tokens, "output_tokens": response.usage.output_tokens, "latency": latency} - - def decode_ast(self,result,language="Python"): - if "FC" not in self.model_name: - decoded_output = ast_parse(result,language) - else: - decoded_output = [] - for invoked_function in result: - name = list(invoked_function.keys())[0] - params = json.loads(invoked_function[name]) - if language == "Python": - pass - else: - # all values of the json are casted to string for java and javascript - for key in params: - params[key] = str(params[key]) - decoded_output.append({name: params}) - return decoded_output - - def decode_execute(self,result): - if "FC" not in self.model_name: - decoded_output = ast_parse(result) - execution_list = [] - for function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list - else: - function_call = convert_to_function_call(result) - return function_call diff --git a/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py b/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py deleted file mode 100644 index bedeb03a8..000000000 --- a/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py +++ /dev/null @@ -1,46 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import convert_to_function_call, ast_parse -import re - - -class DeepseekHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - formatted_prompt = """ - You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n - ### Instruction:\n - You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed-\n - {function}\n - Here is the question: {prompt}\n - Your job is to solve the above question using ONLY and strictly ONE line of python code given the above functions. If you think no function should be invoked return "[]".\n - If you think one or more function should be invoked, return the function call in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)] wrapped in python code" - ### Response:\n - """ - return formatted_prompt.format(function=function, prompt=prompt) - - def inference( - self, question_file, test_category, num_gpus, fromat_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, fromat_prompt_func - ) - - def decode_ast(self, result, language="Python"): - function_call = result.split("```")[1] - matches = re.findall(r"\[[^\]]*\]", function_call) - decoded_output = ast_parse(matches[0], language) - return decoded_output - - def decode_execute(self, result): - function_call = result.split("```")[1] - matches = re.findall(r"\[[^\]]*\]", function_call) - decoded_output = ast_parse(matches[0]) - execution_list = [] - for function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/functionary_handler.py b/berkeley-function-call-leaderboard/model_handler/functionary_handler.py deleted file mode 100644 index 2213e758d..000000000 --- a/berkeley-function-call-leaderboard/model_handler/functionary_handler.py +++ /dev/null @@ -1,26 +0,0 @@ -from model_handler.gpt_handler import OpenAIHandler -from model_handler.model_style import ModelStyle -import os, json -from openai import OpenAI - -# For setup instructions, please refer to https://github.com/MeetKai/functionary for setup details. 
-class FunctionaryHandler(OpenAIHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - self.temperature = temperature - self.top_p = top_p - self.max_tokens = max_tokens - self.model_name = model_name - self.model_style = ModelStyle.OpenAI - - self.client = OpenAI(base_url="http://localhost:8000/v1", api_key="functionary") - - def write(self, result, file_to_open): - model_name = self.model_name - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists("./result/" + model_name.replace("/", "_")): - os.mkdir("./result/" + model_name.replace("/", "_")) - with open( - "./result/" + model_name.replace("/", "_") + "/" + file_to_open, "a+" - ) as f: - f.write(json.dumps(result) + "\n") diff --git a/berkeley-function-call-leaderboard/model_handler/gemma_handler.py b/berkeley-function-call-leaderboard/model_handler/gemma_handler.py deleted file mode 100644 index fdb1f55d9..000000000 --- a/berkeley-function-call-leaderboard/model_handler/gemma_handler.py +++ /dev/null @@ -1,55 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import ast_parse -import re - - -class GemmaHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - formatted_prompt = """ - user\n - You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed-\n - {function}\n - Here is the questions you need to answer:\n - {prompt}\n - Your job is to solve the above question using ONLY and strictly ONE line of python code given the above functions. If you think no function should be invoked return "[]".\n - If you think one or more function should be invoked, return the function call in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)] wrapped in python code" - \n - model\n - """ - return formatted_prompt.format(function=function, prompt=prompt) - - def inference( - self, question_file, test_category, num_gpus, fromat_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, fromat_prompt_func - ) - - def decode_ast(self, result, language="Python"): - pattern = r"\[(.*)\]" - - # Searching for the pattern in the input text - match = re.search(pattern, result, re.DOTALL) - raw_input = match.group(1) - func = "[" + raw_input + "]" - decoded_output = ast_parse(func, language=language) - return decoded_output - - def decode_execute(self, result): - pattern = r"\[(.*)\]" - - # Searching for the pattern in the input text - match = re.search(pattern, result, re.DOTALL) - raw_input = match.group(1) - func = "[" + raw_input + "]" - decoded_output = ast_parse(func) - execution_list = [] - for function_call in decoded_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/glaive_handler.py b/berkeley-function-call-leaderboard/model_handler/glaive_handler.py deleted file mode 100644 index b5cdc6f7c..000000000 --- a/berkeley-function-call-leaderboard/model_handler/glaive_handler.py +++ /dev/null @@ -1,45 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import convert_to_function_call -import json - - -class 
GlaiveHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - formatted_prompt = """ - SYSTEM: You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed- - {function}\n - USER: {prompt}\n - """ - return formatted_prompt.format(function=function, prompt=prompt) - - def inference( - self, question_file, test_category, num_gpus, fromat_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, fromat_prompt_func - ) - - def decode_ast(self, result, language="Python"): - function_call = result.split("")[-1] - function_call = function_call.replace("'", "") - decoded_function = json.loads(function_call) - for key, value in decoded_function["arguments"].items(): - if language == "Python": - pass - else: - # all values of the json are casted to string for java and javascript - decoded_function["arguments"][key] = str( - decoded_function["arguments"][key] - ) - decoded_result = [{decoded_function["name"]: decoded_function["arguments"]}] - return decoded_result - - def decode_execute(self, result): - function_call = result.split("")[-1] - function_call = function_call.replace("'", "") - decoded_function = json.loads(function_call) - decoded_result = [{decoded_function["name"]: decoded_function["arguments"]}] - return convert_to_function_call(decoded_result) diff --git a/berkeley-function-call-leaderboard/model_handler/handler.py b/berkeley-function-call-leaderboard/model_handler/handler.py deleted file mode 100644 index dcad5eeda..000000000 --- a/berkeley-function-call-leaderboard/model_handler/handler.py +++ /dev/null @@ -1,50 +0,0 @@ -from model_handler.model_style import ModelStyle -import json, os - - -class BaseHandler: - model_name: str - model_style: ModelStyle - - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - self.model_name = model_name - self.temperature = temperature - self.top_p = top_p - self.max_tokens = max_tokens - - def inference(self, prompt, functions, test_category): - # This method is used to retrive model response for each model. - pass - - def decode_ast(self, result, language="Python"): - # This method takes raw model output and convert it to standard AST checker input. - pass - - def decode_execute(self, result): - # This method takes raw model output and convert it to standard execute checker input. - pass - - def write(self, result, file_to_open): - # This method is used to write the result to the file. - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists("./result/" + self.model_name): - os.mkdir("./result/" + self.model_name) - with open( - "./result/" - + self.model_name - + "/" - + file_to_open.replace(".json", "_result.json"), - "a+", - ) as f: - f.write(json.dumps(result) + "\n") - - def load_result(self, test_category): - # This method is used to load the result from the file. 
- result_list = [] - with open( - f"./result/{self.model_name}/gorilla_openfunctions_v1_test_{test_category}_result.json" - ) as f: - for line in f: - result_list.append(json.loads(line)) - return result_list diff --git a/berkeley-function-call-leaderboard/model_handler/handler_map.py b/berkeley-function-call-leaderboard/model_handler/handler_map.py deleted file mode 100644 index bc72c105b..000000000 --- a/berkeley-function-call-leaderboard/model_handler/handler_map.py +++ /dev/null @@ -1,79 +0,0 @@ -from model_handler.arctic_handler import ArcticHandler -from model_handler.claude_fc_handler import ClaudeFCHandler -from model_handler.claude_prompt_handler import ClaudePromptingHandler -from model_handler.cohere_handler import CohereHandler -from model_handler.databricks_handler import DatabricksHandler -from model_handler.deepseek_handler import DeepseekHandler -from model_handler.firework_ai_handler import FireworkAIHandler -from model_handler.functionary_handler import FunctionaryHandler -from model_handler.gemini_handler import GeminiHandler -from model_handler.gemma_handler import GemmaHandler -from model_handler.glaive_handler import GlaiveHandler -from model_handler.gorilla_handler import GorillaHandler -from model_handler.gpt_handler import OpenAIHandler -from model_handler.hermes_handler import HermesHandler -from model_handler.llama_handler import LlamaHandler -from model_handler.mistral_handler import MistralHandler -from model_handler.nexus_handler import NexusHandler -from model_handler.oss_handler import OSSHandler -from model_handler.nvidia_handler import NvidiaHandler - -handler_map = { - "gorilla-openfunctions-v0": GorillaHandler, - "gorilla-openfunctions-v2": GorillaHandler, - "gpt-4o-2024-05-13": OpenAIHandler, - "gpt-4o-2024-05-13-FC": OpenAIHandler, - "gpt-4-turbo-2024-04-09-FC": OpenAIHandler, - "gpt-4-turbo-2024-04-09": OpenAIHandler, - "gpt-4-1106-preview-FC": OpenAIHandler, - "gpt-4-1106-preview": OpenAIHandler, - "gpt-4-0125-preview-FC": OpenAIHandler, - "gpt-4-0125-preview": OpenAIHandler, - "gpt-4-0613-FC": OpenAIHandler, - "gpt-4-0613": OpenAIHandler, - "gpt-3.5-turbo-0125-FC": OpenAIHandler, - "gpt-3.5-turbo-0125": OpenAIHandler, - "claude-2.1": ClaudePromptingHandler, - "claude-instant-1.2": ClaudePromptingHandler, - "claude-3-opus-20240229": ClaudePromptingHandler, - "claude-3-opus-20240229-FC": ClaudeFCHandler, - "claude-3-sonnet-20240229": ClaudePromptingHandler, - "claude-3-sonnet-20240229-FC": ClaudeFCHandler, - "claude-3-haiku-20240307": ClaudePromptingHandler, - "claude-3-haiku-20240307-FC": ClaudeFCHandler, - "claude-3-5-sonnet-20240620": ClaudePromptingHandler, - "claude-3-5-sonnet-20240620-FC": ClaudeFCHandler, - "mistral-large-2402": MistralHandler, - "mistral-large-2402-FC-Any": MistralHandler, - "mistral-large-2402-FC-Auto": MistralHandler, - "mistral-medium-2312": MistralHandler, - "mistral-small-2402": MistralHandler, - "mistral-small-2402-FC-Any": MistralHandler, - "mistral-small-2402-FC-Auto": MistralHandler, - "mistral-tiny-2312": MistralHandler, - "firefunction-v1-FC": FireworkAIHandler, - "firefunction-v2-FC": FireworkAIHandler, - "Nexusflow-Raven-v2": NexusHandler, - "gemini-1.0-pro": GeminiHandler, - "gemini-1.5-pro-preview-0409": GeminiHandler, - "gemini-1.5-pro-preview-0514": GeminiHandler, - "gemini-1.5-flash-preview-0514": GeminiHandler, - "gemma": OSSHandler, - "google/gemma-7b-it": GemmaHandler, - "glaiveai/glaive-function-calling-v1": GlaiveHandler, - "deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler, - 
"meetkai/functionary-small-v2.2-FC": FunctionaryHandler, - "meetkai/functionary-medium-v2.2-FC": FunctionaryHandler, - "meetkai/functionary-small-v2.4-FC": FunctionaryHandler, - "meetkai/functionary-medium-v2.4-FC": FunctionaryHandler, - "databricks-dbrx-instruct": DatabricksHandler, - "NousResearch/Hermes-2-Pro-Mistral-7B": HermesHandler, - "meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler, - "meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler, - "command-r-plus-FC": CohereHandler, - "command-r-plus": CohereHandler, - "command-r-plus-FC-optimized": CohereHandler, - "command-r-plus-optimized": CohereHandler, - "snowflake/arctic": ArcticHandler, - "nvidia/nemotron-4-340b-instruct": NvidiaHandler, -} diff --git a/berkeley-function-call-leaderboard/model_handler/hermes_handler.py b/berkeley-function-call-leaderboard/model_handler/hermes_handler.py deleted file mode 100644 index 4d59555cd..000000000 --- a/berkeley-function-call-leaderboard/model_handler/hermes_handler.py +++ /dev/null @@ -1,92 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import convert_to_tool -from model_handler.constant import GORILLA_TO_OPENAPI -from model_handler.model_style import ModelStyle -import json - - -class HermesHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - # Hermes use Langchain to OpenAI conversion. It does not use tool call but function call. - function = convert_to_tool( - function, GORILLA_TO_OPENAPI, ModelStyle.OSSMODEL, test_category, True - ) - pydantic_format = """{"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}""" - tool_call_format = """{"arguments": , "name": }""" - formatted_prompt = """ -<|im_start|>system -You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: {function} Use the following pydantic model json schema for each tool call you will make: {pydantic_format} For each function call return a json object with function name and arguments within XML tags as follows: - -{tool_call_format} - -<|im_end|> -<|im_start|>user -{prompt} -<|im_end|> - """ - return formatted_prompt.format( - function=function, - pydantic_format=pydantic_format, - tool_call_format=tool_call_format, - prompt=prompt, - ) - - def inference( - self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, format_prompt_func - ) - - def decode_ast(self, result, language="Python"): - lines = result.split("\n") - flag = False - func_call = [] - for line in lines: - if "" == line: - flag = True - elif "" == line: - flag = False - else: - if flag: - line = line.replace("'", '"') - tool_result = json.loads(line) - if language == "Python": - pass - else: - # all values of the json are casted to string for java and javascript - for key in tool_result["arguments"]: - tool_result["arguments"][key] = str( - tool_result["arguments"][key] - ) - func_call.append({tool_result["name"]: tool_result["arguments"]}) - flag = False - return func_call - - def decode_execute(self, result): - lines = result.split("\n") - flag = False - function_call_list = [] - for line in lines: - if "" == line: - flag = True - elif "" == line: - flag = False - else: - if flag: - line = line.replace("'", '"') - tool_result = json.loads(line) - function_call_list.append( - {tool_result["name"]: tool_result["arguments"]} - ) - flag = False - execution_list = [] - for function_call in function_call_list: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k,v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/llama_handler.py b/berkeley-function-call-leaderboard/model_handler/llama_handler.py deleted file mode 100644 index 7b1e3fd5c..000000000 --- a/berkeley-function-call-leaderboard/model_handler/llama_handler.py +++ /dev/null @@ -1,48 +0,0 @@ -from model_handler.oss_handler import OSSHandler -from model_handler.utils import ast_parse -from model_handler.constant import ( - SYSTEM_PROMPT_FOR_CHAT_MODEL, - USER_PROMPT_FOR_CHAT_MODEL, -) - - -class LlamaHandler(OSSHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - - def _format_prompt(prompt, function, test_category): - conversations = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>{SYSTEM_PROMPT_FOR_CHAT_MODEL}<|eot_id|><|start_header_id|>user<|end_header_id|>{USER_PROMPT_FOR_CHAT_MODEL.format(user_prompt=prompt, functions=str(function))}<|eot_id|><|start_header_id|>assistant<|end_header_id|>""" - return conversations - - def inference( - self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt - ): - return super().inference( - question_file, test_category, num_gpus, format_prompt_func - ) - - def decode_ast(self, result, language="Python"): - func = result - func = func.replace("\n", "") # remove new line characters - if not func.startswith("["): - func = "[" + func - if not func.endswith("]"): - func = func + "]" - decoded_output = ast_parse(func, language) - return decoded_output - - def decode_execute(self, result): - func = result - func = func.replace("\n", "") # remove new 
line characters - if not func.startswith("["): - func = "[" + func - if not func.endswith("]"): - func = func + "]" - decode_output = ast_parse(func) - execution_list = [] - for function_call in decode_output: - for key, value in function_call.items(): - execution_list.append( - f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})" - ) - return execution_list diff --git a/berkeley-function-call-leaderboard/model_handler/model_style.py b/berkeley-function-call-leaderboard/model_handler/model_style.py deleted file mode 100644 index 81b8e30f1..000000000 --- a/berkeley-function-call-leaderboard/model_handler/model_style.py +++ /dev/null @@ -1,14 +0,0 @@ -from enum import Enum - - -class ModelStyle(Enum): - Gorilla = "gorilla" - OpenAI = "gpt" - Anthropic_FC = "claude" - Anthropic_Prompt = "claude" - Mistral = "mistral" - Google = "google" - FIREWORK_AI = "firework_ai" - NEXUS = "nexus" - OSSMODEL = "ossmodel" - COHERE = "cohere" diff --git a/berkeley-function-call-leaderboard/model_handler/oss_handler.py b/berkeley-function-call-leaderboard/model_handler/oss_handler.py deleted file mode 100644 index 206107878..000000000 --- a/berkeley-function-call-leaderboard/model_handler/oss_handler.py +++ /dev/null @@ -1,152 +0,0 @@ -import json -import os - -import ray -import shortuuid -import torch -from eval_checker.eval_checker_constant import FILENAME_INDEX_MAPPING -from model_handler.handler import BaseHandler -from model_handler.model_style import ModelStyle -from model_handler.utils import ( - ast_parse, - augment_prompt_by_languge, - language_specific_pre_processing, -) - -class OSSHandler(BaseHandler): - def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None: - super().__init__(model_name, temperature, top_p, max_tokens) - self.model_style = ModelStyle.OSSMODEL - self._init_model() - - def _init_model(self): - ray.init(ignore_reinit_error=True, num_cpus=8) - - def _format_prompt(prompt, function, test_category): - SYSTEM_PROMPT = """ - You are an helpful assistant who has access to the following functions to help the user, you can use the functions if needed- - """ - functions = "" - if isinstance(function, list): - for idx, func in enumerate(function): - functions += "\n" + str(func) - else: - functions += "\n" + str(function) - return f"SYSTEM: {SYSTEM_PROMPT}\n{functions}\nUSER: {prompt}\nASSISTANT: " - - @ray.remote(num_gpus=1) - @torch.inference_mode() - def _batch_generate( - question_jsons, - test_category, - model_path, - temperature, - max_tokens, - top_p, - format_prompt_func, - index, - ): - from vllm import LLM, SamplingParams - - prompts = [] - ans_jsons = [] - for line in question_jsons: - for key, value in FILENAME_INDEX_MAPPING.items(): - start, end = value - if index >= start and index < end: - test_category = key - break - ques_json = line - prompt = augment_prompt_by_languge(ques_json["question"], test_category) - functions = language_specific_pre_processing( - ques_json["function"], test_category, False - ) - prompts.append(format_prompt_func(prompt, functions, test_category)) - ans_id = shortuuid.uuid() - ans_jsons.append( - { - "answer_id": ans_id, - "question": ques_json["question"], - } - ) - - print("start generating: ", len(prompts)) - sampling_params = SamplingParams( - temperature=temperature, max_tokens=max_tokens, top_p=top_p - ) - llm = LLM(model=model_path, dtype="float16", trust_remote_code=True) - outputs = llm.generate(prompts, sampling_params) - final_ans_jsons = [] - for output, ans_json in zip(outputs, ans_jsons): - 
text = output.outputs[0].text - ans_json["text"] = text - final_ans_jsons.append(ans_json) - return final_ans_jsons - - def inference( - self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt - ): - - ques_jsons = [] - with open(question_file, "r") as ques_file: - for line in ques_file: - ques_jsons.append(json.loads(line)) - - chunk_size = len(ques_jsons) // num_gpus - ans_handles = [] - for i in range(0, len(ques_jsons), chunk_size): - ans_handles.append( - self._batch_generate.remote( - ques_jsons[i : i + chunk_size], - test_category, - self.model_name, - self.temperature, - self.max_tokens, - self.top_p, - format_prompt_func, - i, - ) - ) - ans_jsons = [] - for ans_handle in ans_handles: - ans_jsons.extend(ray.get(ans_handle)) - - return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0} - - def decode_ast(self, result, language="Python"): - func = result - if " " == func[0]: - func = func[1:] - if not func.startswith("["): - func = "[" + func - if not func.endswith("]"): - func = func + "]" - decode_output = ast_parse(func, language) - return decode_output - - def decode_execute(self, result): - return result - - def write(self, result, file_to_open): - if not os.path.exists("./result"): - os.mkdir("./result") - if not os.path.exists("./result/" + self.model_name.replace("/", "_")): - os.mkdir("./result/" + self.model_name.replace("/", "_")) - with open( - "./result/" + self.model_name.replace("/", "_") + "/" + file_to_open, "a+" - ) as f: - f.write(json.dumps(result) + "\n") - - def load_result(self, test_category): - eval_data = [] - with open("./eval_data_total.json") as f: - for line in f: - eval_data.append(json.loads(line)) - result_list = [] - idx = 0 - with open(f"./result/{self.model_name}/result.json") as f: - for line in f: - if eval_data[idx]["test_category"] == test_category: - result_list.append(json.loads(line)) - idx += 1 - return result_list From 4121e51c5cdebe71f29b5e9102c5afddf8d41b90 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sun, 7 Jul 2024 19:02:39 -0400 Subject: [PATCH 07/35] Move `eval_checker` to `bfcl/eval_checker` --- .../bfcl/eval_checker/__init__.py | 0 .../{ => bfcl}/eval_checker/checker.py | 0 .../eval_checker_constant.py => bfcl/eval_checker/constants.py} | 0 .../{ => bfcl}/eval_checker/custom_exception.py | 0 .../{ => bfcl}/eval_checker/eval_runner_helper.py | 2 +- .../{ => bfcl}/eval_checker/executable_python_function.py | 0 .../{ => bfcl}/eval_checker/java_type_converter.py | 0 .../eval_checker/javascript_type_converter.py} | 0 .../{eval_checker/eval_runner.py => bfcl/evaluate.py} | 0 9 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/__init__.py rename berkeley-function-call-leaderboard/{ => bfcl}/eval_checker/checker.py (100%) rename berkeley-function-call-leaderboard/{eval_checker/eval_checker_constant.py => bfcl/eval_checker/constants.py} (100%) rename berkeley-function-call-leaderboard/{ => bfcl}/eval_checker/custom_exception.py (100%) rename berkeley-function-call-leaderboard/{ => bfcl}/eval_checker/eval_runner_helper.py (99%) rename berkeley-function-call-leaderboard/{ => bfcl}/eval_checker/executable_python_function.py (100%) rename berkeley-function-call-leaderboard/{ => bfcl}/eval_checker/java_type_converter.py (100%) rename berkeley-function-call-leaderboard/{eval_checker/js_type_converter.py => bfcl/eval_checker/javascript_type_converter.py} (100%) rename berkeley-function-call-leaderboard/{eval_checker/eval_runner.py => 
bfcl/evaluate.py} (100%) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/__init__.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/berkeley-function-call-leaderboard/eval_checker/checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/checker.py rename to berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py diff --git a/berkeley-function-call-leaderboard/eval_checker/eval_checker_constant.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/constants.py similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/eval_checker_constant.py rename to berkeley-function-call-leaderboard/bfcl/eval_checker/constants.py diff --git a/berkeley-function-call-leaderboard/eval_checker/custom_exception.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/custom_exception.py rename to berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py diff --git a/berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py similarity index 99% rename from berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py rename to berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index a97e2ca55..be44faf66 100644 --- a/berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -481,7 +481,7 @@ "nvidia/nemotron-4-340b-instruct", ] -# Price got from AZure, 22.032 per hour for 8 V100, Pay As You Go Total Price +# Price got from Azure, 22.032 per hour for 8 V100, Pay As You Go Total Price # Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/ V100_x8_PRICE_PER_HOUR = 22.032 diff --git a/berkeley-function-call-leaderboard/eval_checker/executable_python_function.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_python_function.py similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/executable_python_function.py rename to berkeley-function-call-leaderboard/bfcl/eval_checker/executable_python_function.py diff --git a/berkeley-function-call-leaderboard/eval_checker/java_type_converter.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/java_type_converter.py similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/java_type_converter.py rename to berkeley-function-call-leaderboard/bfcl/eval_checker/java_type_converter.py diff --git a/berkeley-function-call-leaderboard/eval_checker/js_type_converter.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/javascript_type_converter.py similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/js_type_converter.py rename to berkeley-function-call-leaderboard/bfcl/eval_checker/javascript_type_converter.py diff --git a/berkeley-function-call-leaderboard/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/evaluate.py similarity index 100% rename from berkeley-function-call-leaderboard/eval_checker/eval_runner.py rename to berkeley-function-call-leaderboard/bfcl/evaluate.py From 12bdeed8c0a948923b689d0eabc5786783d6906e Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sun, 7 
Jul 2024 19:04:01 -0400 Subject: [PATCH 08/35] Add benchmark module --- .../bfcl/__init__.py | 0 .../bfcl/benchmark.py | 103 ++++++++++++++++ .../bfcl/types.py | 112 ++++++++++++++++++ 3 files changed, 215 insertions(+) create mode 100644 berkeley-function-call-leaderboard/bfcl/__init__.py create mode 100644 berkeley-function-call-leaderboard/bfcl/benchmark.py create mode 100644 berkeley-function-call-leaderboard/bfcl/types.py diff --git a/berkeley-function-call-leaderboard/bfcl/__init__.py b/berkeley-function-call-leaderboard/bfcl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/berkeley-function-call-leaderboard/bfcl/benchmark.py b/berkeley-function-call-leaderboard/bfcl/benchmark.py new file mode 100644 index 000000000..9a9278f42 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/benchmark.py @@ -0,0 +1,103 @@ +import os +import argparse + +from dotenv import load_dotenv + +from bfcl.model_handler.base import BaseHandler, ModelStyle +from bfcl.types import (LeaderboardCategory, LeaderboardCategories, + LeaderboardVersion, ModelType) + +load_dotenv() + + +def main() -> None: + args = get_args() + if os.getenv('USE_COHERE_OPTIMIZATION') and 'command-r-plus' in args.model: + args.model += '-optimized' + + test_categories = _get_test_categories(args) + model_handler = _get_model_handler(args) + test_inputs = test_categories.load_data() + if model_handler.model_style == ModelStyle.OSS_MODEL: + result = model_handler.inference( + inputs=test_inputs, + test_category=test_categories, + num_gpus=args.num_gpus, + ) + for res in result[0]: + model_handler.write(res, "result.json") + + +def get_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument( + '--model', + type=str, + default='gorilla-openfunctions-v2', + help="Name of the LLM. (default: 'gorilla-openfunctions-v2')" + ) + parser.add_argument( + '--model-type', + type=ModelType, + choices=[category.value for category in ModelType], + default=ModelType.PROPRIETARY.value, + help="Model type: Open-source or Proprietary (default: 'proprietary')" + ) + parser.add_argument( + '--test-category', + type=str, + default=LeaderboardCategory.ALL.value, + help=( + 'Comma-separated list of test categories ' + f"({','.join(category.value for category in LeaderboardCategory)}). " + "(default: 'all')" + ) + ) + parser.add_argument( + '--version', + type=LeaderboardVersion, + default=LeaderboardVersion.V1.value, + choices=[category.value for category in LeaderboardVersion], + help="Leaderboard version. (default: 'v1')", + ) + parser.add_argument('--temperature', type=float, default=0.7, help='Temperature (default: 0.7)') + parser.add_argument('--top-p', type=float, default=1, help='Top-p (default: 1)') + parser.add_argument('--max-tokens', type=int, default=1000, help='Max tokens (default: 1000)') + parser.add_argument('--num-gpus', default=1, type=int, help='No. 
of GPUs (default: 1)') + parser.add_argument('--timeout', default=60, type=int, help='Timeout (default: 60)') + args = parser.parse_args() + return args + + +def _get_test_categories(args) -> LeaderboardCategories: + if args.test_category == LeaderboardCategory.ALL.value: + categories = [category for category in LeaderboardCategory if category != LeaderboardCategory.ALL] + else: + categories = [] + for value in args.test_category.split(','): + if value not in LeaderboardCategory._value2member_map_: + raise ValueError(f'Invalid test category: "{value}"!') + categories.append(LeaderboardCategory(value)) + return LeaderboardCategories(categories=categories, version=args.version) + + +def _get_model_handler(args) -> BaseHandler: + if args.model_type == ModelType.OSS: + from bfcl.model_handler.oss_model import MODEL_TO_HANDLER_CLS + elif args.model_type == ModelType.PROPRIETARY: + from bfcl.model_handler.proprietary_model import MODEL_TO_HANDLER_CLS + + assert (handler_cls := MODEL_TO_HANDLER_CLS.get(args.model_name)), \ + f'Invalid model name! Please select a {args.model_type.value} model from {tuple(MODEL_TO_HANDLER_CLS)}' + + return handler_cls( + model_name=args.model, + temperature=args.temperature, + top_p=args.top_p, + max_tokens=args.max_tokens, + ) + + +if __name__ == '__main__': + + main() \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/types.py b/berkeley-function-call-leaderboard/bfcl/types.py new file mode 100644 index 000000000..97d3dd7d4 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/types.py @@ -0,0 +1,112 @@ +import json +import hashlib +from enum import Enum +from typing import Any, List, Dict +from pathlib import Path + +from pydantic import BaseModel +from huggingface_hub import hf_hub_download + + +class ModelType(str, Enum): + OSS = 'oss' + PROPRIETARY = 'proprietary' + + +class LeaderboardExecutableCategory(str, Enum): + EXEC_SIMPLE = 'executable_simple' + EXEC_PARALLEL_FUNCTION = 'executable_parallel_function' + EXEC_MULTIPLE_FUNCTION = 'executable_multiple_function' + EXEC_PARALLEL_MULTIPLE_FUNCTION = 'executable_parallel_multiple_function' + REST = 'rest' + + +class LeaderboardAstCategory(str, Enum): + SIMPLE = 'simple' + RELEVANCE = 'relevance' + PARALLEL_FUNCTION = 'parallel_function' + MULTIPLE_FUNCTION = 'multiple_function' + PARALLEL_MULTIPLE_FUNCTION = 'parallel_multiple_function' + JAVA = 'java' + JAVASCRIPT = 'javascript' + + +class LeaderboardCategory(str, Enum): + EXEC_SIMPLE = LeaderboardExecutableCategory.EXEC_SIMPLE.value + EXEC_PARALLEL_FUNCTION = LeaderboardExecutableCategory.EXEC_PARALLEL_FUNCTION.value + EXEC_MULTIPLE_FUNCTION = LeaderboardExecutableCategory.EXEC_MULTIPLE_FUNCTION.value + EXEC_PARALLEL_MULTIPLE_FUNCTION = LeaderboardExecutableCategory.EXEC_PARALLEL_MULTIPLE_FUNCTION.value + REST = LeaderboardExecutableCategory.REST.value + SIMPLE = LeaderboardAstCategory.SIMPLE.value + RELEVANCE = LeaderboardAstCategory.RELEVANCE.value + PARALLEL_FUNCTION = LeaderboardAstCategory.PARALLEL_FUNCTION.value + MULTIPLE_FUNCTION = LeaderboardAstCategory.MULTIPLE_FUNCTION.value + PARALLEL_MULTIPLE_FUNCTION = LeaderboardAstCategory.PARALLEL_MULTIPLE_FUNCTION.value + JAVA = LeaderboardAstCategory.JAVA.value + JAVASCRIPT = LeaderboardAstCategory.JAVASCRIPT.value + SQL = 'sql' + CHATABLE = 'chatable' + ALL = 'all' # Adding the 'ALL' category + + +class LeaderboardVersion(str, Enum): + V1 = 'v1' + + +class LeaderboardCategories(BaseModel): + categories: List[LeaderboardCategory] + version: LeaderboardVersion = 
LeaderboardVersion.V1 + cache_dir: Path | str = '.cache' + + def model_post_init(self, __context: Any) -> None: + if LeaderboardCategory.ALL in self.categories: + self.categories = [cat for cat in LeaderboardCategory if cat != LeaderboardCategory.ALL] + self.cache_dir = Path.cwd() / self.cache_dir + + @property + def output_file_path(self) -> Path: + uid = self._generate_hash(self.model_dump_json()) + file_name = f'{uid}.jsonl' + return self.cache_dir / file_name + + def load_data(self) -> List[Dict]: + data = [] + if self.output_file_path.exists(): + print(f'Loading test data from "{self.output_file_path}" 🦍') + # Load cached data + with open(self.output_file_path, 'r') as file: + for line in file: + item = json.loads(line) + data.append(item) + else: + # Load data for each test category + for category, file_path in self._get_test_data(): + with open(file_path, 'r') as file: + for line in file: + item = json.loads(line) + item['test_category'] = category.value + item['id'] = self._generate_hash(json.dumps(item)) + data.append(item) + + # Save data + with open(self.output_file_path, 'w') as file: + for item in data: + file.write(item + '\n') + print(f'Test data successfully saved at "{self.output_file_path}" 🦍') + + return data + + def _get_test_data(self): + template = f'gorilla_openfunctions_{self.version.value}_test_{{}}.json' + for category in self.categories: + file_path = hf_hub_download( + repo_id='gorilla-llm/Berkeley-Function-Calling-Leaderboard', + filename=template.format(category.value), + repo_type='dataset', + cache_dir=self.cache_dir + ) + yield category, file_path + + def _generate_hash(self, input_str) -> str: + hash_object = hashlib.sha256(input_str.encode('utf-8')) + return hash_object.hexdigest() From 837c7677af55a5675eee2bb19dfc8e996f1a5caa Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sun, 7 Jul 2024 19:04:48 -0400 Subject: [PATCH 09/35] Remove `eval_data_compilation` - Test data compilation handled by `bfcl/types.py:LeaderboardCategories.load_data` method --- .../eval_data_compilation.py | 37 ------------------- 1 file changed, 37 deletions(-) delete mode 100644 berkeley-function-call-leaderboard/eval_data_compilation.py diff --git a/berkeley-function-call-leaderboard/eval_data_compilation.py b/berkeley-function-call-leaderboard/eval_data_compilation.py deleted file mode 100644 index 4338faac2..000000000 --- a/berkeley-function-call-leaderboard/eval_data_compilation.py +++ /dev/null @@ -1,37 +0,0 @@ -import json - -data = [] -""" - Compile evaluation data into a single file -""" - -test_files = [ - "executable_parallel_function", - "parallel_multiple_function", - "executable_simple", - "rest", - "sql", - "parallel_function", - "chatable", - "java", - "javascript", - "executable_multiple_function", - "simple", - "relevance", - "executable_parallel_multiple_function", - "multiple_function", -] - -for test_name in test_files: - with open(f"./data/gorilla_openfunctions_v1_test_{test_name}.json", "r") as file: - for line in file: - item = json.loads(line) - item["question_type"] = test_name - data.append(item) - -with open("./eval_data_total.json", "w") as file: - for item in data: - file.write(json.dumps(item)) - file.write("\n") - -print("Data successfully compiled into eval_data_total.json 🦍") From 1e0004f7960e2f28d7bc098a1d9533b8c72806cf Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 17:43:47 +0000 Subject: [PATCH 10/35] Remove `poetry.lock` - poetry build system is no longer used --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git 
a/.gitignore b/.gitignore index ae2f97ef7..e6c74d18b 100644 --- a/.gitignore +++ b/.gitignore @@ -55,5 +55,4 @@ berkeley-function-call-leaderboard/score/ .direnv/ .venv -poetry.lock .cache \ No newline at end of file From e52d5319b216447e3918940e222497f3d7c6fbef Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 17:49:46 +0000 Subject: [PATCH 11/35] Add hugging face hub token --- berkeley-function-call-leaderboard/.env.example | 3 +++ 1 file changed, 3 insertions(+) diff --git a/berkeley-function-call-leaderboard/.env.example b/berkeley-function-call-leaderboard/.env.example index 0105854d1..a940c6642 100644 --- a/berkeley-function-call-leaderboard/.env.example +++ b/berkeley-function-call-leaderboard/.env.example @@ -1,3 +1,6 @@ +# [OPTIONAL] Only required for downloading gated hugging face models +HUGGING_FACE_HUB_TOKEN= + # [OPTIONAL] Only required for respective proprietary model evaluation OPENAI_API_KEY=sk-XXXXXX MISTRAL_API_KEY= From f0833ed236063c4b5dfb85e4fa11e37a990edf5c Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 17:50:48 +0000 Subject: [PATCH 12/35] Update build system --- .../pyproject.toml | 73 ++++++++++--------- berkeley-function-call-leaderboard/setup.py | 4 + 2 files changed, 43 insertions(+), 34 deletions(-) create mode 100644 berkeley-function-call-leaderboard/setup.py diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index 48d16c087..f5bfa8720 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -1,42 +1,47 @@ -[tool.poetry] +[build-system] +requires = ["setuptools>=40.8.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] name = "bfcl" version = "0.1.0" description = "Berkeley Function Calling Leaderboard (BFCL)" -authors = ["NAME "] +authors = [ + {name="Shishir Patil", email="sgp@berkeley.edu"} +] readme = "README.md" -repository = "https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard" - -[tool.poetry.dependencies] -python = ">=3.9,<3.12" -requests = "*" -tqdm = "*" -numpy = "*" -shortuuid = "*" -huggingface_hub = "*" -pydantic = "^2.8.2" -python-dotenv = "^1.0.1" -tree-sitter = "^0.21.0" -datasets = "^2.19.2" -openai = "^1.35.10" -tree-sitter-java = "0.21.0" -tree-sitter-javascript = "0.21.4" -vllm = { version = "0.5.1", optional = true } -mistralai = { version = "^0.4.2", optional = true } -anthropic = { version = "^0.29.0", optional = true } -cohere = { version = "^5.2.5", optional = true } - -[tool.poetry.extras] -oss_eval = ["vllm"] -proprietary_eval = [ - "mistralai", - "anthropic", - "cohere" +requires-python = ">=3.9" +license = { "text" = "Apache 2.0" } +dependencies = [ + "requests", + "tqdm", + "numpy", + "huggingface_hub", + "pydantic>=2.8.2", + "python-dotenv>=1.0.1", + "tree-sitter~=0.21.0", + "tree-sitter-java==0.21.0", + "tree-sitter-javascript==0.21.4", + "openai>=1.35.10", ] -[tool.poetry.scripts] +[tool.setuptools.packages.find] +include = ["bfcl*"] + +[project.scripts] bfcl_benchmark = "bfcl.benchmark:main" -bfcl_eval = "bfcl.evaluate:main" -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" +[project.urls] +Repository = "https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard" + +[project.optional-dependencies] +oss_eval = ["vllm==0.5.1"] +proprietary_eval = [ + "mistralai==0.4.2", + "anthropic==0.29.0", + "cohere==5.2.5", +] +all = [ + "bfcl[oss_eval]", + 
"bfcl[proprietary_eval]", +] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/setup.py b/berkeley-function-call-leaderboard/setup.py new file mode 100644 index 000000000..e81bcd1c6 --- /dev/null +++ b/berkeley-function-call-leaderboard/setup.py @@ -0,0 +1,4 @@ +import setuptools + +# This is to make sure that the package supports editable installs +setuptools.setup() \ No newline at end of file From 1e8da5af61ab9418b31b11a3ddbacceee17f3576 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 17:53:16 +0000 Subject: [PATCH 13/35] Move `functionary` from `oss_model` to `proprietary_model` - To allow for separate dependencies for oss and proprietary model --- .../bfcl/model_handler/oss_model/__init__.py | 5 +---- .../bfcl/model_handler/proprietary_model/__init__.py | 5 +++-- .../{oss_model => proprietary_model}/functionary.py | 0 3 files changed, 4 insertions(+), 6 deletions(-) rename berkeley-function-call-leaderboard/bfcl/model_handler/{oss_model => proprietary_model}/functionary.py (100%) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py index 322eef851..9c73992ba 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/__init__.py @@ -1,5 +1,4 @@ from .deepseek import DeepseekHandler -from .functionary import FunctionaryHandler from .gemma import GemmaHandler from .glaive import GlaiveHandler from .hermes import HermesHandler @@ -7,7 +6,6 @@ __all__ = [ 'DeepseekHandler', - 'FunctionaryHandler', 'GemmaHandler', 'GlaiveHandler', 'HermesHandler', @@ -16,7 +14,6 @@ MODEL_TO_HANDLER_CLS = {} for handler_name in __all__: - module = globals()[handler_name] - handler_class = getattr(module, handler_name) + handler_class = globals()[handler_name] for model in handler_class.supported_models(): MODEL_TO_HANDLER_CLS[model] = handler_class \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py index 70f7cf387..cca90711e 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/__init__.py @@ -2,6 +2,7 @@ from .cohere import CohereHandler from .databricks import DatabricksHandler from .firework_ai import FireworkAIHandler +from .functionary import FunctionaryHandler from .gemini import GeminiHandler from .gorilla import GorillaHandler from .mistral import MistralHandler @@ -16,6 +17,7 @@ 'CohereHandler', 'DatabricksHandler', 'FireworkAIHandler', + 'FunctionaryHandler', 'GeminiHandler', 'GorillaHandler', 'MistralHandler', @@ -27,7 +29,6 @@ MODEL_TO_HANDLER_CLS = {} for handler_name in __all__: - module = globals()[handler_name] - handler_class = getattr(module, handler_name) + handler_class = globals()[handler_name] for model in handler_class.supported_models(): MODEL_TO_HANDLER_CLS[model] = handler_class \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/functionary.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/functionary.py similarity index 100% rename from berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/functionary.py rename to 
berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/functionary.py From 34a170a421b4608d6a93a050d7f7f09c8518e0bc Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 17:56:47 +0000 Subject: [PATCH 14/35] Fix type error --- berkeley-function-call-leaderboard/bfcl/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/berkeley-function-call-leaderboard/bfcl/types.py b/berkeley-function-call-leaderboard/bfcl/types.py index 97d3dd7d4..35de37945 100644 --- a/berkeley-function-call-leaderboard/bfcl/types.py +++ b/berkeley-function-call-leaderboard/bfcl/types.py @@ -91,7 +91,7 @@ def load_data(self) -> List[Dict]: # Save data with open(self.output_file_path, 'w') as file: for item in data: - file.write(item + '\n') + file.write(json.dumps(item) + '\n') print(f'Test data successfully saved at "{self.output_file_path}" 🦍') return data From f7365216e92393ee234cc485195f7b5c969dfb9c Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 17:57:24 +0000 Subject: [PATCH 15/35] Remove test category - test category is already added to each example during loading the data --- .../bfcl/benchmark.py | 16 +++++++--------- .../bfcl/model_handler/oss_model/base.py | 6 ++---- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/benchmark.py b/berkeley-function-call-leaderboard/bfcl/benchmark.py index 9a9278f42..d0fd2d1d0 100644 --- a/berkeley-function-call-leaderboard/bfcl/benchmark.py +++ b/berkeley-function-call-leaderboard/bfcl/benchmark.py @@ -19,13 +19,11 @@ def main() -> None: model_handler = _get_model_handler(args) test_inputs = test_categories.load_data() if model_handler.model_style == ModelStyle.OSS_MODEL: - result = model_handler.inference( - inputs=test_inputs, - test_category=test_categories, - num_gpus=args.num_gpus, - ) - for res in result[0]: - model_handler.write(res, "result.json") + responses = model_handler.inference(inputs=test_inputs, num_gpus=args.num_gpus) + file_name = test_categories.output_file_path.name.replace('.jsonl', '_result.jsonl') + model_handler.write(responses, file_name) + else: + raise NotImplementedError() def get_args() -> argparse.Namespace: @@ -87,8 +85,8 @@ def _get_model_handler(args) -> BaseHandler: elif args.model_type == ModelType.PROPRIETARY: from bfcl.model_handler.proprietary_model import MODEL_TO_HANDLER_CLS - assert (handler_cls := MODEL_TO_HANDLER_CLS.get(args.model_name)), \ - f'Invalid model name! Please select a {args.model_type.value} model from {tuple(MODEL_TO_HANDLER_CLS)}' + assert (handler_cls := MODEL_TO_HANDLER_CLS.get(args.model)), \ + f'Invalid model name "{args.model}"! 
Please select a {args.model_type.value} model from {tuple(MODEL_TO_HANDLER_CLS)}' return handler_cls( model_name=args.model, diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py index 63703163f..e9e479161 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py @@ -38,14 +38,13 @@ def get_prompt(self, user_input, functions, test_category) -> str: user_input=user_input, ) - def inference(self, inputs, test_category, num_gpus): + def inference(self, inputs, num_gpus): chunk_size = len(inputs) // num_gpus futures = [] for i in range(0, len(inputs), chunk_size): futures.append( self._batch_generate.remote( inputs[i: i + chunk_size], - test_category, self.model_name, self.sampling_params, get_prompt_func=self.get_prompt, @@ -79,9 +78,8 @@ def _batch_generate( get_prompt_func ): prompts = [] - for line in inputs: + for _input in inputs: test_category = _input["test_category"] - _input = line prompt = utils.augment_prompt_by_languge(_input["question"], test_category) functions = utils.language_specific_pre_processing(_input["function"], test_category, False) prompts.append(get_prompt_func(prompt, functions, test_category)) From 893c9afeb920a60383815dfe0854d7ae020014f5 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 16:24:30 -0400 Subject: [PATCH 16/35] Make `eval_checker` consistent with `main` branch by merging (#496) --- .../bfcl/eval_checker/checker.py | 29 +- .../bfcl/eval_checker/custom_exception.py | 8 +- .../bfcl/eval_checker/eval_runner.py | 535 ++++++++++++++++++ .../bfcl/eval_checker/eval_runner_helper.py | 52 +- .../bfcl/evaluate.py | 518 ----------------- 5 files changed, 607 insertions(+), 535 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py index 7a64bc3bf..ee9562145 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py @@ -1,5 +1,3 @@ -from js_type_converter import js_type_converter -from java_type_converter import java_type_converter from model_handler.constant import ( UNDERSCORE_TO_DOT, JAVA_TYPE_CONVERSION, @@ -12,6 +10,11 @@ import time import json +# We switch to conditional import for the following two imports to avoid unnecessary installations. +# User doesn't need to setup the tree-sitter packages if they are not running the test for that language. +# from js_type_converter import js_type_converter +# from java_type_converter import java_type_converter + PYTHON_TYPE_MAPPING = { "string": str, "integer": int, @@ -362,9 +365,19 @@ def simple_function_checker( nested_type_converted = None if language == "Java": + from java_type_converter import java_type_converter + expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description] if expected_type_description in JAVA_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." 
+ ) + result["error_type"] = "type_error:java" + return result + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: nested_type = param_details[param]["items"]["type"] nested_type_converted = JAVA_TYPE_CONVERSION[nested_type] @@ -375,9 +388,19 @@ def simple_function_checker( value = java_type_converter(value, expected_type_description) elif language == "JavaScript": + from js_type_converter import js_type_converter + expected_type_converted = JS_TYPE_CONVERSION[expected_type_description] if expected_type_description in JS_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:js" + return result + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: nested_type = param_details[param]["items"]["type"] nested_type_converted = JS_TYPE_CONVERSION[nested_type] @@ -945,4 +968,4 @@ def exec_checker(decoded_result: list, func_description: dict, test_category: st func_description["execution_result"][0], func_description["execution_result_type"][0], False, - ) + ) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py index e30fe81c5..3504862d8 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py @@ -1,10 +1,10 @@ class NoAPIKeyError(Exception): def __init__(self): - self.message = "Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." + self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." super().__init__(self.message) class BadAPIStatusError(Exception): - def __init__(self, message): - self.message = message - super().__init__(self.message) \ No newline at end of file + def __init__(self, errors, error_rate): + self.errors = errors + self.error_rate = error_rate \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py new file mode 100644 index 000000000..dd45c5dd4 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -0,0 +1,535 @@ +import sys + +sys.path.append("../") + +from checker import ast_checker, exec_checker, executable_checker_rest +from custom_exception import BadAPIStatusError +from eval_runner_helper import * +from tqdm import tqdm +import argparse + + +# NOTE: This file should be run in the `eval_checker` directory + + +def single_executable_file_runner( + handler, model_result, prompt, model_name, test_category +): + assert len(model_result) == len(prompt) + + result = [] + correct_count = 0 + for i in tqdm(range(len(model_result)), desc="Running tests"): + raw_result = model_result[i]["result"] + try: + decoded_result = handler.decode_execute(raw_result) + except Exception as e: + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [f"Failed to decode executable. 
{str(e)}"], + "error_type": "executable_decoder:decoder_failed", + "prompt": prompt[i], + "model_result_raw": raw_result, + } + ) + continue + + if "rest" in test_category: + # REST is always single-functioned. Therefore we take the first one and pass it to the REST checker. + if not is_rest_format_output(decoded_result): + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [ + "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." + ], + "error_type": "executable_decoder:rest_wrong_output_format", + "prompt": prompt[i], + "model_result_raw": str(raw_result), + "model_result_decoded": str(decoded_result), + } + ) + continue + + checker_result = executable_checker_rest(decoded_result[0], i) + + else: + if not is_executable_format_output(decoded_result): + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [ + "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." + ], + "error_type": "executable_decoder:wrong_output_format", + "prompt": prompt[i], + "model_result_raw": str(raw_result), + "model_result_decoded": str(decoded_result), + } + ) + continue + + prompt_item = prompt[i] + checker_result = exec_checker(decoded_result, prompt_item, test_category) + + if checker_result["valid"]: + correct_count += 1 + else: + temp = {} + temp["id"] = i + 1 + temp["model_name"] = model_name + temp["test_category"] = test_category + temp["valid"] = checker_result["valid"] + temp["error"] = checker_result["error"] + temp["error_type"] = checker_result["error_type"] + temp["prompt"] = prompt[i] + temp["model_result_raw"] = raw_result + temp["model_result_decoded"] = decoded_result + if "model_executed_output" in checker_result: + temp["model_executed_output"] = checker_result["model_executed_output"] + result.append(temp) + + accuracy = correct_count / len(model_result) + result.insert( + 0, + { + "accuracy": accuracy, + "correct_count": correct_count, + "total_count": len(model_result), + }, + ) + output_file_name = test_category + "_score.json" + output_file_dir = os.path.join(OUTPUT_PATH, model_name) + write_list_of_dicts_to_file(output_file_name, result, output_file_dir) + + return accuracy, len(model_result) + + +def single_relevance_file_runner(handler, model_result, model_name, test_category): + + result = [] + correct_count = 0 + for i in range(len(model_result)): + model_result_item = model_result[i]["result"] + success = False + decoded_result = None + + try: + decoded_result = handler.decode_ast(model_result_item, language="Python") + success = False + if is_empty_output(decoded_result): + success = True + + except Exception as e: + success = True + + if success: + correct_count += 1 + else: + temp = {} + temp["id"] = i + 1 + temp["model_name"] = model_name + temp["test_category"] = test_category + temp["valid"] = success + temp["error"] = [ + f"Valid syntax. Successfully decode AST when it should not." 
+ ] + temp["error_type"] = "relevance_error:decoder_success" + temp["model_result"] = model_result_item + temp["decoded_result"] = decoded_result + + result.append(temp) + + accuracy = correct_count / len(model_result) + result.insert( + 0, + { + "accuracy": accuracy, + "correct_count": correct_count, + "total_count": len(model_result), + }, + ) + output_file_name = test_category + "_score.json" + output_file_dir = os.path.join(OUTPUT_PATH, model_name) + write_list_of_dicts_to_file(output_file_name, result, output_file_dir) + + return accuracy, len(model_result) + + +def single_ast_file_runner( + handler, model_result, prompt, possible_answer, language, test_category, model_name +): + assert ( + len(model_result) == len(prompt) == len(possible_answer) + ), "The length of the model result does not match the length of the prompt or possible answer. Please check the input files for completeness." + + result = [] + correct_count = 0 + for i in range(len(model_result)): + model_result_item = model_result[i]["result"] + prompt_item = prompt[i]["function"] + possible_answer_item = possible_answer[i] + + try: + model_result_item_raw = model_result_item + model_result_item = handler.decode_ast(model_result_item, language) + except Exception as e: + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [f"Invalid syntax. Failed to decode AST. {str(e)}"], + "error_type": "ast_decoder:decoder_failed", + "prompt": prompt[i], + "model_result_raw": model_result_item_raw, + "possible_answer": possible_answer_item, + } + ) + continue + + decoder_output_valid = is_function_calling_format_output(model_result_item) + if not decoder_output_valid: + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [ + "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." + ], + "error_type": "ast_decoder:decoder_wrong_output_format", + "prompt": prompt[i], + "model_result_raw": str(model_result_item_raw), + "model_result_decoded": str(model_result_item), + "possible_answer": possible_answer_item, + } + ) + continue + + checker_result = ast_checker( + prompt_item, + model_result_item, + possible_answer_item, + language, + test_category, + model_name, + ) + + if checker_result["valid"]: + correct_count += 1 + else: + temp = {} + temp["id"] = i + 1 + temp["model_name"] = model_name + temp["test_category"] = test_category + temp["valid"] = checker_result["valid"] + temp["error"] = checker_result["error"] + temp["error_type"] = checker_result["error_type"] + temp["prompt"] = prompt[i] + temp["model_result_raw"] = model_result_item_raw + temp["model_result_decoded"] = model_result_item + temp["possible_answer"] = possible_answer_item + result.append(temp) + + accuracy = correct_count / len(model_result) + result.insert( + 0, + { + "accuracy": accuracy, + "correct_count": correct_count, + "total_count": len(model_result), + }, + ) + output_file_name = test_category + "_score.json" + output_file_dir = os.path.join(OUTPUT_PATH, model_name) + write_list_of_dicts_to_file(output_file_name, result, output_file_dir) + + return accuracy, len(model_result) + + +#### Main runner function #### +def runner(model_names, test_categories, api_sanity_check): + + # A flag to indicate if the API has been tested. + # We should always test the API with ground truth first before running the executable tests. 
+ # Sometimes the API may not be working as expected and we want to catch that before running the evaluation to ensure the results are accurate. + API_TESTED = False + API_STATUS_ERROR_REST = None + API_STATUS_ERROR_EXECUTABLE = None + + # Before running the executable evaluation, we need to get the expected output from the ground truth. + # So we need a list of all the test categories that we have ran the ground truth evaluation on. + # We only get the expected output once for each test category. + EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] + + # Get a list of all entries in the folder + entries = os.scandir(INPUT_PATH) + + # Filter out the subdirectories + subdirs = [entry.path for entry in entries if entry.is_dir()] + + # Traverse each subdirectory + for subdir in subdirs: + + model_name = subdir.split(INPUT_PATH)[1] + if model_names is not None and model_name not in model_names: + continue + + model_name_escaped = model_name.replace("_", "/") + + files = [ + f + for f in os.listdir(subdir) + if os.path.isfile(os.path.join(subdir, f)) and not f.startswith(".") + ] + # Check if there is only one file and that file is 'result.json' + # If so, this is an OSS model result file and we need to special process it first + if len(files) == 1 and files[0] == "result.json": + result_json_file_path = os.path.join(subdir, "result.json") + oss_file_formatter(result_json_file_path, subdir) + print( + f"Detected OSS model: {model_name}. result.json has been split into individual test category files." + ) + + # Pattern to match JSON files in this subdirectory + json_files_pattern = os.path.join(subdir, "*.json") + + print(f"🦍 Model: {model_name}") + + # Find and process all JSON files in the subdirectory + for model_result_json in glob.glob(json_files_pattern): + + if os.path.basename(model_result_json) == "result.json": + continue + + test_category = extract_after_test(model_result_json) + if test_categories is not None and test_category not in test_categories: + continue + + handler = get_handler(model_name_escaped) + + # We don't evaluate chatable and SQL models in our current leaderboard + if is_chatable(test_category) or is_sql(test_category): + continue + + language = "Python" + if is_java(test_category): + language = "Java" + if is_js(test_category): + language = "JavaScript" + + print(f"🔍 Running test: {test_category}") + + model_result = load_file(model_result_json) + record_cost_latency(LEADERBOARD_TABLE, model_name, model_result) + + if is_relevance(test_category): + accuracy, total_count = single_relevance_file_runner( + handler, model_result, model_name, test_category + ) + record_result( + LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count + ) + print(f"✅ Test completed: {test_category}. 
🎯 Accuracy: {accuracy}") + continue + + # Find the corresponding test file + prompt_file = find_file_with_suffix(PROMPT_PATH, test_category) + prompt = load_file(prompt_file) + + if is_executable(test_category): + # We only test the API with ground truth once + if not API_TESTED and api_sanity_check: + print("---- Sanity checking API status ----") + try: + api_status_sanity_check_rest() + except BadAPIStatusError as e: + API_STATUS_ERROR_REST = e + + try: + api_status_sanity_check_executable() + except BadAPIStatusError as e: + API_STATUS_ERROR_EXECUTABLE = e + + display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=True) + print("Continuing evaluation...") + + API_TESTED = True + + if ( + test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN + and not is_rest(test_category) + ): + print( + f"---- Getting real-time execution result from ground truth for {test_category} ----" + ) + get_executable_expected_output(prompt_file) + print( + f"---- Ground truth real-time execution result obtained for {test_category} 🌟 ----" + ) + EXECUTABLE_TEST_CATEGORIES_HAVE_RUN.append(test_category) + # Need to re-load the prompt file after getting the expected output, as the prompt file has been updated + prompt = load_file(prompt_file) + + accuracy, total_count = single_executable_file_runner( + handler, model_result, prompt, model_name, test_category + ) + record_result( + LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count + ) + print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") + + continue + + # Find the corresponding possible answer file + possible_answer_file = find_file_with_suffix( + POSSIBLE_ANSWER_PATH, test_category + ) + possible_answer = load_file(possible_answer_file) + accuracy, total_count = single_ast_file_runner( + handler, + model_result, + prompt, + possible_answer, + language, + test_category, + model_name, + ) + record_result( + LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count + ) + print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") + + # This function reads all the score files from local folder and updates the leaderboard table. + # This is helpful when you only want to run the evaluation for a subset of models and test categories. + update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) + # Write the leaderboard table to a file + generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH) + + # Clean up the executable expected output files + # They should be re-generated the next time the evaluation is run + clean_up_executable_expected_output( + PROMPT_PATH, EXECUTABLE_TEST_CATEGORIES_HAVE_RUN + ) + + display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=False) + + print(f"🏁 Evaluation completed. 
See {os.path.abspath(OUTPUT_PATH + 'data.csv')} for evaluation results.") + + +ARG_PARSE_MAPPING = { + "ast": [ + "simple", + "multiple_function", + "parallel_function", + "parallel_multiple_function", + "java", + "javascript", + "relevance", + ], + "executable": [ + "executable_simple", + "executable_multiple_function", + "executable_parallel_function", + "executable_parallel_multiple_function", + "rest", + ], + "all": [ + "simple", + "multiple_function", + "parallel_function", + "parallel_multiple_function", + "java", + "javascript", + "relevance", + "executable_simple", + "executable_multiple_function", + "executable_parallel_function", + "executable_parallel_multiple_function", + "rest", + ], + "non-python": [ + "java", + "javascript", + ], + "python": [ + "simple", + "multiple_function", + "parallel_function", + "parallel_multiple_function", + "relevance", + "executable_simple", + "executable_multiple_function", + "executable_parallel_function", + "executable_parallel_multiple_function", + "rest", + ], +} + + +INPUT_PATH = "../result/" +PROMPT_PATH = "../data/" +POSSIBLE_ANSWER_PATH = "../data/possible_answer/" +OUTPUT_PATH = "../score/" + +# A dictionary to store the results +# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count +LEADERBOARD_TABLE = {} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process two lists of strings.") + + # Add arguments for two lists of strings + parser.add_argument( + "--model", nargs="+", type=str, help="A list of model names to evaluate" + ) + parser.add_argument( + "--test-category", + nargs="+", + type=str, + help="A list of test categories to run the evaluation on", + ) + parser.add_argument( + "-c", + "--api-sanity-check", + action="store_true", + default=False, # Default value is False, meaning the sanity check is skipped unless the flag is specified + help="Perform the REST API status sanity check before running the evaluation. By default, the sanity check is skipped.", + ) + + args = parser.parse_args() + + api_sanity_check = args.api_sanity_check + test_categories = None + if args.test_category is not None: + test_categories = [] + for test_category in args.test_category: + if test_category in ARG_PARSE_MAPPING: + test_categories.extend(ARG_PARSE_MAPPING[test_category]) + else: + test_categories.append(test_category) + + model_names = args.model + if args.model is not None: + model_names = [] + for model_name in args.model: + # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. + # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). + # We patch it here to avoid confusing the user. 
+ model_names.append(model_name.replace("/", "_")) + + runner(model_names, test_categories, api_sanity_check) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index be44faf66..83e1e8917 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -372,6 +372,12 @@ "https://huggingface.co/nvidia/nemotron-4-340b-instruct", "NVIDIA", "nvidia-open-model-license" + ], + "THUDM/glm-4-9b-chat": [ + "GLM-4-9b-Chat (FC)", + "https://huggingface.co/THUDM/glm-4-9b-chat", + "THUDM", + "glm-4" ] } @@ -467,6 +473,7 @@ "meta-llama/Meta-Llama-3-8B-Instruct": 73, "meta-llama/Meta-Llama-3-70B-Instruct": 307, "gorilla-openfunctions-v2": 83, + "THUDM/glm-4-9b-chat": 223 } @@ -479,9 +486,10 @@ "meetkai/functionary-small-v2.4-FC", "snowflake/arctic", "nvidia/nemotron-4-340b-instruct", + "THUDM/glm-4-9b-chat", ] -# Price got from Azure, 22.032 per hour for 8 V100, Pay As You Go Total Price +# Price got from AZure, 22.032 per hour for 8 V100, Pay As You Go Total Price # Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/ V100_x8_PRICE_PER_HOUR = 22.032 @@ -630,9 +638,7 @@ def api_status_sanity_check_rest(): errors.append((data, status)) if correct_count != len(ground_truth_replaced): - [print("Data:", data, "\nError:", status["error"]) for data, status in errors] - error_msg = f"API Status Test Failed for REST Section. {len(ground_truth_replaced) - correct_count} out of {len(ground_truth_replaced)} API behaviors are not as expected. Be careful with executable test category results; they may be inaccurate." - raise BadAPIStatusError(error_msg) + raise BadAPIStatusError(errors, f"{len(ground_truth_replaced) - correct_count} / {len(ground_truth_replaced)}") def api_status_sanity_check_executable(): @@ -656,11 +662,37 @@ def api_status_sanity_check_executable(): errors.append((data, status)) if correct_count != len(ground_truth): - [print("Data:", data, "\nError:", status["error"]) for data, status in errors] - error_msg = f"API Status Test Failed for Executable Section. {len(ground_truth) - correct_count} out of {len(ground_truth)} API behaviors are not as expected. Be careful with executable test category results; they may be inaccurate." - raise BadAPIStatusError(error_msg) - - + raise BadAPIStatusError(errors, f"{len(ground_truth) - correct_count} / {len(ground_truth)}") + + +def display_api_status_error(rest_error, executable_error, display_success=False): + if not rest_error and not executable_error: + if display_success: + print("🟢 All API Status Test Passed!") + return None + + RED_FONT = "\033[91m" + RESET = "\033[0m" + + print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n") + + if rest_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test category (REST). Please contact API provider.\n") + print(f"{rest_error.error_rate} APIs affected:\n") + for data, status in rest_error.errors: + print(f" - Test Case: {data['ground_truth']}") + print(f" Error Type: {status['error_type']}\n") + + if executable_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test categories (Non-REST). 
Please contact API provider.\n") + print(f"{executable_error.error_rate} APIs affected:\n") + for data, status in executable_error.errors: + print(f" - Test Case: {data['ground_truth'][0]}") + print(f" Error Type: {status['error_type']}\n") + + print(f"{RED_FONT}{'-' * 100}\n{RESET}") + + def get_executable_expected_output(prompt_file_path): # Before we run the evaluation, we need to add the "execution_result" field to the prompt file, using the ground truth data. prompt_content = load_file(prompt_file_path) @@ -995,4 +1027,4 @@ def collapse_json_objects(file_path): for obj in objects: json_obj = json.loads(obj) compact_json = json.dumps(json_obj, separators=(",", ":")) - out_file.write(compact_json + "\n") + out_file.write(compact_json + "\n") \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluate.py b/berkeley-function-call-leaderboard/bfcl/evaluate.py index ec0b557c1..e69de29bb 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluate.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluate.py @@ -1,518 +0,0 @@ -import sys - -sys.path.append("../") - -from checker import ast_checker, exec_checker, executable_checker_rest -from eval_runner_helper import * -from tqdm import tqdm -import argparse - - -# NOTE: This file should be run in the `eval_checker` directory - - -def single_executable_file_runner( - handler, model_result, prompt, model_name, test_category -): - assert len(model_result) == len(prompt) - - result = [] - correct_count = 0 - for i in tqdm(range(len(model_result)), desc="Running tests"): - raw_result = model_result[i]["result"] - try: - decoded_result = handler.decode_execute(raw_result) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Failed to decode executable. {str(e)}"], - "error_type": "executable_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": raw_result, - } - ) - continue - - if "rest" in test_category: - # REST is always single-functioned. Therefore we take the first one and pass it to the REST checker. - if not is_rest_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "executable_decoder:rest_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - checker_result = executable_checker_rest(decoded_result[0], i) - - else: - if not is_executable_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." 
- ], - "error_type": "executable_decoder:wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - prompt_item = prompt[i] - checker_result = exec_checker(decoded_result, prompt_item, test_category) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = raw_result - temp["model_result_decoded"] = decoded_result - if "model_executed_output" in checker_result: - temp["model_executed_output"] = checker_result["model_executed_output"] - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_relevance_file_runner(handler, model_result, model_name, test_category): - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - success = False - decoded_result = None - - try: - decoded_result = handler.decode_ast(model_result_item, language="Python") - success = False - if is_empty_output(decoded_result): - success = True - - except Exception as e: - success = True - - if success: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = success - temp["error"] = [ - f"Valid syntax. Successfully decode AST when it should not." - ] - temp["error_type"] = "relevance_error:decoder_success" - temp["model_result"] = model_result_item - temp["decoded_result"] = decoded_result - - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_ast_file_runner( - handler, model_result, prompt, possible_answer, language, test_category, model_name -): - assert ( - len(model_result) == len(prompt) == len(possible_answer) - ), "The length of the model result does not match the length of the prompt or possible answer. Please check the input files for completeness." - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - prompt_item = prompt[i]["function"] - possible_answer_item = possible_answer[i] - - try: - model_result_item_raw = model_result_item - model_result_item = handler.decode_ast(model_result_item, language) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Invalid syntax. Failed to decode AST. 
{str(e)}"], - "error_type": "ast_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": model_result_item_raw, - "possible_answer": possible_answer_item, - } - ) - continue - - decoder_output_valid = is_function_calling_format_output(model_result_item) - if not decoder_output_valid: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "ast_decoder:decoder_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(model_result_item_raw), - "model_result_decoded": str(model_result_item), - "possible_answer": possible_answer_item, - } - ) - continue - - checker_result = ast_checker( - prompt_item, - model_result_item, - possible_answer_item, - language, - test_category, - model_name, - ) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = model_result_item_raw - temp["model_result_decoded"] = model_result_item - temp["possible_answer"] = possible_answer_item - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -#### Main runner function #### -def runner(model_names, test_categories, api_sanity_check): - - # A flag to indicate if the API has been tested. - # We should always test the API with ground truth first before running the executable tests. - # Sometimes the API may not be working as expected and we want to catch that before running the evaluation to ensure the results are accurate. - API_TESTED = False - - # Before running the executable evaluation, we need to get the expected output from the ground truth. - # So we need a list of all the test categories that we have ran the ground truth evaluation on. - # We only get the expected output once for each test category. - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] - - # Get a list of all entries in the folder - entries = os.scandir(INPUT_PATH) - - # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] - - # Traverse each subdirectory - for subdir in subdirs: - - model_name = subdir.split(INPUT_PATH)[1] - if model_names is not None and model_name not in model_names: - continue - - model_name_escaped = model_name.replace("_", "/") - - files = [ - f - for f in os.listdir(subdir) - if os.path.isfile(os.path.join(subdir, f)) and not f.startswith(".") - ] - # Check if there is only one file and that file is 'result.json' - # If so, this is an OSS model result file and we need to special process it first - if len(files) == 1 and files[0] == "result.json": - result_json_file_path = os.path.join(subdir, "result.json") - oss_file_formatter(result_json_file_path, subdir) - print( - f"Detected OSS model: {model_name}. result.json has been split into individual test category files." 
- ) - - # Pattern to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - - print(f"🦍 Model: {model_name}") - - # Find and process all JSON files in the subdirectory - for model_result_json in glob.glob(json_files_pattern): - - if os.path.basename(model_result_json) == "result.json": - continue - - test_category = extract_after_test(model_result_json) - if test_categories is not None and test_category not in test_categories: - continue - - handler = get_handler(model_name_escaped) - - # We don't evaluate chatable and SQL models in our current leaderboard - if is_chatable(test_category) or is_sql(test_category): - continue - - language = "Python" - if is_java(test_category): - language = "Java" - if is_js(test_category): - language = "JavaScript" - - print(f"🔍 Running test: {test_category}") - - model_result = load_file(model_result_json) - record_cost_latency(LEADERBOARD_TABLE, model_name, model_result) - - if is_relevance(test_category): - accuracy, total_count = single_relevance_file_runner( - handler, model_result, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - continue - - # Find the corresponding test file - prompt_file = find_file_with_suffix(PROMPT_PATH, test_category) - prompt = load_file(prompt_file) - - if is_executable(test_category): - # We only test the API with ground truth once - if not API_TESTED and api_sanity_check: - print("---- Sanity checking API status ----") - api_status_sanity_check_rest() - api_status_sanity_check_executable() - print("---- Sanity check Passed 💯 ----") - API_TESTED = True - - if ( - test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - and not is_rest(test_category) - ): - print( - f"---- Getting real-time execution result from ground truth for {test_category} ----" - ) - get_executable_expected_output(prompt_file) - print( - f"---- Ground truth real-time execution result obtained for {test_category} 🌟 ----" - ) - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN.append(test_category) - # Need to re-load the prompt file after getting the expected output, as the prompt file has been updated - prompt = load_file(prompt_file) - - accuracy, total_count = single_executable_file_runner( - handler, model_result, prompt, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - continue - - # Find the corresponding possible answer file - possible_answer_file = find_file_with_suffix( - POSSIBLE_ANSWER_PATH, test_category - ) - possible_answer = load_file(possible_answer_file) - accuracy, total_count = single_ast_file_runner( - handler, - model_result, - prompt, - possible_answer, - language, - test_category, - model_name, - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - # This function reads all the score files from local folder and updates the leaderboard table. - # This is helpful when you only want to run the evaluation for a subset of models and test categories. 
- update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) - # Write the leaderboard table to a file - generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH) - - # Clean up the executable expected output files - # They should be re-generated the next time the evaluation is run - clean_up_executable_expected_output( - PROMPT_PATH, EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - ) - - -ARG_PARSE_MAPPING = { - "ast": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - ], - "executable": [ - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "all": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "non-python": [ - "java", - "javascript", - ], - "python": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], -} - - -INPUT_PATH = "../result/" -PROMPT_PATH = "../data/" -POSSIBLE_ANSWER_PATH = "../data/possible_answer/" -OUTPUT_PATH = "../score/" - -# A dictionary to store the results -# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count -LEADERBOARD_TABLE = {} - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Process two lists of strings.") - - # Add arguments for two lists of strings - parser.add_argument( - "--model", nargs="+", type=str, help="A list of model names to evaluate" - ) - parser.add_argument( - "--test-category", - nargs="+", - type=str, - help="A list of test categories to run the evaluation on", - ) - parser.add_argument( - "-s", - "--skip-api-sanity-check", - action="store_false", - default=True, # Default value is True, meaning the sanity check is performed unless the flag is specified - help="Skip the REST API status sanity check before running the evaluation. By default, the sanity check is performed.", - ) - - args = parser.parse_args() - - api_sanity_check = args.skip_api_sanity_check - test_categories = None - if args.test_category is not None: - test_categories = [] - for test_category in args.test_category: - if test_category in ARG_PARSE_MAPPING: - test_categories.extend(ARG_PARSE_MAPPING[test_category]) - else: - test_categories.append(test_category) - - model_names = args.model - if args.model is not None: - model_names = [] - for model_name in args.model: - # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. - # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). - # We patch it here to avoid confusing the user. 
- model_names.append(model_name.replace("/", "_")) - - runner(model_names, test_categories, api_sanity_check) From 88e84624be008c73ce6baa16eb71f114a9cf0888 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 20:58:15 -0400 Subject: [PATCH 17/35] Standardize test groups - Use same test groups for benchmarking and evaluation - Add a custom enum class with intuitive methods to dynamically create test groups - Use custom enum to reduce manually creation of test groups - Update benchmark cli args to accept test group argument - Add pydantic validator to validate test group and test categories --- .../bfcl/benchmark.py | 32 ++++-- .../bfcl/types.py | 100 +++++++++++------- .../bfcl/utils.py | 28 +++++ 3 files changed, 111 insertions(+), 49 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/utils.py diff --git a/berkeley-function-call-leaderboard/bfcl/benchmark.py b/berkeley-function-call-leaderboard/bfcl/benchmark.py index d0fd2d1d0..ea8487de1 100644 --- a/berkeley-function-call-leaderboard/bfcl/benchmark.py +++ b/berkeley-function-call-leaderboard/bfcl/benchmark.py @@ -5,7 +5,7 @@ from bfcl.model_handler.base import BaseHandler, ModelStyle from bfcl.types import (LeaderboardCategory, LeaderboardCategories, - LeaderboardVersion, ModelType) + LeaderboardVersion, ModelType, LeaderboardCategoryGroup) load_dotenv() @@ -37,18 +37,25 @@ def get_args() -> argparse.Namespace: parser.add_argument( '--model-type', type=ModelType, - choices=[category.value for category in ModelType], + choices=[mtype.value for mtype in ModelType], default=ModelType.PROPRIETARY.value, help="Model type: Open-source or Proprietary (default: 'proprietary')" ) parser.add_argument( - '--test-category', + '--test-group', + type=LeaderboardCategoryGroup, + choices=[group.value for group in LeaderboardCategoryGroup], + default=None, + help='Test category group (default: None)' + ) + parser.add_argument( + '--test-categories', type=str, - default=LeaderboardCategory.ALL.value, + default=None, help=( 'Comma-separated list of test categories ' - f"({','.join(category.value for category in LeaderboardCategory)}). " - "(default: 'all')" + f"({','.join(cat.value for cat in LeaderboardCategory)}). 
" + "(default: None)" ) ) parser.add_argument( @@ -68,15 +75,18 @@ def get_args() -> argparse.Namespace: def _get_test_categories(args) -> LeaderboardCategories: - if args.test_category == LeaderboardCategory.ALL.value: - categories = [category for category in LeaderboardCategory if category != LeaderboardCategory.ALL] - else: + if args.test_categories: categories = [] - for value in args.test_category.split(','): + for value in args.test_categories.split(','): if value not in LeaderboardCategory._value2member_map_: raise ValueError(f'Invalid test category: "{value}"!') categories.append(LeaderboardCategory(value)) - return LeaderboardCategories(categories=categories, version=args.version) + args.test_categories = categories + return LeaderboardCategories( + test_group=args.test_group, + test_categories=args.test_categories, + version=args.version + ) def _get_model_handler(args) -> BaseHandler: diff --git a/berkeley-function-call-leaderboard/bfcl/types.py b/berkeley-function-call-leaderboard/bfcl/types.py index 35de37945..c087c81a3 100644 --- a/berkeley-function-call-leaderboard/bfcl/types.py +++ b/berkeley-function-call-leaderboard/bfcl/types.py @@ -1,71 +1,95 @@ import json import hashlib from enum import Enum -from typing import Any, List, Dict from pathlib import Path +from typing import Any, List, Dict, Type -from pydantic import BaseModel +from pydantic import BaseModel, model_validator from huggingface_hub import hf_hub_download +from bfcl.utils import CustomEnum + class ModelType(str, Enum): OSS = 'oss' PROPRIETARY = 'proprietary' +class LeaderboardNonPythonCategory(str, CustomEnum): + JAVA = 'java' + JAVASCRIPT = 'javascript' -class LeaderboardExecutableCategory(str, Enum): - EXEC_SIMPLE = 'executable_simple' - EXEC_PARALLEL_FUNCTION = 'executable_parallel_function' - EXEC_MULTIPLE_FUNCTION = 'executable_multiple_function' - EXEC_PARALLEL_MULTIPLE_FUNCTION = 'executable_parallel_multiple_function' - REST = 'rest' - - -class LeaderboardAstCategory(str, Enum): +class LeaderboardAstCategory(str, CustomEnum): SIMPLE = 'simple' RELEVANCE = 'relevance' - PARALLEL_FUNCTION = 'parallel_function' MULTIPLE_FUNCTION = 'multiple_function' + PARALLEL_FUNCTION = 'parallel_function' PARALLEL_MULTIPLE_FUNCTION = 'parallel_multiple_function' - JAVA = 'java' - JAVASCRIPT = 'javascript' - - -class LeaderboardCategory(str, Enum): - EXEC_SIMPLE = LeaderboardExecutableCategory.EXEC_SIMPLE.value - EXEC_PARALLEL_FUNCTION = LeaderboardExecutableCategory.EXEC_PARALLEL_FUNCTION.value - EXEC_MULTIPLE_FUNCTION = LeaderboardExecutableCategory.EXEC_MULTIPLE_FUNCTION.value - EXEC_PARALLEL_MULTIPLE_FUNCTION = LeaderboardExecutableCategory.EXEC_PARALLEL_MULTIPLE_FUNCTION.value - REST = LeaderboardExecutableCategory.REST.value - SIMPLE = LeaderboardAstCategory.SIMPLE.value - RELEVANCE = LeaderboardAstCategory.RELEVANCE.value - PARALLEL_FUNCTION = LeaderboardAstCategory.PARALLEL_FUNCTION.value - MULTIPLE_FUNCTION = LeaderboardAstCategory.MULTIPLE_FUNCTION.value - PARALLEL_MULTIPLE_FUNCTION = LeaderboardAstCategory.PARALLEL_MULTIPLE_FUNCTION.value - JAVA = LeaderboardAstCategory.JAVA.value - JAVASCRIPT = LeaderboardAstCategory.JAVASCRIPT.value - SQL = 'sql' - CHATABLE = 'chatable' - ALL = 'all' # Adding the 'ALL' category + JAVA = LeaderboardNonPythonCategory.JAVA.value + JAVASCRIPT = LeaderboardNonPythonCategory.JAVASCRIPT.value + +class LeaderboardExecutableCategory(str, CustomEnum): + EXECUTABLE_SIMPLE = 'executable_simple' + EXECUTABLE_PARALLEL_FUNCTION = 'executable_parallel_function' + 
EXECUTABLE_MULTIPLE_FUNCTION = 'executable_multiple_function' + EXECUTABLE_PARALLEL_MULTIPLE_FUNCTION = 'executable_parallel_multiple_function' + REST = 'rest' +LeaderboardPythonCategory: Type[CustomEnum] = ( + LeaderboardAstCategory + .add(LeaderboardExecutableCategory) + .subtract(LeaderboardNonPythonCategory) + .rename('LeaderboardPythonCategory') +) + +LeaderboardCategory: Type[CustomEnum] = ( + LeaderboardPythonCategory + .add(LeaderboardNonPythonCategory) + .rename('LeaderboardCategory') + .update(dict(SQL='sql', CHATABLE='chatable')) +) + +class LeaderboardCategoryGroup(str, Enum): + AST = 'ast' + EXECUTABLE = 'executable' + NON_PYTHON = 'non_python' + PYTHON = 'python' + ALL = 'all' + +CATEGORY_GROUP_MAPPING = { + LeaderboardCategoryGroup.AST: LeaderboardAstCategory, + LeaderboardCategoryGroup.EXECUTABLE: LeaderboardExecutableCategory, + LeaderboardCategoryGroup.NON_PYTHON: LeaderboardNonPythonCategory, + LeaderboardCategoryGroup.PYTHON: LeaderboardPythonCategory, + LeaderboardCategoryGroup.ALL: LeaderboardCategory +} class LeaderboardVersion(str, Enum): V1 = 'v1' class LeaderboardCategories(BaseModel): - categories: List[LeaderboardCategory] + test_group: LeaderboardCategoryGroup | None = None + test_categories: List[LeaderboardCategory] | None = None # type: ignore version: LeaderboardVersion = LeaderboardVersion.V1 cache_dir: Path | str = '.cache' + @model_validator(mode='before') + @classmethod + def check_either_field_provided(cls, values): + if values.get('test_group') is not None and values.get('test_categories') is not None: + raise ValueError("Provide either 'test_group' or 'test_categories', not both") + elif values.get('test_group') is None and values.get('test_categories') is None: + raise ValueError("Provide either 'test_group' or 'test_categories'") + return values + def model_post_init(self, __context: Any) -> None: - if LeaderboardCategory.ALL in self.categories: - self.categories = [cat for cat in LeaderboardCategory if cat != LeaderboardCategory.ALL] + if self.test_group: + self.test_categories = [cat for cat in CATEGORY_GROUP_MAPPING[self.test_group]] self.cache_dir = Path.cwd() / self.cache_dir - + @property def output_file_path(self) -> Path: - uid = self._generate_hash(self.model_dump_json()) + uid = self._generate_hash(self.model_dump_json(warnings=False)) file_name = f'{uid}.jsonl' return self.cache_dir / file_name @@ -98,7 +122,7 @@ def load_data(self) -> List[Dict]: def _get_test_data(self): template = f'gorilla_openfunctions_{self.version.value}_test_{{}}.json' - for category in self.categories: + for category in self.test_categories: file_path = hf_hub_download( repo_id='gorilla-llm/Berkeley-Function-Calling-Leaderboard', filename=template.format(category.value), diff --git a/berkeley-function-call-leaderboard/bfcl/utils.py b/berkeley-function-call-leaderboard/bfcl/utils.py new file mode 100644 index 000000000..f073f3a2b --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/utils.py @@ -0,0 +1,28 @@ +from enum import Enum + + +class CustomEnum(Enum): + @classmethod + def add(cls, other): + combined_members = {member.name: member.value for member in cls} + combined_members.update({member.name: member.value for member in other}) + return __class__(cls.__name__, combined_members) + + @classmethod + def subtract(cls, other): + remaining_members = { + member.name: member.value + for member in cls if member.value not in other._value2member_map_ + } + return __class__(cls.__name__, remaining_members) + + @classmethod + def rename(cls, new_name): + members 
= {member.name: member.value for member in cls} + return __class__(new_name, members) + + @classmethod + def update(cls, new_members): + members = {member.name: member.value for member in cls} + members.update(new_members) + return __class__(cls.__name__, members) \ No newline at end of file From cb7349a256b04a17393e076ff4b6879be6d476d5 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Tue, 9 Jul 2024 19:32:50 -0400 Subject: [PATCH 18/35] Improve test data downloading and saving model responses - Load original json test data files - Add `id` and `test_category` keys to each example - Save model responses for each test category in a separate file --- .../bfcl/benchmark.py | 16 +++++-- .../bfcl/model_handler/oss_model/base.py | 17 +++---- .../bfcl/types.py | 47 ++++++------------- 3 files changed, 35 insertions(+), 45 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/benchmark.py b/berkeley-function-call-leaderboard/bfcl/benchmark.py index ea8487de1..6a2505b34 100644 --- a/berkeley-function-call-leaderboard/bfcl/benchmark.py +++ b/berkeley-function-call-leaderboard/bfcl/benchmark.py @@ -17,11 +17,19 @@ def main() -> None: test_categories = _get_test_categories(args) model_handler = _get_model_handler(args) - test_inputs = test_categories.load_data() + test_category_to_data = test_categories.load_test_data() if model_handler.model_style == ModelStyle.OSS_MODEL: - responses = model_handler.inference(inputs=test_inputs, num_gpus=args.num_gpus) - file_name = test_categories.output_file_path.name.replace('.jsonl', '_result.jsonl') - model_handler.write(responses, file_name) + # Combine all samples to use GPUs efficiently + test_inputs = sum(test_category_to_data.values(), []) + combined_responses = model_handler.inference(inputs=test_inputs, num_gpus=args.num_gpus) + # Collect all the responses for each test category + test_category_to_responses = {} + for response in combined_responses: + test_category_to_responses.setdefault(response['test_category'], []).append(response) + # Save responses for each test category + for test_category, responses in test_category_to_responses.items(): + file_name = test_categories.get_file_name(test_category).replace('.json', '_result.jsonl') + model_handler.write(responses, file_name) else: raise NotImplementedError() diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py index e9e479161..f27b1d2af 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base.py @@ -1,4 +1,5 @@ import json +from typing import List, Dict import ray import torch @@ -29,7 +30,7 @@ def supported_models(cls): def _init_model(self) -> None: ray.init(ignore_reinit_error=True, num_cpus=8) - def get_prompt(self, user_input, functions, test_category) -> str: + def get_prompt(self, user_input, functions) -> str: if isinstance(functions, list): functions = json.dumps(functions) return self.prompt_template.format( @@ -38,7 +39,7 @@ def get_prompt(self, user_input, functions, test_category) -> str: user_input=user_input, ) - def inference(self, inputs, num_gpus): + def inference(self, inputs, num_gpus) -> List[Dict]: chunk_size = len(inputs) // num_gpus futures = [] for i in range(0, len(inputs), chunk_size): @@ -79,16 +80,16 @@ def _batch_generate( ): prompts = [] for _input in inputs: - test_category = _input["test_category"] - prompt = 
utils.augment_prompt_by_languge(_input["question"], test_category) - functions = utils.language_specific_pre_processing(_input["function"], test_category, False) - prompts.append(get_prompt_func(prompt, functions, test_category)) + test_category = _input['test_category'] + prompt = utils.augment_prompt_by_languge(_input['question'], test_category) + functions = utils.language_specific_pre_processing(_input['function'], test_category, False) + prompts.append(get_prompt_func(prompt, functions)) print(f'Getting responses for {len(prompts)} samples...') - llm = LLM(model=model_path, dtype="float16", trust_remote_code=True) + llm = LLM(model=model_path, dtype='float16', trust_remote_code=True) outputs = llm.generate(prompts, sampling_params) responses = [ - dict(id=_input['id'], response=output.outputs[0].text) + dict(id=_input['id'], test_category=_input['test_category'], response=output.outputs[0].text) for output, _input in zip(outputs, inputs) ] return responses diff --git a/berkeley-function-call-leaderboard/bfcl/types.py b/berkeley-function-call-leaderboard/bfcl/types.py index c087c81a3..406ed90cb 100644 --- a/berkeley-function-call-leaderboard/bfcl/types.py +++ b/berkeley-function-call-leaderboard/bfcl/types.py @@ -87,49 +87,30 @@ def model_post_init(self, __context: Any) -> None: self.test_categories = [cat for cat in CATEGORY_GROUP_MAPPING[self.test_group]] self.cache_dir = Path.cwd() / self.cache_dir - @property - def output_file_path(self) -> Path: - uid = self._generate_hash(self.model_dump_json(warnings=False)) - file_name = f'{uid}.jsonl' - return self.cache_dir / file_name - - def load_data(self) -> List[Dict]: - data = [] - if self.output_file_path.exists(): - print(f'Loading test data from "{self.output_file_path}" 🦍') - # Load cached data - with open(self.output_file_path, 'r') as file: + def load_test_data(self) -> Dict[LeaderboardCategory, List[Dict]]: # type: ignore + data = {} + for test_category, file_path in self._get_test_data(): + data[test_category] = [] + with open(file_path, 'r') as file: for line in file: item = json.loads(line) - data.append(item) - else: - # Load data for each test category - for category, file_path in self._get_test_data(): - with open(file_path, 'r') as file: - for line in file: - item = json.loads(line) - item['test_category'] = category.value - item['id'] = self._generate_hash(json.dumps(item)) - data.append(item) - - # Save data - with open(self.output_file_path, 'w') as file: - for item in data: - file.write(json.dumps(item) + '\n') - print(f'Test data successfully saved at "{self.output_file_path}" 🦍') - + item['test_category'] = test_category.value + item['id'] = self._generate_hash(json.dumps(item)) + data[test_category].append(item) return data + + def get_file_name(self, test_category: LeaderboardCategory) -> str: # type: ignore + return f'gorilla_openfunctions_{self.version.value}_test_{test_category.value}.json' def _get_test_data(self): - template = f'gorilla_openfunctions_{self.version.value}_test_{{}}.json' - for category in self.test_categories: + for test_category in self.test_categories: file_path = hf_hub_download( repo_id='gorilla-llm/Berkeley-Function-Calling-Leaderboard', - filename=template.format(category.value), + filename=self.get_file_name(test_category), repo_type='dataset', cache_dir=self.cache_dir ) - yield category, file_path + yield test_category, file_path def _generate_hash(self, input_str) -> str: hash_object = hashlib.sha256(input_str.encode('utf-8')) From a4a1c4fc67a9cf489e7fcae854cd58fd75be910f Mon Sep 17 
00:00:00 2001 From: Devansh Amin Date: Tue, 9 Jul 2024 21:10:57 -0400 Subject: [PATCH 19/35] Support benchmarking of proprietary models --- .../bfcl/benchmark.py | 32 +++++++++++++++++-- .../bfcl/model_handler/base.py | 20 ++++++------ .../bfcl/model_handler/utils.py | 14 ++++---- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/benchmark.py b/berkeley-function-call-leaderboard/bfcl/benchmark.py index 6a2505b34..c62dfe825 100644 --- a/berkeley-function-call-leaderboard/bfcl/benchmark.py +++ b/berkeley-function-call-leaderboard/bfcl/benchmark.py @@ -1,6 +1,8 @@ import os +import json import argparse +from tqdm import tqdm from dotenv import load_dotenv from bfcl.model_handler.base import BaseHandler, ModelStyle @@ -18,6 +20,8 @@ def main() -> None: test_categories = _get_test_categories(args) model_handler = _get_model_handler(args) test_category_to_data = test_categories.load_test_data() + get_file_name = lambda cat: test_categories.get_file_name(cat).replace('.json', '_result.jsonl') + print('Getting model responses...') if model_handler.model_style == ModelStyle.OSS_MODEL: # Combine all samples to use GPUs efficiently test_inputs = sum(test_category_to_data.values(), []) @@ -28,10 +32,32 @@ def main() -> None: test_category_to_responses.setdefault(response['test_category'], []).append(response) # Save responses for each test category for test_category, responses in test_category_to_responses.items(): - file_name = test_categories.get_file_name(test_category).replace('.json', '_result.jsonl') - model_handler.write(responses, file_name) + model_handler.write(responses, file_name=get_file_name(test_category)) else: - raise NotImplementedError() + # Proprietary models + for test_category, test_inputs in test_category_to_data.items(): + # Check if model responses are already available for the test category + file_name = get_file_name(test_category) + responses = model_handler.load_model_responses(file_name) + if responses is not None and len(responses) == len(test_inputs): + continue + response_ids = set(rp['id'] for rp in responses) if responses else None + file_path = model_handler.model_dir / file_name + with open(file_path, 'a+') as file: + for test_input in tqdm(test_inputs, total=len(test_inputs), desc=f'{test_category.value}'): + if response_ids and test_input['id'] in response_ids: + continue + # TODO: Handle rate limits + try: + response, metadata = model_handler.inference( + prompt=test_input['question'], + functions=test_input['function'], + test_category=test_category, + ) + row = dict(id=test_input['id'], response=response, **metadata) + file.write(json.dumps(row) + '\n') + except Exception as e: + print('Failed to get response! 
Error:', e) def get_args() -> argparse.Namespace: diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/base.py b/berkeley-function-call-leaderboard/bfcl/model_handler/base.py index 0af6a099b..5db2ea6d6 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/base.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/base.py @@ -35,6 +35,8 @@ def __init__( self.result_dir = Path.cwd() / 'result' self.result_dir.mkdir(exist_ok=True) + self.model_dir = self.result_dir / self.model_name.replace('/', '--') + self.model_dir.mkdir(exist_ok=True) @classmethod @abstractmethod @@ -56,20 +58,20 @@ def decode_execute(self, result): """Takes raw model output and converts it to the standard execute checker input.""" pass - def write(self, responses: List[Dict], file_name): + def write(self, responses: List[Dict], file_name: str) -> None: """Write the model responses to the file.""" - model_dir = self.result_dir / self.model_name.replace('/', '--') - model_dir.mkdir(exist_ok=True, parents=True) - file_path = model_dir / file_name + file_path = self.model_dir / file_name with open(file_path, 'w') as file: for response in responses: file.write(json.dumps(response) + '\n') print(f'Saved model responses at "{file_path}".') - def load_result(self, file_path): - """Load the result from the file.""" + def load_model_responses(self, file_name: str) -> List[Dict] | None: + """Load the model responses if available.""" - with open(file_path, 'r') as f: - result = [json.loads(line) for line in f] - return result + file_path = self.model_dir / file_name + if file_path.exists(): + with open(file_path, 'r') as f: + result = [json.loads(line) for line in f] + return result diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py b/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py index a7d977c00..a2a37c07e 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py @@ -339,12 +339,12 @@ def augment_prompt_by_languge(prompt, test_category): return prompt -def language_specific_pre_processing(function, test_category, string_param): - if type(function) is dict: - function = [function] - if len(function) == 0: - return function - for item in function: +def language_specific_pre_processing(functions, test_category, string_param): + if isinstance(functions, (dict, str)): + functions = [functions] + if len(functions) == 0: + return functions + for item in functions: properties = item["parameters"]["properties"] if test_category == "java": for key, value in properties.items(): @@ -369,7 +369,7 @@ def language_specific_pre_processing(function, test_category, string_param): + value["type"] + " in string representation." 
) - return function + return functions def construct_tool_use_system_prompt(tools): From 795d959d6237a26837e4a216173bef9b6ac4b80c Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Tue, 9 Jul 2024 21:12:29 -0400 Subject: [PATCH 20/35] Replaced with `bfcl/benchmark.py` --- .../openfunctions_evaluation.py | 110 ------------------ 1 file changed, 110 deletions(-) delete mode 100644 berkeley-function-call-leaderboard/openfunctions_evaluation.py diff --git a/berkeley-function-call-leaderboard/openfunctions_evaluation.py b/berkeley-function-call-leaderboard/openfunctions_evaluation.py deleted file mode 100644 index cbf0f7f84..000000000 --- a/berkeley-function-call-leaderboard/openfunctions_evaluation.py +++ /dev/null @@ -1,110 +0,0 @@ -import argparse, json, os -from tqdm import tqdm -from model_handler.handler_map import handler_map -from model_handler.model_style import ModelStyle -from model_handler.constant import USE_COHERE_OPTIMIZATION - - -def get_args(): - parser = argparse.ArgumentParser() - # Refer to model_choice for supported models. - parser.add_argument("--model", type=str, default="gorilla-openfunctions-v2") - # Refer to test_categories for supported categories. - parser.add_argument("--test-category", type=str, default="all") - - # Parameters for the model that you want to test. - parser.add_argument("--temperature", type=float, default=0.7) - parser.add_argument("--top-p", type=float, default=1) - parser.add_argument("--max-tokens", type=int, default=1200) - parser.add_argument("--num-gpus", default=1, type=int) - parser.add_argument("--timeout", default=60, type=int) - - args = parser.parse_args() - return args - - -test_categories = { - "executable_simple": "gorilla_openfunctions_v1_test_executable_simple.json", - "executable_parallel_function": "gorilla_openfunctions_v1_test_executable_parallel_function.json", - "executable_multiple_function": "gorilla_openfunctions_v1_test_executable_multiple_function.json", - "executable_parallel_multiple_function": "gorilla_openfunctions_v1_test_executable_parallel_multiple_function.json", - "simple": "gorilla_openfunctions_v1_test_simple.json", - "relevance": "gorilla_openfunctions_v1_test_relevance.json", - "parallel_function": "gorilla_openfunctions_v1_test_parallel_function.json", - "multiple_function": "gorilla_openfunctions_v1_test_multiple_function.json", - "parallel_multiple_function": "gorilla_openfunctions_v1_test_parallel_multiple_function.json", - "java": "gorilla_openfunctions_v1_test_java.json", - "javascript": "gorilla_openfunctions_v1_test_javascript.json", - "rest": "gorilla_openfunctions_v1_test_rest.json", - "sql": "gorilla_openfunctions_v1_test_sql.json", -} - - -def build_handler(model_name, temperature, top_p, max_tokens): - handler = handler_map[model_name](model_name, temperature, top_p, max_tokens) - return handler - - -def load_file(test_category): - if test_category == "all": - test_cate, files_to_open = list(test_categories.keys()), list( - test_categories.values() - ) - else: - test_cate, files_to_open = [test_category], [test_categories[test_category]] - return test_cate, files_to_open - - -if __name__ == "__main__": - args = get_args() - if USE_COHERE_OPTIMIZATION and "command-r-plus" in args.model: - args.model = args.model + "-optimized" - handler = build_handler(args.model, args.temperature, args.top_p, args.max_tokens) - if handler.model_style == ModelStyle.OSSMODEL: - result = handler.inference( - question_file="eval_data_total.json", - test_category=args.test_category, - num_gpus=args.num_gpus, - ) - for 
res in result[0]: - handler.write(res, "result.json") - else: - test_cate, files_to_open = load_file(args.test_category) - for test_category, file_to_open in zip(test_cate, files_to_open): - print("Generating: " + file_to_open) - test_cases = [] - with open("./data/" + file_to_open) as f: - for line in f: - test_cases.append(json.loads(line)) - num_existing_result = 0 # if the result file already exists, skip the test cases that have been tested. - if os.path.exists( - "./result/" - + args.model.replace("/", "_") - + "/" - + file_to_open.replace(".json", "_result.json") - ): - with open( - "./result/" - + args.model.replace("/", "_") - + "/" - + file_to_open.replace(".json", "_result.json") - ) as f: - for line in f: - num_existing_result += 1 - for index, test_case in enumerate(tqdm(test_cases)): - if index < num_existing_result: - continue - user_question, functions = test_case["question"], test_case["function"] - if type(functions) is dict or type(functions) is str: - functions = [functions] - result, metadata = handler.inference( - user_question, functions, test_category - ) - result_to_write = { - "idx": index, - "result": result, - "input_token_count": metadata["input_tokens"], - "output_token_count": metadata["output_tokens"], - "latency": metadata["latency"], - } - handler.write(result_to_write, file_to_open) From c7c51672a5516553e3af32714ee134985504ac52 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Wed, 10 Jul 2024 21:31:45 -0400 Subject: [PATCH 21/35] Add relevance evaluator - Single cli entrypoint with subcommands to run benchmark and evaluation --- .../bfcl/benchmark.py | 111 ++----------- .../bfcl/cli.py | 148 ++++++++++++++++++ .../bfcl/evaluate.py | 38 +++++ .../bfcl/evaluator/__init__.py | 5 + .../bfcl/evaluator/constants.py | 111 +++++++++++++ .../bfcl/evaluator/evaluator.py | 127 +++++++++++++++ .../bfcl/evaluator/metrics.py | 75 +++++++++ .../bfcl/evaluator/utils.py | 22 +++ .../bfcl/types.py | 2 +- .../pyproject.toml | 2 +- 10 files changed, 538 insertions(+), 103 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/cli.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/__init__.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/constants.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/metrics.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/utils.py diff --git a/berkeley-function-call-leaderboard/bfcl/benchmark.py b/berkeley-function-call-leaderboard/bfcl/benchmark.py index c62dfe825..3d28ba961 100644 --- a/berkeley-function-call-leaderboard/bfcl/benchmark.py +++ b/berkeley-function-call-leaderboard/bfcl/benchmark.py @@ -1,26 +1,20 @@ -import os import json import argparse from tqdm import tqdm -from dotenv import load_dotenv -from bfcl.model_handler.base import BaseHandler, ModelStyle -from bfcl.types import (LeaderboardCategory, LeaderboardCategories, - LeaderboardVersion, ModelType, LeaderboardCategoryGroup) +from bfcl.types import Leaderboard +from bfcl.model_handler.base import ModelStyle, BaseHandler -load_dotenv() - -def main() -> None: - args = get_args() - if os.getenv('USE_COHERE_OPTIMIZATION') and 'command-r-plus' in args.model: - args.model += '-optimized' - - test_categories = _get_test_categories(args) - model_handler = _get_model_handler(args) - test_category_to_data = test_categories.load_test_data() - get_file_name = lambda cat: 
test_categories.get_file_name(cat).replace('.json', '_result.jsonl') +def benchmark( + leaderboard: Leaderboard, + model_handler: BaseHandler, + args: argparse.Namespace +) -> None: + + test_category_to_data = leaderboard.load_test_data() + get_file_name = lambda cat: leaderboard.get_file_name(cat).replace('.json', '_result.jsonl') print('Getting model responses...') if model_handler.model_style == ModelStyle.OSS_MODEL: # Combine all samples to use GPUs efficiently @@ -58,88 +52,3 @@ def main() -> None: file.write(json.dumps(row) + '\n') except Exception as e: print('Failed to get response! Error:', e) - - -def get_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument( - '--model', - type=str, - default='gorilla-openfunctions-v2', - help="Name of the LLM. (default: 'gorilla-openfunctions-v2')" - ) - parser.add_argument( - '--model-type', - type=ModelType, - choices=[mtype.value for mtype in ModelType], - default=ModelType.PROPRIETARY.value, - help="Model type: Open-source or Proprietary (default: 'proprietary')" - ) - parser.add_argument( - '--test-group', - type=LeaderboardCategoryGroup, - choices=[group.value for group in LeaderboardCategoryGroup], - default=None, - help='Test category group (default: None)' - ) - parser.add_argument( - '--test-categories', - type=str, - default=None, - help=( - 'Comma-separated list of test categories ' - f"({','.join(cat.value for cat in LeaderboardCategory)}). " - "(default: None)" - ) - ) - parser.add_argument( - '--version', - type=LeaderboardVersion, - default=LeaderboardVersion.V1.value, - choices=[category.value for category in LeaderboardVersion], - help="Leaderboard version. (default: 'v1')", - ) - parser.add_argument('--temperature', type=float, default=0.7, help='Temperature (default: 0.7)') - parser.add_argument('--top-p', type=float, default=1, help='Top-p (default: 1)') - parser.add_argument('--max-tokens', type=int, default=1000, help='Max tokens (default: 1000)') - parser.add_argument('--num-gpus', default=1, type=int, help='No. of GPUs (default: 1)') - parser.add_argument('--timeout', default=60, type=int, help='Timeout (default: 60)') - args = parser.parse_args() - return args - - -def _get_test_categories(args) -> LeaderboardCategories: - if args.test_categories: - categories = [] - for value in args.test_categories.split(','): - if value not in LeaderboardCategory._value2member_map_: - raise ValueError(f'Invalid test category: "{value}"!') - categories.append(LeaderboardCategory(value)) - args.test_categories = categories - return LeaderboardCategories( - test_group=args.test_group, - test_categories=args.test_categories, - version=args.version - ) - - -def _get_model_handler(args) -> BaseHandler: - if args.model_type == ModelType.OSS: - from bfcl.model_handler.oss_model import MODEL_TO_HANDLER_CLS - elif args.model_type == ModelType.PROPRIETARY: - from bfcl.model_handler.proprietary_model import MODEL_TO_HANDLER_CLS - - assert (handler_cls := MODEL_TO_HANDLER_CLS.get(args.model)), \ - f'Invalid model name "{args.model}"! 
Please select a {args.model_type.value} model from {tuple(MODEL_TO_HANDLER_CLS)}' - - return handler_cls( - model_name=args.model, - temperature=args.temperature, - top_p=args.top_p, - max_tokens=args.max_tokens, - ) - - -if __name__ == '__main__': - - main() \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/cli.py b/berkeley-function-call-leaderboard/bfcl/cli.py new file mode 100644 index 000000000..dd30aa8a6 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/cli.py @@ -0,0 +1,148 @@ +import os +import argparse + +from dotenv import load_dotenv + +from bfcl.benchmark import benchmark +from bfcl.evaluate import evaluate +from bfcl.model_handler.base import BaseHandler +from bfcl.types import (LeaderboardCategory, Leaderboard, LeaderboardVersion, + ModelType, LeaderboardCategoryGroup) + +load_dotenv() + + +def main(): + args = _get_args() + leaderboard = _load_leaderboard(args) + model_handler = _load_model_handler(args) + + if args.command == 'benchmark': + benchmark(leaderboard, model_handler, args) + else: + evaluate(leaderboard, model_handler, args) + + +def _get_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog='bfcl', + description='Berkeley Function Calling Leaderboard (BFCL)' + ) + + subparsers = parser.add_subparsers(dest='command', required=True, help='Sub-command to run') + + # Common arguments for both benchmark and evaluation + common_parser = argparse.ArgumentParser(add_help=False) + common_parser.add_argument( + '--model', + type=str, + default='gorilla-openfunctions-v2', + help="Name of the LLM. (default: 'gorilla-openfunctions-v2')" + ) + common_parser.add_argument( + '--model-type', + type=ModelType, + choices=[mtype.value for mtype in ModelType], + default=ModelType.PROPRIETARY.value, + help="Model type: Open-source or Proprietary (default: 'proprietary')" + ) + common_parser.add_argument( + '--test-group', + type=LeaderboardCategoryGroup, + choices=[group.value for group in LeaderboardCategoryGroup], + default=None, + help='Test category group (default: None)' + ) + common_parser.add_argument( + '--test-categories', + type=str, + default=None, + help=( + 'Comma-separated list of test categories ' + f"({','.join(cat.value for cat in LeaderboardCategory)}). " + "(default: None)" + ) + ) + common_parser.add_argument( + '--version', + type=LeaderboardVersion, + default=LeaderboardVersion.V1.value, + choices=[category.value for category in LeaderboardVersion], + help="Leaderboard version. (default: 'v1')", + ) + + _add_benchmark_args(subparsers, common_parser) + _add_evaluation_args(subparsers, common_parser) + + args = parser.parse_args() + return args + + +def _add_benchmark_args(subparsers, common_parser): + """Add benchmark-specific arguments.""" + + benchmark_parser = subparsers.add_parser('benchmark', parents=[common_parser], help='Run benchmark') + benchmark_parser.add_argument('--temperature', type=float, default=0.7, help='Temperature (default: 0.7)') + benchmark_parser.add_argument('--top-p', type=float, default=1, help='Top-p (default: 1)') + benchmark_parser.add_argument('--max-tokens', type=int, default=1000, help='Max tokens (default: 1000)') + benchmark_parser.add_argument('--num-gpus', default=1, type=int, help='No. 
of GPUs (default: 1)') + benchmark_parser.add_argument('--timeout', default=60, type=int, help='Timeout (default: 60)') + + +def _add_evaluation_args(subparsers, common_parser): + """Add evaluation-specific arguments.""" + + evaluator_parser = subparsers.add_parser('evaluate', parents=[common_parser], help='Run evaluation') + evaluator_parser.add_argument( + '--perform-api-sanity-check', + action='store_true', + default=False, + help='Perform the REST API status sanity check before running the evaluation. (default: False)', + ) + + +def _load_leaderboard(args: argparse.Namespace) -> Leaderboard: + if args.test_categories: + categories = [] + for value in args.test_categories.split(','): + if value not in LeaderboardCategory._value2member_map_: + raise ValueError(f'Invalid test category: "{value}"!') + categories.append(LeaderboardCategory(value)) + args.test_categories = categories + return Leaderboard( + test_group=args.test_group, + test_categories=args.test_categories, + version=args.version + ) + + +def _load_model_handler(args: argparse.Namespace) -> BaseHandler: + if args.model_type == ModelType.OSS: + from bfcl.model_handler.oss_model import MODEL_TO_HANDLER_CLS + elif args.model_type == ModelType.PROPRIETARY: + from bfcl.model_handler.proprietary_model import MODEL_TO_HANDLER_CLS + + if os.getenv('USE_COHERE_OPTIMIZATION') and 'command-r-plus' in args.model: + args.model += '-optimized' + + assert (handler_cls := MODEL_TO_HANDLER_CLS.get(args.model)), ( + f'Invalid model name "{args.model}"! Please select a {args.model_type.value} ' + f'model from {tuple(MODEL_TO_HANDLER_CLS)}' + ) + + # This model handler function is shared by `benchmark` and `evaluate` functions + # `evaluate` cli args doesn't required temperature, top_p and max_tokens, + # since for evaluation we won't be calling the inference method. 
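+    # (Assumption: the handler classes define their own default sampling
+    # parameters when constructed with only `model_name`.)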
+ if hasattr(args, 'temperature'): + return handler_cls( + model_name=args.model, + temperature=args.temperature, + top_p=args.top_p, + max_tokens=args.max_tokens, + ) + else: + return handler_cls(model_name=args.model) + + +if __name__ == "__main__": + main() diff --git a/berkeley-function-call-leaderboard/bfcl/evaluate.py b/berkeley-function-call-leaderboard/bfcl/evaluate.py index e69de29bb..a32c29ba0 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluate.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluate.py @@ -0,0 +1,38 @@ +import json +import argparse +from pathlib import Path + +from bfcl.evaluator import LeaderboardEvaluator +from bfcl.types import Leaderboard, LeaderboardCategory +from bfcl.model_handler.base import BaseHandler + + +def evaluate( + leaderboard: Leaderboard, + model_handler: BaseHandler, + args: argparse.Namespace +) -> None: + + print('🦍 Model:', args.model) + evaluator = LeaderboardEvaluator(model_handler=model_handler, leaderboard=leaderboard) + file_name_to_test_category = {} + for test_category in leaderboard.test_categories: + if test_category in (LeaderboardCategory.SQL, LeaderboardCategory.CHATABLE): + print(f'Evaluation for test category "{test_category.value}" is not currently supported!') + else: + file_name = leaderboard.get_file_name(test_category) + file_name_to_test_category[Path(file_name).stem] = test_category + + for file_path in model_handler.model_dir.glob('*.jsonl'): + test_category = file_name_to_test_category.get(file_path.stem.replace('_result', '')) + if test_category is None: + continue + evaluator(file_path, test_category) + + metrics = evaluator.get_leaderboard_metrics() + metrics_json = json.dumps(metrics, indent=2) + file_path = model_handler.model_dir / 'leaderboard_evaluation_result.json' + file_path.write_text(metrics_json) + print(f'Saved leaderboard evaluation result at "{file_path}"') + print('🏁 Evaluation completed.') + print(metrics_json) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/__init__.py new file mode 100644 index 000000000..f4a3e1fdf --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/__init__.py @@ -0,0 +1,5 @@ +from .evaluator import LeaderboardEvaluator + +__all__ = [ + 'LeaderboardEvaluator' +] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py b/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py new file mode 100644 index 000000000..82367595b --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py @@ -0,0 +1,111 @@ +INPUT_PRICE_PER_MILLION_TOKEN = { + "claude-3-opus-20240229-FC": 15, + "claude-3-opus-20240229": 15, + "claude-3-sonnet-20240229-FC": 3, + "claude-3-sonnet-20240229": 3, + "claude-3-haiku-20240307-FC": 0.25, + "claude-3-haiku-20240307": 0.25, + "claude-3-5-sonnet-20240620-FC": 3, + "claude-3-5-sonnet-20240620": 3, + "claude-2.1": 8, + "claude-instant-1.2": 0.8, + "mistral-large-2402-FC-Any": 4, + "mistral-large-2402-FC-Auto": 4, + "mistral-medium-2312": 2.7, + "mistral-small-2402-FC-Any": 1, + "mistral-small-2402-FC-Auto": 1, + "mistral-small-2402": 1, + "mistral-tiny-2312": 0.25, + "gpt-4o-2024-05-13-FC": 5, + "gpt-4o-2024-05-13": 5, + "gpt-4-1106-preview-FC": 10, + "gpt-4-1106-preview": 10, + "gpt-4-0125-preview": 10, + "gpt-4-0125-preview-FC": 10, + "gpt-4-turbo-2024-04-09-FC": 10, + "gpt-4-turbo-2024-04-09": 10, + "gpt-4-0613": 30, + "gpt-4-0613-FC": 30, + 
"gpt-3.5-turbo-0125": 0.5, + "gpt-3.5-turbo-0125-FC": 0.5, + "gemini-1.0-pro": 0.5, + "gemini-1.5-pro-preview-0409": 3.5, + "gemini-1.5-pro-preview-0514": 3.5, + "gemini-1.5-flash-preview-0514": 0.35, + "databricks-dbrx-instruct": 2.25, + "command-r-plus-FC": 3, + "command-r-plus": 3, + "command-r-plus-FC-optimized": 3, + "command-r-plus-optimized": 3, +} + +OUTPUT_PRICE_PER_MILLION_TOKEN = { + "claude-3-opus-20240229-FC": 75, + "claude-3-opus-20240229": 75, + "claude-3-sonnet-20240229-FC": 15, + "claude-3-sonnet-20240229": 15, + "claude-3-5-sonnet-20240620-FC": 15, + "claude-3-5-sonnet-20240620": 15, + "claude-3-haiku-20240307-FC": 1.25, + "claude-3-haiku-20240307": 1.25, + "claude-2.1": 24, + "claude-instant-1.2": 2.4, + "mistral-large-2402-FC-Any": 12, + "mistral-large-2402-FC-Auto": 12, + "mistral-small-2402": 3, + "mistral-medium-2312": 8.1, + "mistral-small-2402-FC-Any": 3, + "mistral-small-2402-FC-Auto": 3, + "mistral-tiny-2312": 0.25, + "gpt-4o-2024-05-13-FC": 15, + "gpt-4o-2024-05-13": 15, + "gpt-4-turbo-2024-04-09-FC": 30, + "gpt-4-turbo-2024-04-09": 30, + "gpt-4-1106-preview": 30, + "gpt-4-1106-preview-FC": 30, + "gpt-4-0125-preview-FC": 30, + "gpt-4-0125-preview": 30, + "gpt-4-0613": 60, + "gpt-4-0613-FC": 60, + "gpt-3.5-turbo-0125": 1.5, + "gpt-3.5-turbo-0125-FC": 1.5, + "gemini-1.0-pro": 1.5, + "gemini-1.5-pro-preview-0409": 10.50, + "gemini-1.5-pro-preview-0514": 10.50, + "gemini-1.5-flash-preview-0514": 0.53, + "databricks-dbrx-instruct": 6.75, + "command-r-plus-FC": 15, + "command-r-plus": 15, + "command-r-plus-FC-optimized": 15, + "command-r-plus-optimized": 15, +} + +# The latency of the open-source models are hardcoded here. +# Because we do batching when generating the data, so the latency is not +# accurate from the result data. +# This is the latency for the whole batch of data, when using 8 V100 GPUs. 
+OSS_LATENCY = { + "deepseek-ai/deepseek-coder-6.7b-instruct": 909, + "google/gemma-7b-it": 95, + "NousResearch/Hermes-2-Pro-Mistral-7B": 135, + "meta-llama/Meta-Llama-3-8B-Instruct": 73, + "meta-llama/Meta-Llama-3-70B-Instruct": 307, + "gorilla-openfunctions-v2": 83, + "THUDM/glm-4-9b-chat": 223 +} + +# Price got from Azure, 22.032 per hour for 8 V100, Pay As You Go Total Price +# Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/ +V100_x8_PRICE_PER_HOUR = 22.032 + +NO_COST_MODELS = [ + "Nexusflow-Raven-v2", + "firefunction-v1-FC", + "firefunction-v2-FC", + "meetkai/functionary-medium-v2.4-FC", + "meetkai/functionary-small-v2.2-FC", + "meetkai/functionary-small-v2.4-FC", + "snowflake/arctic", + "nvidia/nemotron-4-340b-instruct", + "THUDM/glm-4-9b-chat", +] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py new file mode 100644 index 000000000..d5255098e --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py @@ -0,0 +1,127 @@ +import json +from pathlib import Path +from typing import List, Dict, Any + +from pydantic import BaseModel + +from bfcl.model_handler.base import BaseHandler +from bfcl.types import Leaderboard, LeaderboardCategory +from bfcl.evaluator.metrics import LeaderboardModelMetrics +from bfcl.evaluator import utils as evaluator_utils + + +class FailedResult(BaseModel): + example_id: str + test_category: str + is_valid: bool + error_type: str + error_message: str + llm_response: str + decoded_result: Any + + +class LeaderboardEvaluator: + def __init__(self, model_handler: BaseHandler, leaderboard: Leaderboard) -> None: + self.model_name = model_handler.model_name + self.model_handler = model_handler + self.leaderboard = leaderboard + self._model_metrics = LeaderboardModelMetrics(self.model_name) + self._test_category_to_metrics = {} + + def __call__(self, file_path: Path, test_category) -> None: + model_responses = self.model_handler.load_model_responses(file_path.name) + if model_responses is None: + print(f'Skipping evaluation of test category "{test_category.value}" due to empty model responses!') + return + + if test_category == LeaderboardCategory.JAVA: + language = 'java' + elif test_category == LeaderboardCategory.JAVASCRIPT: + language = 'javascript' + else: + language = 'python' + + print('🔍 Running test:', test_category.value) + self._model_metrics(model_responses) + + accuracy = None + if test_category == LeaderboardCategory.RELEVANCE: + result = self.run_relevance_evaluator(model_responses) + accuracy = result['accuracy'] + + self._test_category_to_metrics[test_category] = dict( + accuracy=accuracy, + total_count=result['total_count'] + ) + print(f"✅ Test completed: {test_category.value} | 🎯 Accuracy: {accuracy:.4f}") + + def get_leaderboard_metrics(self) -> Dict: + model_metrics = self._model_metrics.compute() + total_count = 0 + weighted_total_accuracy = unweighted_total_accuracy = 0 + test_category_to_accuracy = {} + for test_category, metrics in self._test_category_to_metrics.items(): + test_category_to_accuracy[test_category.value] = metrics['accuracy'] + total_count += metrics['total_count'] + weighted_total_accuracy += metrics['accuracy'] * metrics['total_count'] + unweighted_total_accuracy += metrics['accuracy'] + return dict( + overall_accuracy_weighted=weighted_total_accuracy / total_count, + overall_accuracy_unweighted=unweighted_total_accuracy / 
len(self._test_category_to_metrics), + **test_category_to_accuracy, + **model_metrics, + ) + + def run_relevance_evaluator(self, model_responses: List[Dict]) -> Dict: + """Run function relevance detection. + + In relevance detection, we design a scenario where none of the provided functions + are relevant and supposed to be invoked. We expect the model's output to be no + function call.""" + + failed_model_responses = [] + correct_count = 0 + for response in model_responses: + model_response = response['response'] + success = False + decoded_result = None + try: + decoded_result = self.model_handler.decode_ast(model_response, language='python') + success = evaluator_utils.is_empty_output(decoded_result) + except Exception: + success = True + + if success: + correct_count += 1 + else: + result = FailedResult( + example_id=response['id'], + test_category=LeaderboardCategory.RELEVANCE.value, + is_valid=False, + error_type='relevance_error:decoder_success', + error_message='Valid syntax. Successfully decode AST when it should not.', + llm_response=model_response, + decoded_result=decoded_result, + ) + failed_model_responses.append(result) + + result = dict( + accuracy=correct_count / len(model_responses), + correct_count=correct_count, + total_count=len(model_responses), + failed_model_responses=failed_model_responses, + ) + self._save_scores(LeaderboardCategory.RELEVANCE, result) + return result + + def _save_scores(self, test_category, result) -> None: + if ( + (failed_model_responses := result.get('failed_model_responses')) + and isinstance(failed_model_responses[0], FailedResult) + ): + result['failed_model_responses'] = [rp.model_dump() for rp in failed_model_responses] + + file_name = self.leaderboard.get_file_name(test_category).replace('.json', '_score.json') + file_path = self.model_handler.model_dir / file_name + file_path.write_text(json.dumps(result, indent=2)) + print(f'Saved {test_category.value} evaluation result at "{file_path}".') \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/metrics.py b/berkeley-function-call-leaderboard/bfcl/evaluator/metrics.py new file mode 100644 index 000000000..d021260c9 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/metrics.py @@ -0,0 +1,75 @@ +from typing import Dict, List + +import numpy as np + +from bfcl.evaluator import constants + + +class LeaderboardModelMetrics: + def __init__(self, model_name: str) -> None: + self.model_name = model_name + self._init_metrics() + + def _init_metrics(self) -> None: + self._metrics = dict( + cost=dict(input_tokens=[], output_tokens=[]), + latency=[], + ) + + def reset(self) -> None: + self._init_metrics() + + def compute(self) -> Dict: + cost = mean_latency = std_latency = p95_latency = 'N/A' + if ( + self.model_name in constants.INPUT_PRICE_PER_MILLION_TOKEN + and len(self._metrics['cost']['input_tokens']) > 0 + and len(self._metrics['cost']['output_tokens']) > 0 + ): + mean_input_tokens = np.mean(self._metrics['cost']['input_tokens']) + mean_output_tokens = np.mean(self._metrics['cost']['output_tokens']) + cost = ( + mean_input_tokens * constants.INPUT_PRICE_PER_MILLION_TOKEN[self.model_name] + + mean_output_tokens * constants.OUTPUT_PRICE_PER_MILLION_TOKEN[self.model_name] + ) / 1000 + + if self.model_name in constants.OSS_LATENCY: + mean_latency = round(constants.OSS_LATENCY[self.model_name] / 1700, 2) + cost = mean_latency * 1000 * constants.V100_x8_PRICE_PER_HOUR / 3600 + elif len(self._metrics['latency']) != 0: + mean_latency = 
np.mean(self._metrics['latency']) + std_latency = np.std(self._metrics['latency']) + p95_latency = np.percentile(self._metrics['latency'], 95) + mean_latency = round(mean_latency, 2) + std_latency = round(std_latency, 2) + p95_latency = round(p95_latency, 2) + + if self.model_name not in constants.INPUT_PRICE_PER_MILLION_TOKEN: + cost = sum(self._metrics['latency']) * constants.V100_x8_PRICE_PER_HOUR / 3600 + cost = round(cost, 2) + + if self.model_name in constants.NO_COST_MODELS: + cost = 'N/A' + elif isinstance(cost, float): + cost = round(cost, 2) + + computed_metrics = dict( + cost=cost, + mean_latency=mean_latency, + std_latency=std_latency, + p95_latency=p95_latency + ) + return computed_metrics + + def __call__(self, model_responses: List[Dict]) -> None: + for response in model_responses: + if (latency := response.get('latency')): + self._metrics['latency'].append(latency) + if latency > 60: + print("*" * 100) + print(f"❗️Warning: Latency for a model '{self.model_name}' response is {latency:.4f}.") + print("*" * 100) + if (input_tokens := response.get('input_tokens')) and input_tokens != 0: + self._metrics['cost']['input_tokens'].append(input_tokens) + if (output_tokens := response.get('output_tokens')) and output_tokens != 0: + self._metrics['cost']['output_tokens'].append(output_tokens) diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py new file mode 100644 index 000000000..a54333492 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py @@ -0,0 +1,22 @@ +def is_empty_output(decoded_output) -> bool: + # This function is a patch to the ast decoder for relevance detection. + # Sometimes the ast decoder will parse successfully, but the input doesn't + # really have a function call. + # [], [{}], and anything that is not in function calling format is considered + # empty (and thus should be marked as correct). 
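+    # Any other shape (e.g. a non-empty list of call dicts) falls through and
+    # returns None, which callers treat as falsy, i.e. "not empty".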
+ if ( + not is_function_calling_format_output(decoded_output) + or len(decoded_output) == 0 + or (len(decoded_output) == 1 and len(decoded_output[0]) == 0) + ): + return True + +@staticmethod +def is_function_calling_format_output(decoded_output): + # Ensure the output is a list of dictionaries + if isinstance(decoded_output, list): + for item in decoded_output: + if not isinstance(item, dict): + return False + return True + return False diff --git a/berkeley-function-call-leaderboard/bfcl/types.py b/berkeley-function-call-leaderboard/bfcl/types.py index 406ed90cb..65572e07f 100644 --- a/berkeley-function-call-leaderboard/bfcl/types.py +++ b/berkeley-function-call-leaderboard/bfcl/types.py @@ -67,7 +67,7 @@ class LeaderboardVersion(str, Enum): V1 = 'v1' -class LeaderboardCategories(BaseModel): +class Leaderboard(BaseModel): test_group: LeaderboardCategoryGroup | None = None test_categories: List[LeaderboardCategory] | None = None # type: ignore version: LeaderboardVersion = LeaderboardVersion.V1 diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index f5bfa8720..bde5f28ca 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ include = ["bfcl*"] [project.scripts] -bfcl_benchmark = "bfcl.benchmark:main" +bfcl = "bfcl.cli:main" [project.urls] Repository = "https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard" From 1605012db497c65615aab7ec9979aed01ed71505 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Thu, 11 Jul 2024 12:39:36 -0400 Subject: [PATCH 22/35] Rename `benchmark` to `llm_generation` --- .../bfcl/{benchmark.py => llm_generation.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename berkeley-function-call-leaderboard/bfcl/{benchmark.py => llm_generation.py} (98%) diff --git a/berkeley-function-call-leaderboard/bfcl/benchmark.py b/berkeley-function-call-leaderboard/bfcl/llm_generation.py similarity index 98% rename from berkeley-function-call-leaderboard/bfcl/benchmark.py rename to berkeley-function-call-leaderboard/bfcl/llm_generation.py index 3d28ba961..68d355f72 100644 --- a/berkeley-function-call-leaderboard/bfcl/benchmark.py +++ b/berkeley-function-call-leaderboard/bfcl/llm_generation.py @@ -7,7 +7,7 @@ from bfcl.model_handler.base import ModelStyle, BaseHandler -def benchmark( +def collect_model_responses( leaderboard: Leaderboard, model_handler: BaseHandler, args: argparse.Namespace From 90a6bde3eb4c653427158e98f5b51969a33569dd Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Thu, 11 Jul 2024 12:40:10 -0400 Subject: [PATCH 23/35] Rename `evaluate` to `evaluation` --- .../bfcl/{evaluate.py => evaluation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename berkeley-function-call-leaderboard/bfcl/{evaluate.py => evaluation.py} (100%) diff --git a/berkeley-function-call-leaderboard/bfcl/evaluate.py b/berkeley-function-call-leaderboard/bfcl/evaluation.py similarity index 100% rename from berkeley-function-call-leaderboard/bfcl/evaluate.py rename to berkeley-function-call-leaderboard/bfcl/evaluation.py From fb0a599aeca56d8b2eeec8a485aaf533026ca7d3 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Thu, 11 Jul 2024 12:41:10 -0400 Subject: [PATCH 24/35] Update sub-commands --- berkeley-function-call-leaderboard/bfcl/cli.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/cli.py 
b/berkeley-function-call-leaderboard/bfcl/cli.py index dd30aa8a6..5dca55708 100644 --- a/berkeley-function-call-leaderboard/bfcl/cli.py +++ b/berkeley-function-call-leaderboard/bfcl/cli.py @@ -3,8 +3,8 @@ from dotenv import load_dotenv -from bfcl.benchmark import benchmark -from bfcl.evaluate import evaluate +from bfcl.evaluation import evaluate +from bfcl.llm_generation import collect_model_responses from bfcl.model_handler.base import BaseHandler from bfcl.types import (LeaderboardCategory, Leaderboard, LeaderboardVersion, ModelType, LeaderboardCategoryGroup) @@ -17,8 +17,8 @@ def main(): leaderboard = _load_leaderboard(args) model_handler = _load_model_handler(args) - if args.command == 'benchmark': - benchmark(leaderboard, model_handler, args) + if args.command == 'llm_generation': + collect_model_responses(leaderboard, model_handler, args) else: evaluate(leaderboard, model_handler, args) @@ -71,17 +71,17 @@ def _get_args() -> argparse.Namespace: help="Leaderboard version. (default: 'v1')", ) - _add_benchmark_args(subparsers, common_parser) + _add_llm_generation_args(subparsers, common_parser) _add_evaluation_args(subparsers, common_parser) args = parser.parse_args() return args -def _add_benchmark_args(subparsers, common_parser): - """Add benchmark-specific arguments.""" +def _add_llm_generation_args(subparsers, common_parser): + """Add LLM generation specific arguments.""" - benchmark_parser = subparsers.add_parser('benchmark', parents=[common_parser], help='Run benchmark') + benchmark_parser = subparsers.add_parser('llm_generation', parents=[common_parser], help='Collect LLM responses') benchmark_parser.add_argument('--temperature', type=float, default=0.7, help='Temperature (default: 0.7)') benchmark_parser.add_argument('--top-p', type=float, default=1, help='Top-p (default: 1)') benchmark_parser.add_argument('--max-tokens', type=int, default=1000, help='Max tokens (default: 1000)') @@ -92,7 +92,7 @@ def _add_benchmark_args(subparsers, common_parser): def _add_evaluation_args(subparsers, common_parser): """Add evaluation-specific arguments.""" - evaluator_parser = subparsers.add_parser('evaluate', parents=[common_parser], help='Run evaluation') + evaluator_parser = subparsers.add_parser('evaluation', parents=[common_parser], help='Run evaluation') evaluator_parser.add_argument( '--perform-api-sanity-check', action='store_true', From a42fd29d3c5d1aa3f8bbde9f42964314f7503b75 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Thu, 11 Jul 2024 21:35:55 -0400 Subject: [PATCH 25/35] Add evaluation for executable group --- .../bfcl/evaluation.py | 6 +- .../bfcl/evaluator/checker.py | 443 +++++++++ .../bfcl/evaluator/evaluator.py | 166 +++- .../bfcl/evaluator/exceptions.py | 10 + .../bfcl/evaluator/exec_python_functions.py | 879 ++++++++++++++++++ .../bfcl/evaluator/utils.py | 46 +- 6 files changed, 1534 insertions(+), 16 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/exceptions.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py diff --git a/berkeley-function-call-leaderboard/bfcl/evaluation.py b/berkeley-function-call-leaderboard/bfcl/evaluation.py index a32c29ba0..e43dfda04 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluation.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluation.py @@ -14,7 +14,11 @@ def evaluate( ) -> None: print('🦍 Model:', args.model) - evaluator = 
LeaderboardEvaluator(model_handler=model_handler, leaderboard=leaderboard) + evaluator = LeaderboardEvaluator( + model_handler=model_handler, + leaderboard=leaderboard, + perform_api_sanity_check=args.perform_api_sanity_check + ) file_name_to_test_category = {} for test_category in leaderboard.test_categories: if test_category in (LeaderboardCategory.SQL, LeaderboardCategory.CHATABLE): diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker.py new file mode 100644 index 000000000..2061fbe16 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker.py @@ -0,0 +1,443 @@ +import os +import time +import json +from pathlib import Path +from typing import Dict, List + +from tqdm import tqdm + +from bfcl.types import LeaderboardExecutableCategory +from bfcl.evaluator.utils import display_api_status_error +from bfcl.evaluator.exceptions import BadAPIStatusError, NoAPIKeyError + + +class ExecutableChecker: + REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 + + def __init__(self, cache_dir: str) -> None: + self.cache_dir = cache_dir + self.data_dir = Path(__file__, '..', '..', '..').resolve() / 'data' + self.rest_api_ground_truth_file_path = self.data_dir / 'api_status_check_ground_truth_REST.jsonl' + self.rest_eval_response_v5_file_path = self.data_dir / 'rest-eval-response_v5.jsonl' + with open(self.rest_eval_response_v5_file_path, 'r') as file: + self.rest_eval_response_data = [json.loads(line) for line in file] + self.executable_ground_truth_file_path = self.data_dir / 'api_status_check_ground_truth_executable.jsonl' + + def perform_api_sanity_checks(self) -> None: + print("---- Sanity checking API status ----") + try: + self.rest_api_status_sanity_check() + except BadAPIStatusError as e: + API_STATUS_ERROR_REST = e + try: + self.executable_api_status_sanity_check() + except BadAPIStatusError as e: + API_STATUS_ERROR_EXECUTABLE = e + display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=True) + + def rest_api_status_sanity_check(self) -> None: + # Use the ground truth data to make sure the API is working correctly + ground_truth_replaced = self._get_updated_rest_ground_truth_data() + correct_count = 0 + errors = [] + for idx, data in tqdm( + enumerate(ground_truth_replaced), + total=len(ground_truth_replaced), + desc="API Status Test (REST)", + ): + status = self.rest_executable_checker(data["ground_truth"], self.rest_eval_response_data[idx]) + if status["valid"]: + correct_count += 1 + else: + errors.append((data, status)) + + if correct_count != len(ground_truth_replaced): + raise BadAPIStatusError(errors, f"{len(ground_truth_replaced) - correct_count} / {len(ground_truth_replaced)}") + + def executable_api_status_sanity_check(self) -> None: + with open(self.executable_ground_truth_file_path, 'r') as file: + ground_truth = [json.loads(line) for line in file] + correct_count = 0 + errors = [] + for data in tqdm(ground_truth, total=len(ground_truth), desc="API Status Test (Non-REST)"): + status = self._simple_executable_checker( + data["ground_truth"][0], + data["execution_result"][0], + data["execution_result_type"][0], + True, + ) + if status["valid"]: + correct_count += 1 + else: + errors.append((data, status)) + + if correct_count != len(ground_truth): + raise BadAPIStatusError(errors, f"{len(ground_truth) - correct_count} / {len(ground_truth)}") + + def executable_checker( + self, + decoded_result: List, + func_description: Dict, + test_category: 
LeaderboardExecutableCategory + ): + if 'multiple' in test_category.value or 'parallel' in test_category.value: + return self._parallel_no_order_executable_checker( + decoded_result, + func_description["execution_result"], + func_description["execution_result_type"], + ) + + else: + if len(decoded_result) != 1: + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "simple_exec_checker:wrong_count", + } + return self._simple_executable_checker( + decoded_result[0], + func_description["execution_result"][0], + func_description["execution_result_type"][0], + False, + ) + + def _get_updated_rest_ground_truth_data(self) -> List[Dict]: + output_file_path = self.cache_dir / self.rest_api_ground_truth_file_path.name + # Avoid loading the output file from the cache, since the api keys might change + + placeholders = {} + env_vars = ('GEOCODE_API_KEY', 'RAPID_API_KEY', 'OMDB_API_KEY', 'EXCHANGERATE_API_KEY') + for var in env_vars: + assert (api_key := os.getenv(var)), f'Please provide your {var} in the `.env` file.' + placeholders['YOUR-' + var.replace('_', '-')] = api_key + print("All API keys are present.") + + def replace_placeholders(data): + if isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, (dict, list)): + replace_placeholders(value) + elif isinstance(value, str): + for placeholder, actual_value in placeholders.items(): + if placeholder in value: # Check if placeholder is in the string + data[key] = value.replace(placeholder, actual_value) + elif isinstance(data, list): + for idx, item in enumerate(data): + if isinstance(item, (dict, list)): + replace_placeholders(item) + elif isinstance(item, str): + for placeholder, actual_value in placeholders.items(): + if placeholder in item: # Check if placeholder is in the string + data[idx] = item.replace(placeholder, actual_value) + return data + + modified_data = [] + with open(self.rest_api_ground_truth_file_path, 'r') as file: + for line in file: + try: + data = replace_placeholders(json.loads(line)) + modified_data.append(data) + except json.JSONDecodeError: + # Handle the case where a line is not a valid JSON object + print('Invalid JSON line!') + + with open(output_file_path, 'w') as f: + for modified_line in modified_data: + f.write(json.dumps(modified_line) + '\n') + print(f'Saved REST API ground truth file with replaced placeholders at {output_file_path} 🦍.') + + return modified_data + + def rest_executable_checker(self, func_call, eval_ground_truth): + if "https://geocode.maps.co" in func_call: + time.sleep(2) + if "requests_get" in func_call: + func_call = func_call.replace("requests_get", "requests.get") + try: + response = eval(func_call) + except Exception as e: + return { + "valid": False, + "error": [f"Execution failed. {str(e)}"], + "error_type": "executable_checker_rest:execution_error", + } + try: + if response.status_code != 200: + return { + "valid": False, + "error": [ + f"Execution result status code is not 200, got {response.status_code}" + ], + "error_type": "executable_checker_rest:wrong_status_code", + } + except Exception as e: + return { + "valid": False, + "error": [f"Cannot get status code of the response. 
Error: {str(e)}"], + "error_type": "executable_checker_rest:cannot_get_status_code", + } + try: + if isinstance(eval_ground_truth, dict): + if isinstance(response.json(), dict): + if set(eval_ground_truth.keys()) == set(response.json().keys()): + return {"valid": True, "error": [], "error_type": ""} + return { + "valid": False, + "error": ["Key inconsistency"], + "error_type": "executable_checker_rest:wrong_key", + } + return { + "valid": False, + "error": [ + f"Expected dictionary, but got {type(response.json())}" + ], + "error_type": "executable_checker_rest:wrong_type", + } + + elif isinstance(eval_ground_truth, list): + if isinstance(response.json(), list): + if len(eval_ground_truth) != len(response.json()): + return { + "valid": False, + "error": [f"Response list length inconsistency."], + "error_type": "value_error:exec_result_rest_count", + } + + else: + for i in range(len(eval_ground_truth)): + if set(eval_ground_truth[i].keys()) != set( + response.json()[i].keys() + ): + return { + "valid": False, + "error": [f"Key inconsistency"], + "error_type": "executable_checker_rest:wrong_key", + } + + return {"valid": True, "error": []} + else: + return { + "valid": False, + "error": [ + f"Expected list, but got {type(response.json())}" + ], + "error_type": "executable_checker_rest:wrong_type", + } + return { + "valid": False, + "error": [ + f"Expected dict or list, but got {type(response.json())}" + ], + "error_type": "executable_checker_rest:wrong_type", + } + except Exception as e: + return { + "valid": False, + "error": [ + f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}" + ], + "error_type": "executable_checker_rest:response_format_error", + } + + def _simple_executable_checker( + self, + function_call: str, + expected_result, + expected_result_type: str, + is_sanity_check=False, + ): + result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + exec_dict = {} + + try: + exec( + "from bfcl.evaluator.exec_python_functions import *" + "\nresult=" + function_call, + exec_dict, + ) + exec_output = exec_dict["result"] + except NoAPIKeyError as e: + raise e + except Exception as e: + result["valid"] = False + result["error"].append( + f"Error in execution: {repr(function_call)}. Error: {str(e)}" + ) + result["error_type"] = "executable_checker:execution_error" + return result + + # We need to special handle the case where the execution result is a tuple and convert it to a list + # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json + if isinstance(exec_output, tuple): + exec_output = list(exec_output) + + if expected_result_type == "exact_match": + if exec_output != expected_result: + result["valid"] = False + result["error"].append( + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}." 
+ ) + result["error_type"] = "executable_checker:wrong_result" + result["model_executed_output"] = exec_output + return result + + elif expected_result_type == "real_time_match": + # Allow for 5% difference + if (type(expected_result) == float or type(expected_result) == int) and ( + type(exec_output) == float or type(exec_output) == int + ): + if not ( + expected_result * (1 - ExecutableChecker.REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + <= exec_output + <= expected_result * (1 + ExecutableChecker.REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + ): + result["valid"] = False + result["error"].append( + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, " + f"but got: {exec_output}. {ExecutableChecker.REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed." + ) + result["error_type"] = "executable_checker:wrong_result_real_time" + result["model_executed_output"] = exec_output + return result + else: + result["valid"] = False + result["error"].append( + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, " + f"but got: {exec_output}. Type needs to be float or int for real time match criteria." + ) + result["error_type"] = "executable_checker:wrong_result_real_time" + result["model_executed_output"] = exec_output + return result + + else: + # Structural match + pattern_match_result = self._pattern_matcher(exec_output, expected_result, function_call, is_sanity_check) + if not pattern_match_result["valid"]: + return pattern_match_result + + return result + + def _parallel_no_order_executable_checker( + self, + decoded_result: List, + expected_exec_result: List, + expected_exec_result_type: List + ): + if len(decoded_result) != len(expected_exec_result): + return { + "valid": False, + "error": [ + f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}." + ], + "error_type": "value_error:exec_result_count", + } + + matched_indices = [] + for i in range(len(expected_exec_result)): + all_errors = [] + for index in range(len(decoded_result)): + if index in matched_indices: + continue + + result = self._simple_executable_checker( + decoded_result[index], + expected_exec_result[i], + expected_exec_result_type[i], + False, + ) + + if result["valid"]: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result["error"], + "sub_error_type": result["error_type"], + "model_executed_output": ( + result["model_executed_output"] + if "model_executed_output" in result + else None + ), + } + } + ) + + if not result["valid"]: + considered_indices = [ + i for i in range(len(decoded_result)) if i not in matched_indices + ] + all_errors.insert( + 0, + f"Could not find a matching function among index {considered_indices} of model " \ + "output for index {i} of possible answers.", + ) + return { + "valid": False, + "error": all_errors, + "error_type": "executable_checker:cannot_find_match", + } + + return {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + @staticmethod + def _pattern_matcher(exec_output, expected_result, function_call, is_sanity_check): + result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + if type(exec_output) != type(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}." 
+ ], + "error_type": "executable_checker:wrong_result_type", + "model_executed_output": exec_output, + } + if type(exec_output) == dict: + # We loose the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one. + # This happens when the key is a timestamp or a random number. + if is_sanity_check: + if len(exec_output) != len(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type:dict_length", + "model_executed_output": exec_output, + } + else: + return result + + for key, value in expected_result.items(): + if key not in exec_output: + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output." + ], + "error_type": "executable_checker:wrong_result_type:dict_key_not_found", + "model_executed_output": exec_output, + } + for key, value in exec_output.items(): + if key not in expected_result: + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output." + ], + "error_type": "executable_checker:wrong_result_type:dict_extra_key", + "model_executed_output": exec_output, + } + if type(exec_output) == list: + if len(exec_output) != len(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." 
+ ], + "error_type": "executable_checker:wrong_result_type:list_length", + "model_executed_output": exec_output, + } + return result \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py index d5255098e..c3306799f 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py @@ -2,11 +2,13 @@ from pathlib import Path from typing import List, Dict, Any +from tqdm import tqdm from pydantic import BaseModel +import bfcl.types as types from bfcl.model_handler.base import BaseHandler -from bfcl.types import Leaderboard, LeaderboardCategory from bfcl.evaluator.metrics import LeaderboardModelMetrics +from bfcl.evaluator.checker import ExecutableChecker from bfcl.evaluator import utils as evaluator_utils @@ -19,12 +21,25 @@ class FailedResult(BaseModel): llm_response: str decoded_result: Any + class Config: + extra = 'allow' + class LeaderboardEvaluator: - def __init__(self, model_handler: BaseHandler, leaderboard: Leaderboard) -> None: + def __init__( + self, + model_handler: BaseHandler, + leaderboard: types.Leaderboard, + perform_api_sanity_check: bool + ) -> None: self.model_name = model_handler.model_name self.model_handler = model_handler self.leaderboard = leaderboard + self.test_category_to_data = leaderboard.load_test_data() + + self._checker = ExecutableChecker(leaderboard.cache_dir) + if perform_api_sanity_check: + self._checker.perform_api_sanity_checks() self._model_metrics = LeaderboardModelMetrics(self.model_name) self._test_category_to_metrics = {} @@ -34,9 +49,9 @@ def __call__(self, file_path: Path, test_category) -> None: print(f'Skipping evaluation of test category "{test_category.value}" due to empty model responses!') return - if test_category == LeaderboardCategory.JAVA: + if test_category == types.LeaderboardCategory.JAVA: language = 'java' - elif test_category == LeaderboardCategory.JAVASCRIPT: + elif test_category == types.LeaderboardCategory.JAVASCRIPT: language = 'javascript' else: language = 'python' @@ -44,16 +59,19 @@ def __call__(self, file_path: Path, test_category) -> None: print('🔍 Running test:', test_category.value) self._model_metrics(model_responses) - accuracy = None - if test_category == LeaderboardCategory.RELEVANCE: + result = None + if test_category == types.LeaderboardCategory.RELEVANCE: result = self.run_relevance_evaluator(model_responses) + elif test_category.value in types.LeaderboardExecutableCategory: + result = self.run_executable_evaluator(test_category, model_responses) + + if result: accuracy = result['accuracy'] - - self._test_category_to_metrics[test_category] = dict( - accuracy=accuracy, - total_count=result['total_count'] - ) - print(f"✅ Test completed: {test_category.value} | 🎯 Accuracy: {accuracy:.4f}") + self._test_category_to_metrics[test_category] = dict( + accuracy=accuracy, + total_count=result['total_count'] + ) + print(f"✅ Test completed: {test_category.value} | 🎯 Accuracy: {accuracy:.4f}") def get_leaderboard_metrics(self) -> Dict: model_metrics = self._model_metrics.compute() @@ -96,7 +114,7 @@ def run_relevance_evaluator(self, model_responses: List[Dict]) -> Dict: else: result = FailedResult( example_id=response['id'], - test_category=LeaderboardCategory.RELEVANCE.value, + test_category=types.LeaderboardCategory.RELEVANCE.value, is_valid=False, error_type='relevance_error:decoder_success', error_message='Valid syntax. 
Successfully decode AST when it should not.', @@ -111,7 +129,127 @@ def run_relevance_evaluator(self, model_responses: List[Dict]) -> Dict: total_count=len(model_responses), failed_model_responses=failed_model_responses, ) - self._save_scores(LeaderboardCategory.RELEVANCE, result) + self._save_scores(types.LeaderboardCategory.RELEVANCE, result) + return result + + def run_executable_evaluator( + self, + test_category: types.LeaderboardCategory, + model_responses: List[Dict] + ) -> Dict: + """Run executable function/API evaluator. + + Invoke function or API for the documentation provided. The accuracy + is measured by actually running the function call with function + source code loaded.""" + + test_data = self.test_category_to_data[test_category] + assert len(model_responses) == len(test_data) + + if test_category != types.LeaderboardExecutableCategory.REST: + print(f"---- Getting real-time execution result from ground truth for '{test_category.value}' ----") + exec_dict = {} + for item in tqdm(test_data, desc="Getting Executable Expected Output"): + if item.get('execution_result'): + # Execution result have already been added to the test dataset + continue + execution_result = [] + ground_truth = item["ground_truth"] + for i in range(len(ground_truth)): + exec( + "from bfcl.evaluator.exec_python_functions import *" + + "\nresult=" + + ground_truth[i], + exec_dict, + ) + execution_result.append(exec_dict["result"]) + item["execution_result"] = execution_result + print(f"---- Ground truth real-time execution result obtained for '{test_category.value}' 🌟 ----") + + failed_model_responses = [] + correct_count = 0 + for idx, response in enumerate(model_responses): + model_response = response['response'] + try: + decoded_result = self.model_handler.decode_execute(model_response) + except Exception as e: + result = FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_type='executable_decoder:decoder_failed', + error_message=f"Failed to decode executable. {str(e)}", + llm_response=model_response, + decoded_result=decoded_result, + ) + failed_model_responses.append(result) + continue + + if test_category == types.LeaderboardExecutableCategory.REST: + # REST is always single-functioned. Therefore we take the first one and pass + # it to the REST checker. + if not evaluator_utils.is_rest_format_output(decoded_result): + result = FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_type='executable_decoder:rest_wrong_output_format', + error_message=( + 'Did not output in the specified format. Note: the model_result is wrapped in a ' + 'string to ensure json serializability.' + ), + llm_response=str(model_response), + decoded_result=str(decoded_result), + ) + failed_model_responses.append(result) + continue + + checker_result = self._checker.rest_executable_checker( + decoded_result[0], + eval_ground_truth=self._checker.rest_eval_response_data[idx] + ) + else: + if not evaluator_utils.is_executable_format_output(decoded_result): + result = FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_type='executable_decoder:wrong_output_format', + error_message=( + 'Did not output in the specified format. Note: the model_result is wrapped in a ' + 'string to ensure json serializability.' 
+ ), + llm_response=str(model_response), + decoded_result=str(decoded_result), + ) + failed_model_responses.append(result) + continue + + checker_result = self._checker.executable_checker(decoded_result, test_data[idx], test_category) + + if checker_result["valid"]: + correct_count += 1 + else: + result = FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=checker_result['valid'], + error_type=checker_result['error_type'], + error_message=checker_result['error'], + llm_response=model_response, + decoded_result=decoded_result, + ) + if "model_executed_output" in checker_result: + result.model_executed_output = checker_result["model_executed_output"] + failed_model_responses.append(result) + + result = dict( + accuracy=correct_count / len(model_responses), + correct_count=correct_count, + total_count=len(model_responses), + failed_model_responses=failed_model_responses, + ) + self._save_scores(test_category, result) return result def _save_scores(self, test_category, result) -> None: diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/exceptions.py b/berkeley-function-call-leaderboard/bfcl/evaluator/exceptions.py new file mode 100644 index 000000000..3504862d8 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/exceptions.py @@ -0,0 +1,10 @@ +class NoAPIKeyError(Exception): + def __init__(self): + self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." + super().__init__(self.message) + + +class BadAPIStatusError(Exception): + def __init__(self, errors, error_rate): + self.errors = errors + self.error_rate = error_rate \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py b/berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py new file mode 100644 index 000000000..90fa0faef --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py @@ -0,0 +1,879 @@ +import os +import math +import time + +import requests + +# Make sure the env variables are populated +env_vars = ('GEOCODE_API_KEY', 'RAPID_API_KEY', 'OMDB_API_KEY', 'EXCHANGERATE_API_KEY') +for var in env_vars: + assert (api_key := os.getenv(var)), f'Please provide your {var} in the `.env` file.' + + +def calculate_triangle_area(base, height): + """ + Calculates the area of a triangle. + Args: + base (integer): The base of the triangle. + height (integer): The height of the triangle. + """ + return base * height / 2 + + +def get_distance(pointA, pointB): + """ + Calculates the distance between two 2D points. + Args: + pointA (tuple): The first point. + pointB (tuple): The second point. + """ + return ((pointA[0] - pointB[0]) ** 2 + (pointA[1] - pointB[1]) ** 2) ** 0.5 + + +def math_factorial(n): + """ + Calculates the factorial of a number. + Args: + n (integer): The number to calculate the factorial of. + """ + result = 1 + for i in range(1, n + 1): + result *= i + return result + + +def quadratic_roots(a, b, c): + """ + Calculates the roots of a quadratic equation. + Args: + a (integer): The first coefficient. + b (integer): The second coefficient. + c (integer): The third coefficient. + Returns: + A list of roots, where each root is either a float or a dictionary + with 'real' and 'imaginary' parts for complex roots. 
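    Example (editor's illustrative sketch, not part of the original patch; values
    worked out by hand from the quadratic formula):
        >>> quadratic_roots(1, -3, 2)   # discriminant 1 > 0, two real roots
        [2.0, 1.0]
        >>> quadratic_roots(1, 2, 5)    # discriminant -16 < 0, complex roots
        [{'real': -1.0, 'imaginary': 2.0}, {'real': -1.0, 'imaginary': -2.0}]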
+ """ + discriminant = b**2 - 4 * a * c + if discriminant >= 0: + root1 = (-b + discriminant**0.5) / (2 * a) + root2 = (-b - discriminant**0.5) / (2 * a) + roots = [root1, root2] + else: + real_part = -b / (2 * a) + imaginary_part = (abs(discriminant) ** 0.5) / (2 * a) + roots = [ + {"real": real_part, "imaginary": imaginary_part}, + {"real": real_part, "imaginary": -imaginary_part}, + ] + + return roots + + +def geometry_area_circle(radius): + """ + Calculates the area of a circle. + Args: + radius (integer): The radius of the circle. + """ + return math.pi * radius**2 + + +def get_prime_factors(number): + """ + Calculates the prime factors of a number. + Args: + number (integer): The number to calculate the prime factors of. + """ + factors = [] + divisor = 2 + while number > 1: + while number % divisor == 0: + factors.append(divisor) + number /= divisor + divisor += 1 + return factors + + +def math_gcd(a, b): + """ + Calculates the greatest common divisor of two numbers. + Args: + a (integer): The first number. This should be the larger number. + b (integer): The second number. + """ + if b == 0: + return a + else: + return math_gcd(b, a % b) + + +def math_lcm(a, b): + """ + Calculates the least common multiple of two numbers. + Args: + a (integer): The first number. This should be the larger number. + b (integer): The second number. + """ + return a * b / math_gcd(a, b) + + +def calculate_final_velocity(initial_velocity, acceleration, time): + """ + Calculates the final velocity of an object. + Args: + initial_velocity (integer): The initial velocity of the object. + acceleration (integer): The acceleration of the object. + time (integer): The time the object has been moving. + """ + return initial_velocity + acceleration * time + + +def calculate_displacement(initial_velocity, acceleration, time): + """ + Calculates the displacement of an object. + Args: + initial_velocity (integer): The initial velocity of the object. + acceleration (integer): The acceleration of the object. + time (integer): The time the object has been moving. + """ + return initial_velocity * time + 0.5 * acceleration * time**2 + + +def calculate_electrostatic_potential_energy(charge, voltage): + """ + Calculates the electrostatic potential energy. + Args: + charge (integer): The charge of the object. + voltage (integer): The voltage of the object. + """ + return charge * voltage + + +def calculate_density(mass, volume): + """ + Calculates the density of an object. + Args: + mass (integer): The mass of the object. + volume (integer): The volume of the object. + """ + return mass / volume + + +def mat_mul(matA, matB): + """ + Multiplies two matrices. + Args: + matA (list): The first matrix. + matB (list): The second matrix. + """ + result = [[0 for i in range(len(matB[0]))] for j in range(len(matA))] + for i in range(len(matA)): + for j in range(len(matB[0])): + for k in range(len(matB)): + result[i][j] += matA[i][k] * matB[k][j] + return result + + +def calculate_mean(numbers): + """ + Calculates the mean of a list of numbers. + Args: + numbers (list): The list of numbers. + """ + return sum(numbers) / len(numbers) + + +def calculate_standard_deviation(numbers): + """ + Calculates the standard deviation of a list of numbers. + Args: + numbers (list): The list of numbers. 
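    Example (editor's illustrative note, not part of the original patch): this is the
    population standard deviation, i.e. the variance divides by len(numbers) rather
    than len(numbers) - 1.
        >>> calculate_standard_deviation([2, 4, 4, 4, 5, 5, 7, 9])
        2.0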
+ """ + mean = calculate_mean(numbers) + variance = sum((number - mean) ** 2 for number in numbers) / len(numbers) + return variance**0.5 + + +def calc_binomial_probability(n, k, p): + """ + Calculates the probability of getting k successes in n trials. + Args: + n (integer): The number of trials. + k (integer): The number of successes. + p (integer): The probability of success. + """ + return ( + math_factorial(n) + / (math_factorial(k) * math_factorial(n - k)) + * (p**k * (1 - p) ** (n - k)) + ) + + +def calculate_permutations(n, k): + """ + Calculates the number of permutations of k elements from a set of n elements. + Args: + n (integer): The number of elements in the set. + k (integer): The number of elements to choose. + """ + return math_factorial(n) / math_factorial(n - k) + + +def get_fibonacci_sequence(n): + """ + Calculates the n numbers of the Fibonacci. + Args: + n (integer): The number of Fibonacci numbers to calculate. + """ + sequence = [0, 1] + for i in range(2, n): + sequence.append(sequence[i - 1] + sequence[i - 2]) + return sequence + + +def estimate_derivative(function, x): + """ + Estimate the derivative of a function at a given point. + Args: + function (function): The function to calculate the derivative of. + x (integer): The point to calculate the derivative at. + """ + func = eval(function) + h = 0.0000000001 + return (func(x + h) - func(x)) / h + + +def calculate_cosine_similarity(vectorA, vectorB): + """ + Calculates the cosine similarity of two vectors. + Args: + vectorA (list): The first vector. + vectorB (list): The second vector. + """ + dot_product = sum(vectorA[i] * vectorB[i] for i in range(len(vectorA))) + magnitudeA = (sum(vectorA[i] ** 2 for i in range(len(vectorA)))) ** 0.5 + magnitudeB = (sum(vectorB[i] ** 2 for i in range(len(vectorB)))) ** 0.5 + return dot_product / (magnitudeA * magnitudeB) + + +def mortgage_calculator(loan_amount, interest_rate, loan_period): + """ + Calculates the monthly mortgage payment. + Args: + loan_amount (integer): The amount of the loan. + interest_rate (integer): The interest rate of the loan. + loan_period (integer): The period of the loan. + """ + monthly_interest_rate = interest_rate / 12 + number_of_payments = loan_period * 12 + monthly_payment = ( + loan_amount + * monthly_interest_rate + * (1 + monthly_interest_rate) ** number_of_payments + / ((1 + monthly_interest_rate) ** number_of_payments - 1) + ) + return monthly_payment + + +def calculate_future_value(present_value, interest_rate, periods): + """ + Calculates the future value of an investment. + Args: + present_value (integer): The present value of the investment. + interest_rate (integer): The interest rate of the investment. + periods (integer): The number of periods. + """ + return present_value * (1 + interest_rate) ** periods + + +def sort_array(array, reverse=False): + """ + Sorts an array of numbers. + Args: + array (list): The array of numbers. + reverse (optional bool): Whether to sort the array in reverse order, i.e., descending order. + """ + return sorted(array, reverse=reverse) + + +def get_weather_data(coordinates): + """ + Fetches weather data from the Open-Meteo API for the given latitude and longitude. + + Args: + coordinates (tuple): The latitude of the location. 
+ + Returns: + float: The current temperature in the coordinates you've asked for + """ + lat, long = coordinates + url = "https://api.open-meteo.com/v1/forecast" + params = { + "latitude": lat, + "longitude": long, + "current": "temperature_2m", + "temperature_unit": "fahrenheit", + } + + response = requests.get(url, params=params) + if response.status_code == 200: + return response.json()["current"]["temperature_2m"] + else: + return "Failed to fetch data with status code: {}".format(response.status_code) + + +def get_coordinates_from_city(city_name): + """ + Fetches the latitude and longitude of a given city name using the Maps.co Geocoding API. + + Args: + city_name (str): The name of the city. + + Returns: + tuple: The latitude and longitude of the city. + """ + time.sleep(2) # To avoid rate limiting + url = "https://geocode.maps.co/search" + params = {"q": city_name, "api_key": os.getenv("GEOCODE_API_KEY")} + + response = requests.get(url, params=params) + if response.status_code == 200: + data = response.json() + if data: + return data[0]["lat"], data[0]["lon"] + else: + return "No data found for the given city name." + else: + return "Failed to fetch data with status code: {}".format(response.status_code) + + +def convert_currency(amount, from_currency, to_currency): + """ + Converts a given amount from one currency to another using the ExchangeRate-API. + + Args: + amount (float): The amount of money to convert. + from_currency (str): The ISO currency code for the base currency. + to_currency (str): The ISO currency code for the target currency. + + Returns: + float: The converted amount in the target currency. + """ + key = os.getenv("EXCHANGERATE_API_KEY") + base_url = f"https://v6.exchangerate-api.com/v6/{key}/latest/{from_currency}" + response = requests.get(base_url) + + if response.status_code == 200: + data = response.json() + rates = data.get("conversion_rates", {}) + if to_currency in rates: + converted_amount = amount * rates[to_currency] + return converted_amount + else: + return "Target currency code not found." + else: + return "Failed to fetch data with status code: {}".format(response.status_code) + + +def find_term_on_urban_dictionary(term): + """ + Finds the definition of a term on Urban Dictionary. + Args: + term (str): The term to find the definition of. + """ + url = "https://mashape-community-urban-dictionary.p.rapidapi.com/define" + + querystring = {"term": term} + + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "mashape-community-urban-dictionary.p.rapidapi.com", + } + + response = requests.get(url, headers=headers, params=querystring) + + return response.json()["list"][0]["definition"] + + +def get_coordinate_by_ip_address(ip_address): + """ + Finds the latitude and longitude of an IP address. + Args: + ip_address (str): The IP address to find the location of. + """ + url = f"http://ip-api.com/json/{ip_address}" + response = requests.get(url) + try: + return (response.json()["lat"], response.json()["lon"]) + except: + return response.json()["message"] + + +def get_zipcode_by_ip_address(ip_address): + """ + Finds the zipcode of an IP address. + Args: + ip_address (str): The IP address to find the location of. + """ + url = f"http://ip-api.com/json/{ip_address}" + response = requests.get(url) + try: + return response.json()["zip"] + except: + return response.json()["message"] + + +def get_covid_death_by_country(country): + """ + Finds the most up to date total deaths of a country result from COVID. 
+ Args: + country (str): The country to find the total deaths of, in the format of the country's full name. + """ + url = "https://covid-193.p.rapidapi.com/statistics" + + querystring = {"country": country} + + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "covid-193.p.rapidapi.com", + } + + response = requests.get(url, headers=headers, params=querystring) + try: + return response.json()["response"][0]["deaths"]["total"] + except: + return response.json() + + +def get_active_covid_case_by_country(country): + """ + Finds the most up to date active cases of a country result from COVID. + Args: + country (str): The country to find the active cases of. + """ + url = "https://covid-193.p.rapidapi.com/statistics" + + querystring = {"country": country} + + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "covid-193.p.rapidapi.com", + } + + response = requests.get(url, headers=headers, params=querystring) + try: + return response.json()["response"][0]["cases"]["active"] + except: + return response.json() + + +def get_rating_by_amazon_ASIN(ASIN): + url = "https://real-time-amazon-data.p.rapidapi.com/product-details" + querystring = {"asin": ASIN, "country": "US"} + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "real-time-amazon-data.p.rapidapi.com", + } + + retries = 0 + max_retries = 5 + while retries < max_retries: + response = requests.get(url, headers=headers, params=querystring) + try: + return response.json()["data"]["product_star_rating"] + except KeyError: + wait_time = 2**retries # Exponential backoff: 1, 2, 4 seconds + time.sleep(wait_time) + retries += 1 + + return None + + +def get_price_by_amazon_ASIN(ASIN): + url = "https://real-time-amazon-data.p.rapidapi.com/product-details" + querystring = {"asin": ASIN, "country": "US"} + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "real-time-amazon-data.p.rapidapi.com", + } + + retries = 0 + max_retries = 5 + while retries < max_retries: + response = requests.get(url, headers=headers, params=querystring) + try: + return response.json()["data"]["product_price"] + except KeyError: + wait_time = 2**retries # Exponential backoff: 1, 2, 4 seconds + time.sleep(wait_time) + retries += 1 + + return None + + +def get_product_name_by_amazon_ASIN(ASIN): + url = "https://real-time-amazon-data.p.rapidapi.com/product-details" + querystring = {"asin": ASIN, "country": "US"} + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "real-time-amazon-data.p.rapidapi.com", + } + + retries = 0 + max_retries = 5 + while retries < max_retries: + response = requests.get(url, headers=headers, params=querystring) + try: + return response.json()["data"]["product_title"] + except KeyError: + wait_time = 2**retries # Exponential backoff: 1, 2, 4 seconds + time.sleep(wait_time) + retries += 1 + + return None + + +def get_company_name_by_stock_name(stock_name): + """ + Finds the company name of a stock by its stock name. + Args: + stock_name (str): The stock name of the product. 
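    Example (editor's illustrative note, not part of the original patch): this hits the
    live yahoo-finance15 RapidAPI endpoint, so it needs RAPID_API_KEY set and the exact
    return value is not guaranteed.
        get_company_name_by_stock_name("AAPL")   # expected to resolve to Apple Inc.'s listing name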
+ """ + url = "https://yahoo-finance15.p.rapidapi.com/api/v1/markets/search" + + querystring = {"search": stock_name} + + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "yahoo-finance15.p.rapidapi.com", + } + + response = requests.get(url, headers=headers, params=querystring) + try: + return response.json()["body"][0]["name"] + except: + return response.json() + + +def get_stock_price_by_stock_name(stock_name): + """ + Finds the price of a stock by its stock name. + Args: + stock_name (str): The stock name of the product. + """ + url = "https://yahoo-finance15.p.rapidapi.com/api/v1/markets/stock/quotes" + + querystring = {"ticker": stock_name} + + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "yahoo-finance15.p.rapidapi.com", + } + + response = requests.get(url, headers=headers, params=querystring) + try: + return float(response.json()["body"][0]["regularMarketPrice"]) + except: + return response.json() + + +def get_stock_history(stock_name, interval, diffandsplits="true"): + """ + Finds the price of a stock by its stock name. + Args: + stock_name (str): The stock name of the product. + interval (str): The interval of the stock history. Allows one of following : 5m|15m|30m|1h|1d|1wk|1mo|3mo + diffandsplits (optional str): The diff and splits of the stock history. Allows one of following : 'true'|'false' + """ + url = "https://yahoo-finance15.p.rapidapi.com/api/v1/markets/stock/history" + + querystring = { + "symbol": stock_name, + "interval": interval, + "diffandsplits": diffandsplits, + } + + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "yahoo-finance15.p.rapidapi.com", + } + + response = requests.get(url, headers=headers, params=querystring) + try: + data = response.json()["body"] + return {key: data[key] for key in list(data)[-10:]} + except: + return response.json() + + +def retrieve_city_based_on_zipcode(zipcode): + """ + Finds the city of a zipcode. + Args: + zipcode (str): The zipcode of the city. + """ + url = f"http://ziptasticapi.com/{zipcode}" + response = requests.get(url) + try: + return response.json()["city"] + except: + return response.json() + + +def retrieve_holiday_by_year(country, year): + """ + Finds the holidays of a year. + Args: + year (str): The year of the holidays. + country (str): The country of the holidays. Possible options: US, AT, DE, ES, FR, GB, IT, NL, PL, RO, SK, UA. + """ + url = f"https://date.nager.at/api/v3/publicholidays/{year}/{country}" + response = requests.get(url) + return response.json() + + +def get_time_zone_by_coord(long, lat): + """ + Finds the timezone of a coordinate. + Args: + long (str): The longitude of the coordinate. + lat (str): The latitude of the coordinate. + """ + url = "https://timezone-by-location.p.rapidapi.com/timezone" + + querystring = {"lat": lat, "lon": long, "c": "1", "s": "0"} + + headers = { + "X-RapidAPI-Key": os.getenv('RAPID_API_KEY'), + "X-RapidAPI-Host": "timezone-by-location.p.rapidapi.com", + } + + response = requests.get(url, headers=headers, params=querystring) + try: + return response.json()["Zones"][0]["TimezoneId"] + except: + return response.json() + + +def linear_regression(x, y, point): + """ + Finds the linear regression of a set of points. + Args: + x (list): The x coordinates of the points. + y (list): The y coordinates of the points. + point (int): The point to calculate the linear regression at. 
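    Example (editor's illustrative sketch, not part of the original patch; computed by
    hand with ordinary least squares):
        >>> linear_regression([1, 2, 3], [2, 4, 6], 4)   # perfect fit y = 2x
        8.0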
+ """ + n = len(x) + sum_x = sum(x) + sum_y = sum(y) + sum_x_squared = sum(x_i**2 for x_i in x) + sum_xy = sum(x[i] * y[i] for i in range(n)) + slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x_squared - sum_x**2) + intercept = (sum_y - slope * sum_x) / n + return slope * point + intercept + + +def add_binary_numbers(a, b): + """ + Adds two binary numbers. + Args: + a (str): The first binary number. + b (str): The second binary number. + """ + return bin(int(a, 2) + int(b, 2))[2:] + + +def maxPoints(points) -> int: + """ + Finds the maximum number of points on a line. + Args: + points (list): The list of points. points are 2 element lists. + """ + counter = 1 + if len(points) < 2: + return 1 + for i in range(len(points)): + lst = {} + for j in range(i + 1, len(points)): + y = points[j][1] - points[i][1] + x = points[j][0] - points[i][0] + if x != 0: + lst[y / x] = 1 + lst.get(y / x, 0) + else: + lst["inf"] = 1 + lst.get("inf", 0) + for key, value in lst.items(): + counter = max(counter, value) + return counter + 1 + + +def calculate_investment_value( + initial_investment, + annual_contribution, + years, + annual_return, + inflation_rate, + adjust_for_inflation=True, +): + """ + Calculates the value of an investment over time. + Args: + initial_investment (integer): The initial investment amount. + annual_contribution (integer): The annual contribution amount. + years (integer): The number of years to calculate the investment value for. + annual_return (float): The annual return rate, ranging from 0 to 1. + inflation_rate (list): The inflation rate for each year in percentage, ranging from 0 to 1. + adjust_for_inflation (optional bool): Whether to adjust the investment value for inflation. + """ + current_value = initial_investment + real_value = initial_investment # Adjusted for inflation + + for year in range(1, years + 1): + # Apply annual return + current_value = current_value * (1 + annual_return) + annual_contribution + + # Adjust for inflation if requested + if adjust_for_inflation: + inflation_adjustment = ( + 1 - inflation_rate[year - 1] + if year <= len(inflation_rate) + else 1 - inflation_rate[-1] + ) + real_value = ( + real_value * (1 + annual_return - inflation_rate[year - 1]) + + annual_contribution * inflation_adjustment + ) + else: + real_value = current_value + + final_value = real_value if adjust_for_inflation else current_value + return final_value + + +def calculate_nutritional_needs(weight, height, age, gender, activity_level, goal): + """ + Calculates the nutritional needs of a person based on their weight, height + Args: + weight (integer): The weight of the person. + height (integer): The height of the person. + age (integer): The age of the person + gender (str): The gender of the person. Possible options [male,female,other] + activity_level (integer): The activity level of the person. Possible options [1,2,3,4,5] + goal (str): The goal of the person. 
Possible options [lose,gain,maintain] + """ + if gender == "male": + bmr = 88.362 + (13.397 * weight) + (4.799 * height) - (5.677 * age) + else: + bmr = 447.593 + (9.247 * weight) + (3.098 * height) - (4.330 * age) + + # Total Daily Energy Expenditure (TDEE) Calculation + activity_multipliers = [1.2, 1.375, 1.55, 1.725, 1.9] + tdee = bmr * activity_multipliers[activity_level - 1] + + # Adjust TDEE based on goal + if goal == "lose": + tdee -= 500 # Creating a deficit to lose weight + elif goal == "gain": + tdee += 500 # Creating a surplus to gain weight + + # Macronutrient Distribution + proteins = (tdee * 0.30) / 4 # 30% of calories from protein, 4 calories per gram + fats = (tdee * 0.25) / 9 # 25% of calories from fat, 9 calories per gram + carbohydrates = (tdee * 0.45) / 4 # 45% of calories from carbs, 4 calories per gram + + return { + "calories": tdee, + "proteins_g": proteins, + "fats_g": fats, + "carbohydrates_g": carbohydrates, + } + + +def book_room( + room_type, price, check_in_date, check_out_date, customer_id, discount_code=None +): + """ + Books a room for a customer. + Args: + room_type (dict): The room type to book. + check_in_date (str): The check-in date. + check_out_date (str): The check-out date. + customer_id (str): The customer ID. + discount_code (str): The discount code (if any). + """ + # Assume the first available room is booked (for simplicity) + booked_room = room_type + + # Calculate price and apply discount if applicable + if discount_code and discount_code == "DISCOUNT10": + price *= 0.9 # Apply 10% discount + + booking_details = { + "customer_id": customer_id, + "room_number": room_type, + "check_in_date": check_in_date, + "check_out_date": check_out_date, + "total_price": price, + } + + return booking_details + + +def order_food(item, quantity, price): + """ + Orders food for a customer. + Args: + item (list): The item to order. + quantity (list): The quantity of the item. + price (list): The price of the item. + """ + # Calculate total price + total_price = sum([quantity[i] * price[i] for i in range(len(item))]) + return total_price + + +def get_movie_rating(movie_name): + """ + Fetches the age rating of a movie from the OMDB API. + Args: + movie_name (str): The name of the movie. + """ + url = "http://www.omdbapi.com/" + params = {"t": movie_name, "apikey": os.getenv('OMDB_API_KEY')} + response = requests.get(url, params=params) + return response.json()["Rated"] + + +def get_movie_director(movie_name): + """ + Fetches the director of a movie from the OMDB API. + Args: + movie_name (str): The name of the movie. + """ + url = "http://www.omdbapi.com/" + params = {"t": movie_name, "apikey": os.getenv('OMDB_API_KEY')} + response = requests.get(url, params=params) + return response.json()["Director"] + + +def polygon_area(vertices): + """ + Calculate the area of a polygon given its vertices using the shoelace formula. + Args: + vertices (list): The vertices of the polygon. Vertices are 2 element lists. 
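    Example (editor's illustrative sketch, not part of the original patch; the shoelace
    formula on a unit square). Note that the implementation appends the first vertex
    back onto the caller's list as a side effect.
        >>> polygon_area([[0, 0], [1, 0], [1, 1], [0, 1]])
        1.0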
+ """ + n = len(vertices) + if n < 3: + raise ValueError("A polygon must have at least 3 vertices.") + + # Append the first vertex to the end to complete the loop + vertices.append(vertices[0]) + + # Apply the shoelace formula + area = 0 + for i in range(n): + area += (vertices[i][0] * vertices[i + 1][1]) - ( + vertices[i + 1][0] * vertices[i][1] + ) + + area = abs(area) / 2.0 + return area diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py index a54333492..07acae22c 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py @@ -11,7 +11,6 @@ def is_empty_output(decoded_output) -> bool: ): return True -@staticmethod def is_function_calling_format_output(decoded_output): # Ensure the output is a list of dictionaries if isinstance(decoded_output, list): @@ -20,3 +19,48 @@ def is_function_calling_format_output(decoded_output): return False return True return False + +def display_api_status_error(rest_error, executable_error, display_success=False): + if not rest_error and not executable_error: + if display_success: + print("🟢 All API Status Test Passed!") + return None + + RED_FONT = "\033[91m" + RESET = "\033[0m" + + print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n") + + if rest_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test category (REST). Please contact API provider.\n") + print(f"{rest_error.error_rate} APIs affected:\n") + for data, status in rest_error.errors: + print(f" - Test Case: {data['ground_truth']}") + print(f" Error Type: {status['error_type']}\n") + + if executable_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test categories (Non-REST). 
Please contact API provider.\n") + print(f"{executable_error.error_rate} APIs affected:\n") + for data, status in executable_error.errors: + print(f" - Test Case: {data['ground_truth'][0]}") + print(f" Error Type: {status['error_type']}\n") + + print(f"{RED_FONT}{'-' * 100}\n{RESET}") + +def is_rest_format_output(decoded_output): + # Ensure the output is a list of one string + if type(decoded_output) == list: + if len(decoded_output) == 1 and type(decoded_output[0]) == str: + return True + return False + +def is_executable_format_output(decoded_output): + # Ensure the output is a list of strings (one or more strings) + if type(decoded_output) == list: + if len(decoded_output) == 0: + return False + for item in decoded_output: + if type(item) != str: + return False + return True + return False \ No newline at end of file From fa2694a9d4ef45e73d4d1f7483def48a4d6086b0 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Fri, 12 Jul 2024 15:20:44 -0400 Subject: [PATCH 26/35] Standardize checker result --- .../apply_function_credential_config.py | 77 --- .../bfcl/evaluator/checker.py | 495 +++++++++--------- .../bfcl/evaluator/evaluator.py | 61 ++- .../bfcl/evaluator/exec_python_functions.py | 3 + .../bfcl/evaluator/utils.py | 4 +- .../bfcl/types.py | 31 +- 6 files changed, 317 insertions(+), 354 deletions(-) delete mode 100644 berkeley-function-call-leaderboard/apply_function_credential_config.py diff --git a/berkeley-function-call-leaderboard/apply_function_credential_config.py b/berkeley-function-call-leaderboard/apply_function_credential_config.py deleted file mode 100644 index 7b6124896..000000000 --- a/berkeley-function-call-leaderboard/apply_function_credential_config.py +++ /dev/null @@ -1,77 +0,0 @@ -import json -import argparse - - -parser = argparse.ArgumentParser(description="Replace placeholders in the function credential config file.") -parser.add_argument("--input-file", help="Path to the function credential config file.", required=True) -parser.add_argument("--output-file", help="Path to the output file.", default="") -args = parser.parse_args() - -# Load the configuration with actual API keys -with open("function_credential_config.json") as f: - function_credential_config = json.load(f) - -PLACEHOLDERS = { - "YOUR-GEOCODE-API-KEY": function_credential_config[3]["GEOCODE-API-KEY"], - "YOUR-RAPID-API-KEY": function_credential_config[0]["RAPID-API-KEY"], - "YOUR-OMDB-API-KEY": function_credential_config[2]["OMDB-API-KEY"], - "YOUR-EXCHANGERATE-API-KEY": function_credential_config[1]["EXCHANGERATE-API-KEY"] -} - - -def replace_placeholders(data): - """ - Recursively replace placeholders in a nested dictionary or list using string.replace. 
- """ - if isinstance(data, dict): - for key, value in data.items(): - if isinstance(value, (dict, list)): - replace_placeholders(value) - elif isinstance(value, str): - for placeholder, actual_value in PLACEHOLDERS.items(): - if placeholder in value: # Check if placeholder is in the string - data[key] = value.replace(placeholder, actual_value) - elif isinstance(data, list): - for idx, item in enumerate(data): - if isinstance(item, (dict, list)): - replace_placeholders(item) - elif isinstance(item, str): - for placeholder, actual_value in PLACEHOLDERS.items(): - if placeholder in item: # Check if placeholder is in the string - data[idx] = item.replace(placeholder, actual_value) - return data - -def main(): - # Verify all values are provided - for key, value in PLACEHOLDERS.items(): - if value == "": - print(f"Please provide a value for the placeholder {key}.") - return - print("All API keys are present.") - - modified_data = [] - with open(f"{args.input_file}", 'r') as f: - lines = f.readlines() - for line in lines: - try: - data = json.loads(line) # Parse each line as a JSON object - data = replace_placeholders(data) # Replace placeholders - modified_data.append(json.dumps(data)) # Convert back to string and store - except json.JSONDecodeError: - # Handle the case where a line is not a valid JSON object - print("Invalid JSON line skipped.") - continue - - if args.output_file == "": - with open(f"{args.input_file}", 'w') as f: - for modified_line in modified_data: - f.write(modified_line + '\n') # Write each modified JSON object back to the input file - print(f"All placeholders have been replaced in {args.input_file} 🦍.") - else: - with open(f"{args.output_file}", 'w') as f: - for modified_line in modified_data: - f.write(modified_line + '\n') # Write each modified JSON object overwrite the output file - print(f"All placeholders have been replaced in {args.output_file} 🦍.") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker.py index 2061fbe16..3ff3de85b 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/checker.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker.py @@ -5,12 +5,22 @@ from typing import Dict, List from tqdm import tqdm +from pydantic import BaseModel from bfcl.types import LeaderboardExecutableCategory from bfcl.evaluator.utils import display_api_status_error from bfcl.evaluator.exceptions import BadAPIStatusError, NoAPIKeyError +class CheckerResult(BaseModel): + is_valid: bool + error_type: str + error_message: str + + class Config: + extra = 'allow' + + class ExecutableChecker: REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 @@ -18,22 +28,26 @@ def __init__(self, cache_dir: str) -> None: self.cache_dir = cache_dir self.data_dir = Path(__file__, '..', '..', '..').resolve() / 'data' self.rest_api_ground_truth_file_path = self.data_dir / 'api_status_check_ground_truth_REST.jsonl' + self.executable_ground_truth_file_path = self.data_dir / 'api_status_check_ground_truth_executable.jsonl' + self.rest_eval_response_v5_file_path = self.data_dir / 'rest-eval-response_v5.jsonl' with open(self.rest_eval_response_v5_file_path, 'r') as file: self.rest_eval_response_data = [json.loads(line) for line in file] - self.executable_ground_truth_file_path = self.data_dir / 'api_status_check_ground_truth_executable.jsonl' + + self._cached_exec_api_ground_truth_results = {} def perform_api_sanity_checks(self) -> None: 
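        # Editor's illustrative sketch (not part of the patch): with this change every
        # checker returns a CheckerResult instead of an ad-hoc dict. Because the model
        # declares `extra = 'allow'`, optional fields such as `model_executed_output`
        # can still be attached on demand. The concrete values below are invented for
        # illustration only.
        #
        #     ok = CheckerResult(is_valid=True, error_type="", error_message="")
        #     bad = CheckerResult(
        #         is_valid=False,
        #         error_type="executable_checker:wrong_result",
        #         error_message="Expected 42, but got 41.",
        #         model_executed_output=41,   # extra field, allowed by `extra = 'allow'`
        #     )
        #     assert bad.model_dump()["model_executed_output"] == 41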
print("---- Sanity checking API status ----") + rest_api_error = executable_api_error = None try: self.rest_api_status_sanity_check() except BadAPIStatusError as e: - API_STATUS_ERROR_REST = e + rest_api_error = e try: self.executable_api_status_sanity_check() except BadAPIStatusError as e: - API_STATUS_ERROR_EXECUTABLE = e - display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=True) + executable_api_error = e + display_api_status_error(rest_api_error, executable_api_error, display_success=True) def rest_api_status_sanity_check(self) -> None: # Use the ground truth data to make sure the API is working correctly @@ -45,11 +59,11 @@ def rest_api_status_sanity_check(self) -> None: total=len(ground_truth_replaced), desc="API Status Test (REST)", ): - status = self.rest_executable_checker(data["ground_truth"], self.rest_eval_response_data[idx]) - if status["valid"]: + result = self.rest_executable_checker(data["ground_truth"], self.rest_eval_response_data[idx]) + if result.is_valid: correct_count += 1 else: - errors.append((data, status)) + errors.append((data, result.model_dump())) if correct_count != len(ground_truth_replaced): raise BadAPIStatusError(errors, f"{len(ground_truth_replaced) - correct_count} / {len(ground_truth_replaced)}") @@ -57,19 +71,36 @@ def rest_api_status_sanity_check(self) -> None: def executable_api_status_sanity_check(self) -> None: with open(self.executable_ground_truth_file_path, 'r') as file: ground_truth = [json.loads(line) for line in file] + + output_file_path = self.cache_dir / self.executable_ground_truth_file_path.name + if output_file_path.exists(): + with open(output_file_path, 'r') as file: + for line in file: + content = json.loads(line) + self._cached_exec_api_ground_truth_results[content['idx']] = content + correct_count = 0 errors = [] for data in tqdm(ground_truth, total=len(ground_truth), desc="API Status Test (Non-REST)"): - status = self._simple_executable_checker( + idx = data['idx'] + if idx not in self._cached_exec_api_ground_truth_results: + self._cached_exec_api_ground_truth_results[idx] = data + result = self._simple_executable_checker( data["ground_truth"][0], data["execution_result"][0], data["execution_result_type"][0], True, + idx=idx ) - if status["valid"]: + if result.is_valid: correct_count += 1 else: - errors.append((data, status)) + errors.append((data, result.model_dump())) + + # Save/update cache + with open(output_file_path, 'w') as file: + for _, v in sorted(self._cached_exec_api_ground_truth_results.items(), key=lambda x: x[0]): + file.write(json.dumps(v) + '\n') if correct_count != len(ground_truth): raise BadAPIStatusError(errors, f"{len(ground_truth) - correct_count} / {len(ground_truth)}") @@ -79,7 +110,7 @@ def executable_checker( decoded_result: List, func_description: Dict, test_category: LeaderboardExecutableCategory - ): + ) -> CheckerResult: if 'multiple' in test_category.value or 'parallel' in test_category.value: return self._parallel_no_order_executable_checker( decoded_result, @@ -89,11 +120,11 @@ def executable_checker( else: if len(decoded_result) != 1: - return { - "valid": False, - "error": ["Wrong number of functions."], - "error_type": "simple_exec_checker:wrong_count", - } + return CheckerResult( + is_valid=False, + error_type="simple_exec_checker:wrong_count", + error_message="Wrong number of functions." 
+ ) return self._simple_executable_checker( decoded_result[0], func_description["execution_result"][0], @@ -103,141 +134,131 @@ def executable_checker( def _get_updated_rest_ground_truth_data(self) -> List[Dict]: output_file_path = self.cache_dir / self.rest_api_ground_truth_file_path.name - # Avoid loading the output file from the cache, since the api keys might change - - placeholders = {} - env_vars = ('GEOCODE_API_KEY', 'RAPID_API_KEY', 'OMDB_API_KEY', 'EXCHANGERATE_API_KEY') - for var in env_vars: - assert (api_key := os.getenv(var)), f'Please provide your {var} in the `.env` file.' - placeholders['YOUR-' + var.replace('_', '-')] = api_key - print("All API keys are present.") - - def replace_placeholders(data): - if isinstance(data, dict): - for key, value in data.items(): - if isinstance(value, (dict, list)): - replace_placeholders(value) - elif isinstance(value, str): - for placeholder, actual_value in placeholders.items(): - if placeholder in value: # Check if placeholder is in the string - data[key] = value.replace(placeholder, actual_value) - elif isinstance(data, list): - for idx, item in enumerate(data): - if isinstance(item, (dict, list)): - replace_placeholders(item) - elif isinstance(item, str): - for placeholder, actual_value in placeholders.items(): - if placeholder in item: # Check if placeholder is in the string - data[idx] = item.replace(placeholder, actual_value) - return data - - modified_data = [] - with open(self.rest_api_ground_truth_file_path, 'r') as file: - for line in file: - try: - data = replace_placeholders(json.loads(line)) - modified_data.append(data) - except json.JSONDecodeError: - # Handle the case where a line is not a valid JSON object - print('Invalid JSON line!') - - with open(output_file_path, 'w') as f: - for modified_line in modified_data: - f.write(json.dumps(modified_line) + '\n') - print(f'Saved REST API ground truth file with replaced placeholders at {output_file_path} 🦍.') + if output_file_path.exists(): + with open(output_file_path, 'r') as file: + modified_data = [json.loads(line) for line in file] + print(f'Loaded cached REST API ground truth file with replaced placeholders from "{output_file_path}" 🦍.') + else: + placeholders = {} + env_vars = ('GEOCODE_API_KEY', 'RAPID_API_KEY', 'OMDB_API_KEY', 'EXCHANGERATE_API_KEY') + for var in env_vars: + assert (api_key := os.getenv(var)), f'Please provide your {var} in the `.env` file.' 
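                # Editor's note: this builds the same placeholder -> API-key mapping that the
                # (now deleted) apply_function_credential_config.py script used, e.g. the value
                # of the RAPID_API_KEY environment variable fills every "YOUR-RAPID-API-KEY"
                # placeholder in the REST ground-truth file.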
+ placeholders['YOUR-' + var.replace('_', '-')] = api_key + print("All API keys are present.") + + def replace_placeholders(data): + if isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, (dict, list)): + replace_placeholders(value) + elif isinstance(value, str): + for placeholder, actual_value in placeholders.items(): + if placeholder in value: # Check if placeholder is in the string + data[key] = value.replace(placeholder, actual_value) + elif isinstance(data, list): + for idx, item in enumerate(data): + if isinstance(item, (dict, list)): + replace_placeholders(item) + elif isinstance(item, str): + for placeholder, actual_value in placeholders.items(): + if placeholder in item: # Check if placeholder is in the string + data[idx] = item.replace(placeholder, actual_value) + return data + + modified_data = [] + with open(self.rest_api_ground_truth_file_path, 'r') as file: + for line in file: + try: + data = replace_placeholders(json.loads(line)) + modified_data.append(data) + except json.JSONDecodeError: + # Handle the case where a line is not a valid JSON object + print('Invalid JSON line!') + + with open(output_file_path, 'w') as f: + for modified_line in modified_data: + f.write(json.dumps(modified_line) + '\n') + print(f'Saved REST API ground truth file with replaced placeholders at {output_file_path} 🦍.') return modified_data - def rest_executable_checker(self, func_call, eval_ground_truth): + def rest_executable_checker(self, func_call, eval_ground_truth) -> CheckerResult: if "https://geocode.maps.co" in func_call: time.sleep(2) - if "requests_get" in func_call: - func_call = func_call.replace("requests_get", "requests.get") + func_call = func_call.replace("requests_get", "requests.get") try: - response = eval(func_call) + response = {} + exec("import requests;response=" + func_call, response) + response = response['response'] except Exception as e: - return { - "valid": False, - "error": [f"Execution failed. {str(e)}"], - "error_type": "executable_checker_rest:execution_error", - } + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:execution_error", + error_message=f"Execution failed. {str(e)}" + ) try: if response.status_code != 200: - return { - "valid": False, - "error": [ - f"Execution result status code is not 200, got {response.status_code}" - ], - "error_type": "executable_checker_rest:wrong_status_code", - } + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_status_code", + error_message=f"Execution result status code is not 200, got {response.status_code}", + ) except Exception as e: - return { - "valid": False, - "error": [f"Cannot get status code of the response. Error: {str(e)}"], - "error_type": "executable_checker_rest:cannot_get_status_code", - } + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:cannot_get_status_code", + error_message=f"Cannot get status code of the response. 
Error: {str(e)}", + ) try: if isinstance(eval_ground_truth, dict): if isinstance(response.json(), dict): if set(eval_ground_truth.keys()) == set(response.json().keys()): - return {"valid": True, "error": [], "error_type": ""} - return { - "valid": False, - "error": ["Key inconsistency"], - "error_type": "executable_checker_rest:wrong_key", - } - return { - "valid": False, - "error": [ - f"Expected dictionary, but got {type(response.json())}" - ], - "error_type": "executable_checker_rest:wrong_type", - } - + return CheckerResult(is_valid=True, error_type="", error_message="") + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_key", + error_message="Key inconsistency" + ) + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_type", + error_message=f"Expected dictionary, but got {type(response.json())}" + ) elif isinstance(eval_ground_truth, list): if isinstance(response.json(), list): if len(eval_ground_truth) != len(response.json()): - return { - "valid": False, - "error": [f"Response list length inconsistency."], - "error_type": "value_error:exec_result_rest_count", - } - + return CheckerResult( + is_valid=False, + error_type="value_error:exec_result_rest_count", + error_message="Response list length inconsistency." + ) else: for i in range(len(eval_ground_truth)): - if set(eval_ground_truth[i].keys()) != set( - response.json()[i].keys() - ): - return { - "valid": False, - "error": [f"Key inconsistency"], - "error_type": "executable_checker_rest:wrong_key", - } - - return {"valid": True, "error": []} + if set(eval_ground_truth[i]) != set(response.json()[i]): + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_key", + error_message="Key inconsistency" + ) + + return CheckerResult(is_valid=True, error_type="", error_message="") else: - return { - "valid": False, - "error": [ - f"Expected list, but got {type(response.json())}" - ], - "error_type": "executable_checker_rest:wrong_type", - } - return { - "valid": False, - "error": [ - f"Expected dict or list, but got {type(response.json())}" - ], - "error_type": "executable_checker_rest:wrong_type", - } + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_type", + error_message=f"Expected list, but got {type(response.json())}" + ) + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:wrong_type", + error_message=f"Expected dict or list, but got {type(response.json())}" + ) except Exception as e: - return { - "valid": False, - "error": [ - f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}" - ], - "error_type": "executable_checker_rest:response_format_error", - } + return CheckerResult( + is_valid=False, + error_type="executable_checker_rest:response_format_error", + error_message=f"Error in execution and type checking. Status code: {response.status_code}. 
Error: {str(e)}", + ) def _simple_executable_checker( self, @@ -245,26 +266,32 @@ def _simple_executable_checker( expected_result, expected_result_type: str, is_sanity_check=False, - ): - result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} - - exec_dict = {} - + idx: int | None = None + ) -> CheckerResult: + result = CheckerResult(is_valid=True, error_type="executable_checker:unclear", error_message="") + exec_output = None try: - exec( - "from bfcl.evaluator.exec_python_functions import *" + "\nresult=" + function_call, - exec_dict, - ) - exec_output = exec_dict["result"] + if idx is not None: + exec_output = self._cached_exec_api_ground_truth_results[idx].get('exec_output') + if exec_output is None: + exec_dict = {} + # TODO: Instead of importing all the functions, we can use regex to extract + # the function name from the `function_call` and only import that function. + exec( + "from bfcl.evaluator.exec_python_functions import *" + "\nresult=" + function_call, + exec_dict, + ) + exec_output = exec_dict["result"] + if idx is not None: + self._cached_exec_api_ground_truth_results[idx]['exec_output'] = exec_output except NoAPIKeyError as e: raise e except Exception as e: - result["valid"] = False - result["error"].append( - f"Error in execution: {repr(function_call)}. Error: {str(e)}" + return CheckerResult( + is_valid=False, + error_type="executable_checker:execution_error", + error_message=f"Error in execution: {repr(function_call)}. Error: {str(e)}" ) - result["error_type"] = "executable_checker:execution_error" - return result # We need to special handle the case where the execution result is a tuple and convert it to a list # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json @@ -273,13 +300,12 @@ def _simple_executable_checker( if expected_result_type == "exact_match": if exec_output != expected_result: - result["valid"] = False - result["error"].append( - f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}." + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result", + error_message=f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}.", + model_executed_output=exec_output ) - result["error_type"] = "executable_checker:wrong_result" - result["model_executed_output"] = exec_output - return result elif expected_result_type == "real_time_match": # Allow for 5% difference @@ -291,28 +317,29 @@ def _simple_executable_checker( <= exec_output <= expected_result * (1 + ExecutableChecker.REAL_TIME_MATCH_ALLOWED_DIFFERENCE) ): - result["valid"] = False - result["error"].append( - f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, " - f"but got: {exec_output}. {ExecutableChecker.REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed." + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_real_time", + error_message=( + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, " + f"but got: {exec_output}. {ExecutableChecker.REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed." + ), + model_executed_output=exec_output ) - result["error_type"] = "executable_checker:wrong_result_real_time" - result["model_executed_output"] = exec_output - return result else: - result["valid"] = False - result["error"].append( - f"Wrong execution result for {repr(function_call)}. 
Expected: {expected_result}, " - f"but got: {exec_output}. Type needs to be float or int for real time match criteria." + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_real_time", + error_message=( + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, " + f"but got: {exec_output}. Type needs to be float or int for real time match criteria." + ), + model_executed_output=exec_output ) - result["error_type"] = "executable_checker:wrong_result_real_time" - result["model_executed_output"] = exec_output - return result - else: # Structural match pattern_match_result = self._pattern_matcher(exec_output, expected_result, function_call, is_sanity_check) - if not pattern_match_result["valid"]: + if not pattern_match_result.is_valid: return pattern_match_result return result @@ -322,15 +349,13 @@ def _parallel_no_order_executable_checker( decoded_result: List, expected_exec_result: List, expected_exec_result_type: List - ): + ) -> CheckerResult: if len(decoded_result) != len(expected_exec_result): - return { - "valid": False, - "error": [ - f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}." - ], - "error_type": "value_error:exec_result_count", - } + return CheckerResult( + is_valid=False, + error_type="value_error:exec_result_count", + error_message=f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}." + ) matched_indices = [] for i in range(len(expected_exec_result)): @@ -346,98 +371,82 @@ def _parallel_no_order_executable_checker( False, ) - if result["valid"]: + if result.is_valid: matched_indices.append(index) break else: all_errors.append( { f"Model Result Index {index}": { - "sub_error": result["error"], - "sub_error_type": result["error_type"], + "sub_error": result.error_message, + "sub_error_type": result.error_type, "model_executed_output": ( - result["model_executed_output"] - if "model_executed_output" in result - else None + result.model_executed_output if hasattr(result, "model_executed_output") else None ), } } ) - if not result["valid"]: - considered_indices = [ - i for i in range(len(decoded_result)) if i not in matched_indices - ] - all_errors.insert( - 0, - f"Could not find a matching function among index {considered_indices} of model " \ - "output for index {i} of possible answers.", + if not result.is_valid: + considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices] + error_message = ( + f"Could not find a matching function among index {considered_indices} of model " + f"output for index {i} of possible answers." 
) - return { - "valid": False, - "error": all_errors, - "error_type": "executable_checker:cannot_find_match", - } - - return {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + error_message += "\nErrors:\n" + '\n'.join(map(json.dumps, all_errors)) + return CheckerResult( + is_valid=False, + error_type="executable_checker:cannot_find_match", + error_message=error_message + ) + return CheckerResult(is_valid=True, error_type="executable_checker:unclear", error_message="") @staticmethod - def _pattern_matcher(exec_output, expected_result, function_call, is_sanity_check): - result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} - + def _pattern_matcher(exec_output, expected_result, function_call, is_sanity_check) -> CheckerResult: + result = CheckerResult(is_valid=True, error_type="executable_checker:unclear", error_message="") if type(exec_output) != type(expected_result): - return { - "valid": False, - "error": [ - f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}." - ], - "error_type": "executable_checker:wrong_result_type", - "model_executed_output": exec_output, - } - if type(exec_output) == dict: + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type", + error_message=f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}", + model_executed_output=exec_output + ) + if isinstance(exec_output, dict): # We loose the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one. # This happens when the key is a timestamp or a random number. if is_sanity_check: if len(exec_output) != len(expected_result): - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." - ], - "error_type": "executable_checker:wrong_result_type:dict_length", - "model_executed_output": exec_output, - } + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type:dict_length", + error_message=f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}.", + model_executed_output=exec_output + ) else: return result - for key, value in expected_result.items(): + for key in expected_result: if key not in exec_output: - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output." - ], - "error_type": "executable_checker:wrong_result_type:dict_key_not_found", - "model_executed_output": exec_output, - } - for key, value in exec_output.items(): + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type:dict_key_not_found", + error_message=f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output.", + model_executed_output=exec_output + ) + for key in exec_output: if key not in expected_result: - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output." 
- ], - "error_type": "executable_checker:wrong_result_type:dict_extra_key", - "model_executed_output": exec_output, - } - if type(exec_output) == list: + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type:dict_extra_key", + error_message=f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output.", + model_executed_output=exec_output + ) + if isinstance(exec_output, list): if len(exec_output) != len(expected_result): - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." - ], - "error_type": "executable_checker:wrong_result_type:list_length", - "model_executed_output": exec_output, - } + return CheckerResult( + is_valid=False, + error_type="executable_checker:wrong_result_type:list_length", + error_message=f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}.", + model_executed_output=exec_output + ) return result \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py index c3306799f..af44bc08a 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py @@ -99,7 +99,7 @@ def run_relevance_evaluator(self, model_responses: List[Dict]) -> Dict: failed_model_responses = [] correct_count = 0 - for response in model_responses: + for response in tqdm(model_responses, total=len(model_responses), desc="Evaluating"): model_response = response['response'] success = False decoded_result = None @@ -145,30 +145,39 @@ def run_executable_evaluator( test_data = self.test_category_to_data[test_category] assert len(model_responses) == len(test_data) - + test_example_id_to_data = {} if test_category != types.LeaderboardExecutableCategory.REST: print(f"---- Getting real-time execution result from ground truth for '{test_category.value}' ----") exec_dict = {} for item in tqdm(test_data, desc="Getting Executable Expected Output"): - if item.get('execution_result'): - # Execution result have already been added to the test dataset - continue - execution_result = [] - ground_truth = item["ground_truth"] - for i in range(len(ground_truth)): - exec( - "from bfcl.evaluator.exec_python_functions import *" - + "\nresult=" - + ground_truth[i], - exec_dict, - ) - execution_result.append(exec_dict["result"]) - item["execution_result"] = execution_result + execution_result = item.get('execution_result') + if execution_result is None or not all(execution_result): # Check if cached value is None then try again. + execution_result = [] + ground_truth = item["ground_truth"] + for i in range(len(ground_truth)): + exec( + "from bfcl.evaluator.exec_python_functions import *" + "\nresult=" + ground_truth[i], + exec_dict, + ) + execution_result.append(exec_dict["result"]) + item["execution_result"] = execution_result + test_example_id_to_data[item['id']] = item + + # Save the test dataset with the added `execution_result` key + # TODO: Decide if you want to cache the execution results or not. 
+ # Edge case: We don't validate the `execution_result` value, hence if the user didn't setup the + # environment variables correctly and we get incorrect `execution_result` from the + # `exec_python_functions`, those values will be cached. + file_path = self.leaderboard.test_data_cache_dir / self.leaderboard.get_file_name(test_category) + with open(file_path, 'w') as file: + for line in test_data: + file.write(json.dumps(line) + '\n') + print(f"---- Ground truth real-time execution result obtained for '{test_category.value}' 🌟 ----") failed_model_responses = [] correct_count = 0 - for idx, response in enumerate(model_responses): + for idx, response in tqdm(enumerate(model_responses), total=len(model_responses), desc="Evaluating"): model_response = response['response'] try: decoded_result = self.model_handler.decode_execute(model_response) @@ -225,22 +234,26 @@ def run_executable_evaluator( failed_model_responses.append(result) continue - checker_result = self._checker.executable_checker(decoded_result, test_data[idx], test_category) + checker_result = self._checker.executable_checker( + decoded_result, + test_example_id_to_data[response['id']], + test_category + ) - if checker_result["valid"]: + if checker_result.is_valid: correct_count += 1 else: result = FailedResult( example_id=response['id'], test_category=test_category.value, - is_valid=checker_result['valid'], - error_type=checker_result['error_type'], - error_message=checker_result['error'], + is_valid=checker_result.is_valid, + error_type=checker_result.error_type, + error_message=checker_result.error_message, llm_response=model_response, decoded_result=decoded_result, ) - if "model_executed_output" in checker_result: - result.model_executed_output = checker_result["model_executed_output"] + if hasattr(checker_result, "model_executed_output"): + result.model_executed_output = checker_result.model_executed_output failed_model_responses.append(result) result = dict( diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py b/berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py index 90fa0faef..93ed6d5b6 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py @@ -3,6 +3,9 @@ import time import requests +from dotenv import load_dotenv + +load_dotenv() # Make sure the env variables are populated env_vars = ('GEOCODE_API_KEY', 'RAPID_API_KEY', 'OMDB_API_KEY', 'EXCHANGERATE_API_KEY') diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py index 07acae22c..ff443246d 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py @@ -32,14 +32,14 @@ def display_api_status_error(rest_error, executable_error, display_success=False print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n") if rest_error: - print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test category (REST). Please contact API provider.\n") + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test group (REST). 
Please contact API provider.\n") print(f"{rest_error.error_rate} APIs affected:\n") for data, status in rest_error.errors: print(f" - Test Case: {data['ground_truth']}") print(f" Error Type: {status['error_type']}\n") if executable_error: - print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test categories (Non-REST). Please contact API provider.\n") + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test group (Non-REST). Please contact API provider.\n") print(f"{executable_error.error_rate} APIs affected:\n") for data, status in executable_error.errors: print(f" - Test Case: {data['ground_truth'][0]}") diff --git a/berkeley-function-call-leaderboard/bfcl/types.py b/berkeley-function-call-leaderboard/bfcl/types.py index 65572e07f..691fd87b0 100644 --- a/berkeley-function-call-leaderboard/bfcl/types.py +++ b/berkeley-function-call-leaderboard/bfcl/types.py @@ -87,18 +87,33 @@ def model_post_init(self, __context: Any) -> None: self.test_categories = [cat for cat in CATEGORY_GROUP_MAPPING[self.test_group]] self.cache_dir = Path.cwd() / self.cache_dir + @property + def test_data_cache_dir(self) -> Path: + test_data_dir = self.cache_dir / f'gorilla_openfunctions_{self.version.value}_test_data' + test_data_dir.mkdir(exist_ok=True, parents=True) + return test_data_dir + def load_test_data(self) -> Dict[LeaderboardCategory, List[Dict]]: # type: ignore data = {} - for test_category, file_path in self._get_test_data(): + for test_category, infile_path in self._get_test_data(): data[test_category] = [] - with open(file_path, 'r') as file: - for line in file: - item = json.loads(line) - item['test_category'] = test_category.value - item['id'] = self._generate_hash(json.dumps(item)) - data[test_category].append(item) + # We add `id` and `test_category` to each dataset sample + # Save the dataset in the cache with the updated keys for user reference + outfile_path = self.test_data_cache_dir / self.get_file_name(test_category) + if outfile_path.exists(): + with open(outfile_path, 'r') as file: + for line in file: + data[test_category].append(json.loads(line)) + else: + with open(infile_path, 'r') as infile, open(outfile_path, 'w') as outfile: + for line in infile: + item = json.loads(line) + item['test_category'] = test_category.value + item['id'] = self._generate_hash(json.dumps(item)) + data[test_category].append(item) + outfile.write(json.dumps(item) + '\n') return data - + def get_file_name(self, test_category: LeaderboardCategory) -> str: # type: ignore return f'gorilla_openfunctions_{self.version.value}_test_{test_category.value}.json' From 09384e348f4f29548e4035361f331e4a6b888042 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Fri, 12 Jul 2024 19:01:03 -0400 Subject: [PATCH 27/35] Convert checker from module to directory --- .../bfcl/evaluator/checker/__init__.py | 5 ++++ .../evaluator/checker/executable/__init__.py | 3 ++ .../{ => checker/executable}/exceptions.py | 0 .../executable}/exec_python_functions.py | 0 .../executable/executable.py} | 16 +++------- .../bfcl/evaluator/checker/types.py | 10 +++++++ .../bfcl/evaluator/evaluator.py | 30 +++++++++---------- 7 files changed, 37 insertions(+), 27 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/__init__.py rename berkeley-function-call-leaderboard/bfcl/evaluator/{ => checker/executable}/exceptions.py (100%) rename 
berkeley-function-call-leaderboard/bfcl/evaluator/{ => checker/executable}/exec_python_functions.py (100%) rename berkeley-function-call-leaderboard/bfcl/evaluator/{checker.py => checker/executable/executable.py} (98%) create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker/types.py diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py new file mode 100644 index 000000000..fa6fa1965 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py @@ -0,0 +1,5 @@ +from .executable import ExecutableChecker + +__all__ = [ + 'ExecutableChecker' +] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/__init__.py new file mode 100644 index 000000000..2276fd33e --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/__init__.py @@ -0,0 +1,3 @@ +from .executable import ExecutableChecker + +__all__ = ['ExecutableChecker'] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/exceptions.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exceptions.py similarity index 100% rename from berkeley-function-call-leaderboard/bfcl/evaluator/exceptions.py rename to berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exceptions.py diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exec_python_functions.py similarity index 100% rename from berkeley-function-call-leaderboard/bfcl/evaluator/exec_python_functions.py rename to berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/exec_python_functions.py diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/executable.py similarity index 98% rename from berkeley-function-call-leaderboard/bfcl/evaluator/checker.py rename to berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/executable.py index 3ff3de85b..7a446fa51 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/checker.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/executable/executable.py @@ -5,28 +5,20 @@ from typing import Dict, List from tqdm import tqdm -from pydantic import BaseModel from bfcl.types import LeaderboardExecutableCategory from bfcl.evaluator.utils import display_api_status_error -from bfcl.evaluator.exceptions import BadAPIStatusError, NoAPIKeyError +from bfcl.evaluator.checker.types import CheckerResult +from bfcl.evaluator.checker.executable.exceptions import BadAPIStatusError, NoAPIKeyError -class CheckerResult(BaseModel): - is_valid: bool - error_type: str - error_message: str - - class Config: - extra = 'allow' - class ExecutableChecker: REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 def __init__(self, cache_dir: str) -> None: self.cache_dir = cache_dir - self.data_dir = Path(__file__, '..', '..', '..').resolve() / 'data' + self.data_dir = Path(__file__, '../../../../..').resolve() / 'data' self.rest_api_ground_truth_file_path = self.data_dir / 'api_status_check_ground_truth_REST.jsonl' self.executable_ground_truth_file_path = self.data_dir / 'api_status_check_ground_truth_executable.jsonl' @@ -278,7 +270,7 @@ def _simple_executable_checker( # 
TODO: Instead of importing all the functions, we can use regex to extract # the function name from the `function_call` and only import that function. exec( - "from bfcl.evaluator.exec_python_functions import *" + "\nresult=" + function_call, + "from bfcl.evaluator.checker.executable.exec_python_functions import *" + "\nresult=" + function_call, exec_dict, ) exec_output = exec_dict["result"] diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/types.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/types.py new file mode 100644 index 000000000..b1fb68a88 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/types.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + + +class CheckerResult(BaseModel): + is_valid: bool + error_type: str + error_message: str + + class Config: + extra = 'allow' \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py index af44bc08a..242c10646 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py @@ -8,7 +8,7 @@ import bfcl.types as types from bfcl.model_handler.base import BaseHandler from bfcl.evaluator.metrics import LeaderboardModelMetrics -from bfcl.evaluator.checker import ExecutableChecker +from bfcl.evaluator import checker from bfcl.evaluator import utils as evaluator_utils @@ -35,11 +35,11 @@ def __init__( self.model_name = model_handler.model_name self.model_handler = model_handler self.leaderboard = leaderboard + self.perform_api_sanity_check = perform_api_sanity_check self.test_category_to_data = leaderboard.load_test_data() - self._checker = ExecutableChecker(leaderboard.cache_dir) - if perform_api_sanity_check: - self._checker.perform_api_sanity_checks() + self._executable_checker = None + self._ast_checker = None self._model_metrics = LeaderboardModelMetrics(self.model_name) self._test_category_to_metrics = {} @@ -49,13 +49,6 @@ def __call__(self, file_path: Path, test_category) -> None: print(f'Skipping evaluation of test category "{test_category.value}" due to empty model responses!') return - if test_category == types.LeaderboardCategory.JAVA: - language = 'java' - elif test_category == types.LeaderboardCategory.JAVASCRIPT: - language = 'javascript' - else: - language = 'python' - print('🔍 Running test:', test_category.value) self._model_metrics(model_responses) @@ -63,7 +56,13 @@ def __call__(self, file_path: Path, test_category) -> None: if test_category == types.LeaderboardCategory.RELEVANCE: result = self.run_relevance_evaluator(model_responses) elif test_category.value in types.LeaderboardExecutableCategory: + if self._executable_checker is None: + self._executable_checker = checker.ExecutableChecker(self.leaderboard.cache_dir) + if self.perform_api_sanity_check: + self._executable_checker.perform_api_sanity_checks() result = self.run_executable_evaluator(test_category, model_responses) + elif test_category.value in types.LeaderboardAstCategory: + pass if result: accuracy = result['accuracy'] @@ -156,7 +155,8 @@ def run_executable_evaluator( ground_truth = item["ground_truth"] for i in range(len(ground_truth)): exec( - "from bfcl.evaluator.exec_python_functions import *" + "\nresult=" + ground_truth[i], + "from bfcl.evaluator.checker.executable.exec_python_functions import *" + + "\nresult=" + ground_truth[i], exec_dict, ) execution_result.append(exec_dict["result"]) @@ -213,9 +213,9 
@@ def run_executable_evaluator( failed_model_responses.append(result) continue - checker_result = self._checker.rest_executable_checker( + checker_result = self._executable_checker.rest_executable_checker( decoded_result[0], - eval_ground_truth=self._checker.rest_eval_response_data[idx] + eval_ground_truth=self._executable_checker.rest_eval_response_data[idx] ) else: if not evaluator_utils.is_executable_format_output(decoded_result): @@ -234,7 +234,7 @@ def run_executable_evaluator( failed_model_responses.append(result) continue - checker_result = self._checker.executable_checker( + checker_result = self._executable_checker.executable_checker( decoded_result, test_example_id_to_data[response['id']], test_category From 7bd671ea92b8b9ea56eec953b8b49e2289e84183 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Fri, 12 Jul 2024 21:46:01 -0400 Subject: [PATCH 28/35] Add evaluation for ast group --- .../bfcl/evaluator/checker/__init__.py | 4 +- .../bfcl/evaluator/checker/ast/__init__.py | 3 + .../bfcl/evaluator/checker/ast/ast.py | 551 ++++++++++++++++++ .../checker/ast/type_converter/__init__.py | 4 + .../checker/ast/type_converter/java.py | 408 +++++++++++++ .../checker/ast/type_converter/javascript.py | 294 ++++++++++ .../bfcl/evaluator/checker/ast/utils.py | 31 + .../bfcl/evaluator/evaluator.py | 95 ++- 8 files changed, 1386 insertions(+), 4 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/__init__.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/__init__.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/java.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/javascript.py create mode 100644 berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/utils.py diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py index fa6fa1965..c27828ec8 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/__init__.py @@ -1,5 +1,7 @@ from .executable import ExecutableChecker +from .ast import AstChecker __all__ = [ - 'ExecutableChecker' + 'ExecutableChecker', + 'AstChecker', ] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/__init__.py new file mode 100644 index 000000000..04ec14c4a --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/__init__.py @@ -0,0 +1,3 @@ +from .ast import AstChecker + +__all__ = ['AstChecker'] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py new file mode 100644 index 000000000..23dcdfaa1 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py @@ -0,0 +1,551 @@ +import re +import json +from typing import List, Dict +from pathlib import Path + +from bfcl.evaluator.checker.types import CheckerResult +from bfcl.types import LeaderboardAstCategory, Leaderboard +from bfcl.model_handler import constants +from bfcl.evaluator.checker.ast import type_converter, utils + + +class AstChecker: + 
+    NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"]
+    PYTHON_TYPE_MAPPING = {
+        "string": str,
+        "integer": int,
+        "float": float,
+        "boolean": bool,
+        "array": list,
+        "tuple": list,
+        "dict": dict,
+        "any": str,
+    }
+    # This is the list of types whose values we need to check recursively
+    PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"]
+
+    def __init__(self, model_name: str, leaderboard: Leaderboard) -> None:
+        self.model_name = model_name
+        self.leaderboard = leaderboard
+        self.possible_ans_dir = Path(__file__, '../../../../..').resolve() / 'data/possible_answer'
+        self.test_category_to_possible_ans = {}
+
+    def load_possible_answers(self, test_category: LeaderboardAstCategory) -> None:
+        if test_category not in self.test_category_to_possible_ans:
+            file_name = self.leaderboard.get_file_name(test_category)
+            with open(self.possible_ans_dir / file_name, 'r') as file:
+                self.test_category_to_possible_ans[test_category] = [json.loads(line) for line in file]
+
+    def __call__(
+        self,
+        idx: int,
+        func_description,
+        model_output: List,
+        test_category: LeaderboardAstCategory,
+    ) -> CheckerResult:
+
+        language = self.get_language(test_category)
+        self.load_possible_answers(test_category)
+        possible_answers = self.test_category_to_possible_ans[test_category][idx]
+
+        if 'multiple' in test_category.value or 'parallel' in test_category.value:
+            # Some formatting issues that need to be handled
+            if test_category.value == "parallel_function":
+                func_description = [func_description]
+            return self._parallel_function_no_order_checker(
+                func_description,
+                model_output,
+                possible_answers,
+                language,
+            )
+        else:
+            if len(model_output) != 1:
+                return CheckerResult(
+                    is_valid=False,
+                    error_type="simple_function_checker:wrong_count",
+                    error_message="Wrong number of functions."
+                )
+            model_output = model_output[0]
+            return self._simple_function_checker(
+                func_description,
+                model_output,
+                possible_answers,
+                language,
+            )
+
+    def _parallel_function_no_order_checker(
+        self,
+        func_descriptions: List,
+        model_output: List,
+        possible_answers: Dict,
+        language: str,
+    ) -> CheckerResult:
+
+        if len(model_output) != len(possible_answers):
+            return CheckerResult(
+                is_valid=False,
+                error_type='parallel_function_checker_no_order:wrong_count',
+                error_message='Wrong number of functions.'
+            )
+
+        func_name_list = list(possible_answers.keys())
+        possible_answers_list = [{key: value} for key, value in possible_answers.items()]
+        matched_indices = []
+        # We go through the possible answers one by one and eliminate the model output that matches the possible answer.
+        # It must be this way because we need the ground truth to fetch the correct function description.
+        for i in range(len(possible_answers_list)):
+            func_description = utils.find_description(func_descriptions, func_name_list[i])
+            # This should not happen, as possible_answers is the ground truth and it should have the correct function name.
+            if func_description is None:
+                return CheckerResult(
+                    is_valid=False,
+                    error_type='parallel_function_checker_no_order:cannot_find_description',
+                    error_message=f"Function doc description not found for function name: {repr(func_name_list[i])}."
+ ) + + all_errors = [] + for index in range(len(model_output)): + if index in matched_indices: + continue + + result = self._simple_function_checker( + func_description, + model_output[index], + possible_answers_list[i], + language, + ) + if result.is_valid: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result.error_message, + "sub_error_type": result.error_type, + "model_output_item": model_output[index], + "possible_answer_item": possible_answers_list[i], + } + } + ) + + if not result.is_valid: + considered_indices = [i for i in range(len(model_output)) if i not in matched_indices] + error_message = ( + f"Could not find a matching function among index {considered_indices} of model " + f"output for index {i} of possible answers." + ) + error_message += "\nErrors:\n" + '\n'.join(map(json.dumps, all_errors)) + return CheckerResult( + is_valid=False, + error_type="parallel_function_checker_no_order:cannot_find_match", + error_message=error_message + ) + + return CheckerResult(is_valid=True, error_type='', error_message='') + + def _simple_function_checker( + self, + func_description: dict, + model_output: dict, + possible_answer: dict, + language: str, + ) -> CheckerResult: + + language = language.lower() + possible_answer = list(possible_answer.values())[0] + # Extract function name and parameters details + func_name = func_description["name"] + param_details = func_description["parameters"]["properties"] + required_params = func_description["parameters"]["required"] + + result = CheckerResult(is_valid=True, error_type="simple_function_checker:unclear", error_message="") + func_name = utils.convert_func_name(func_name, self.model_name) + # Check if function name matches + if func_name not in model_output: + return CheckerResult( + is_valid=False, + error_type="simple_function_checker:wrong_func_name", + error_message=f"Function name {repr(func_name)} not found in model output." + ) + + model_params = model_output[func_name] + # Check for required parameters in model output + for param in required_params: + if param not in model_params: + return CheckerResult( + is_valid=False, + error_type="simple_function_checker:missing_required", + error_message=f"Missing required parameter: {repr(param)}." + ) + + # Validate types and values for each parameter in model output + for param, value in model_params.items(): + if param not in param_details or param not in possible_answer: + return CheckerResult( + is_valid=False, + error_type="simple_function_checker:unexpected_param", + error_message=f"Unexpected parameter: {repr(param)}." + ) + + full_param_details = param_details[param] + expected_type_description = full_param_details["type"] # This is a string + is_variable = False + nested_type_converted = None + + if language == "java": + expected_type_converted = constants.JAVA_TYPE_CONVERSION[expected_type_description] + if expected_type_description in constants.JAVA_TYPE_CONVERSION: + if not isinstance(value, str): + return CheckerResult( + is_valid=False, + error_type="type_error:java", + error_message=f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." 
+ ) + + if expected_type_description in self.NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = constants.JAVA_TYPE_CONVERSION[nested_type] + value = type_converter.java.java_type_converter(value, expected_type_description, nested_type) + else: + value = type_converter.java.java_type_converter(value, expected_type_description) + elif language == "javascript": + expected_type_converted = constants.JS_TYPE_CONVERSION[expected_type_description] + if expected_type_description in constants.JS_TYPE_CONVERSION: + if not isinstance(value, str): + return CheckerResult( + is_valid=False, + error_type="type_error:js", + error_message=f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + + if expected_type_description in self.NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = constants.JS_TYPE_CONVERSION[nested_type] + value = type_converter.javascript.js_type_converter(value, expected_type_description, nested_type) + else: + value = type_converter.javascript.js_type_converter(value, expected_type_description) + elif language == "python": + expected_type_converted = self.PYTHON_TYPE_MAPPING[expected_type_description] + if expected_type_description in self.PYTHON_NESTED_TYPE_CHECK_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = self.PYTHON_TYPE_MAPPING[nested_type] + + # We convert all tuple value to list when the expected type is tuple. + # The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load(). + # This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future. + if expected_type_description == "tuple" and type(value) == tuple: + value = list(value) + + # Allow python auto conversion from int to float + if ( + language == "python" + and expected_type_description == "float" + and type(value) == int + ): + value = float(value) + + # Type checking + # In fact, we only check for Python here. + # Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct. + type_check_result = AstChecker.type_checker( + param, + value, + possible_answer[param], + expected_type_description, + expected_type_converted, + nested_type_converted, + ) + if not type_check_result.is_valid: + return type_check_result + is_variable = type_check_result.is_variable + + # It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable. + # We can just treat the variable as a string and use the normal flow. 
+            if not is_variable:
+                # Special handling for dictionaries
+                if expected_type_converted == dict:
+                    result = AstChecker.dict_checker(param, value, possible_answer[param])
+                    if not result.is_valid:
+                        return result
+                    continue
+
+                # Special handling for a list of dictionaries
+                elif expected_type_converted == list and nested_type_converted == dict:
+                    result = AstChecker.list_dict_checker(param, value, possible_answer[param])
+                    if not result.is_valid:
+                        return result
+                    continue
+
+                # Special handling for strings
+                elif expected_type_converted == str:
+                    # We don't check for case sensitivity for strings, as long as it's not a variable
+                    result = AstChecker.string_checker(param, value, possible_answer[param])
+                    if not result.is_valid:
+                        return result
+                    continue
+
+                elif expected_type_converted == list:
+                    result = AstChecker.list_checker(param, value, possible_answer[param])
+                    if not result.is_valid:
+                        return result
+                    continue
+
+            # Check if the value is within the possible answers
+            if value not in possible_answer[param]:
+                result.is_valid = False
+                result.error_message = (
+                    f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}."
+                )
+                result.error_type = "value_error:others"
+                return result
+
+        # Check for optional parameters not provided but allowed
+        for param in possible_answer:
+            if param not in model_params and "" not in possible_answer[param]:
+                result.is_valid = False
+                result.error_message = f"Optional parameter {repr(param)} not provided and not marked as optional."
+                result.error_type = "simple_function_checker:missing_optional"
+                return result
+
+        return result
+
+    @staticmethod
+    def type_checker(
+        param: str,
+        value,
+        possible_answer: List,
+        expected_type_description: str,
+        expected_type_converted,
+        nested_type_converted,
+    ) -> CheckerResult:
+        # NOTE: This type checker only supports nested type checking one level deep.
+        # We didn't implement recursive type checking for nested types, as it's not needed for
+        # the current use case and it's very complex.
+
+        result = CheckerResult(
+            is_valid=True,
+            error_type="type_error:simple",
+            error_message='',
+            is_variable=True
+        )
+        is_variable = False
+        # check for the case where a variable is used instead of an actual value.
+        # use the type in possible_answer as the expected type
+        possible_answer_type = utils.get_possible_answer_type(possible_answer)
+        # if possible_answer only contains optional parameters, we can't determine the type
+        if possible_answer_type is not None:
+            # we are being precise here.
+            # in fact, possible_answer_type should always be string, as that's how we treat variables in possible_answer
+            if possible_answer_type != expected_type_converted:
+                is_variable = True
+
+        # value is the same type as in the function description
+        if type(value) == expected_type_converted:
+            # We don't need to do a recursive check for simple types
+            if nested_type_converted is None:
+                result.is_variable = is_variable
+                return result
+            else:
+                for possible_answer_item in possible_answer:
+                    flag = True  # Each parameter should match at least one possible answer type.
+                    # Here, we assume that each item should be the same type. We could also relax it.
+                    if type(possible_answer_item) == list:
+                        for value_item in value:
+                            checker_result = AstChecker.type_checker(
+                                param,
+                                value_item,
+                                possible_answer_item,
+                                str(nested_type_converted),
+                                nested_type_converted,
+                                None,
+                            )
+                            if not checker_result.is_valid:
+                                flag = False
+                                break
+
+                    if flag:
+                        return CheckerResult(
+                            is_valid=True,
+                            error_type='',
+                            error_message='',
+                            is_variable=is_variable
+                        )
+
+            result.is_valid = False
+            result.error_type = "type_error:nested"
+            result.error_message = (
+                f"Nested type checking failed for parameter {repr(param)}. "
+                f'Expected outer type {expected_type_description} with inner type '
+                f'{str(nested_type_converted)}. Parameter value: {repr(value)}.'
+            )
+
+        # value is not as expected, check for the case where a variable is used instead of an actual value
+        # use the type in possible_answer as the expected type
+        possible_answer_type = utils.get_possible_answer_type(possible_answer)
+        # if possible_answer only contains optional parameters, we can't determine the type
+        if possible_answer_type is not None:
+            # we are being precise here.
+            # in fact, possible_answer_type should always be string, as that's how we treat variables in possible_answer
+            if type(value) == possible_answer_type:
+                result.is_variable = True
+                return result
+
+        return CheckerResult(
+            is_valid=False,
+            error_type='type_error:simple',
+            error_message=f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}."
+        )
+
+    @staticmethod
+    def string_checker(param: str, model_output: str, possible_answer: List) -> CheckerResult:
+        standardize_possible_answer = []
+        standardize_model_output = AstChecker.standardize_string(model_output)
+        for i in range(len(possible_answer)):
+            if type(possible_answer[i]) == str:
+                standardize_possible_answer.append(AstChecker.standardize_string(possible_answer[i]))
+
+        if standardize_model_output not in standardize_possible_answer:
+            return CheckerResult(
+                is_valid=False,
+                error_type="value_error:string",
+                error_message=f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive."
+ ) + + return CheckerResult(is_valid=True, error_type='', error_message='') + + @staticmethod + def dict_checker(param: str, model_output: Dict, possible_answers: List) -> CheckerResult: + # This function works for simple dictionaries, as well as dictionaries with nested dictionaries + result = CheckerResult(is_valid=False, error_type='dict_checker:unclear', error_message='') + for i in range(len(possible_answers)): + if possible_answers[i] == "": + continue + + result = CheckerResult(is_valid=False, error_type='dict_checker:unclear', error_message='') + flag = True + possible_answer = possible_answers[i] + # possible_answer is a single dictionary + if len(model_output.keys()) != len(possible_answer.keys()): + result.is_valid = False + result.error_message = "Wrong number of parameters for dictionary." + result.error_type = "value_error:dict_items" + flag = False + continue + + for key, value in model_output.items(): + if key not in possible_answer: + result.is_valid = False + result.error_message = f"Unexpected parameter: '{key}'." + result.error_type = "value_error:dict_key" + flag = False + break + + expected_values = possible_answer[key] + if isinstance(expected_values, dict): + result = AstChecker.dict_checker(param, value, [expected_values]) + if not result.is_valid: + flag = False + break + else: + standardize_value = value + # If the value is a string, we need to standardize it + if type(value) == str: + standardize_value = AstChecker.standardize_string(value) + # We also need to standardize the possible answers + standardize_possible_answer = [] + for i in range(len(possible_answer[key])): + if type(possible_answer[key][i]) == str: + standardize_possible_answer.append( + AstChecker.standardize_string(possible_answer[key][i]) + ) + else: + standardize_possible_answer.append(possible_answer[key][i]) + + if standardize_value not in standardize_possible_answer: + result.is_valid = False + result.error_message = f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}." + result.error_type = "value_error:dict_value" + flag = False + break + if flag: + return CheckerResult(is_valid=True, error_type='', error_message='') + + return result + + @staticmethod + def list_dict_checker(param: str, model_output: List, possible_answers: List) -> CheckerResult: + # This function takes in a list of dictionaries and checks if each dictionary is valid + # The order of the dictionaries in the list must match the order of the possible answers + result = CheckerResult(is_valid=False, error_type='list_dict_checker:unclear', error_message='') + for answer_index in range(len(possible_answers)): + flag = True # True means so far, all dictionaries are valid + + # Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers + if len(model_output) != len(possible_answers[answer_index]): + result.is_valid = False + result.error_message = "Wrong number of dictionaries in the list." 
+ result.error_type = "value_error:list_dict_count" + flag = False + continue + + for dict_index in range(len(model_output)): + result = AstChecker.dict_checker( + param, + model_output[dict_index], + [possible_answers[answer_index][dict_index]], + ) + if not result.is_valid: + flag = False + break + if flag: + return CheckerResult(is_valid=True, error_type='', error_message='') + + return result + + @staticmethod + def standardize_string(input_string: str): + # This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase + # It will also convert all the single quotes to double quotes + # This is used to compare the model output with the possible answers + # We don't want to punish model for answer like April 1, 2024 vs April 1,2024, vs April 1 2024 + regex_string = r"[ \,\.\/\-\_\*\^]" + return re.sub(regex_string, "", input_string).lower().replace("'", '"') + + @staticmethod + def get_language(test_category: LeaderboardAstCategory) -> str: + if test_category == LeaderboardAstCategory.JAVA: + language = 'java' + elif test_category == LeaderboardAstCategory.JAVASCRIPT: + language = 'javascript' + else: + language = 'python' + return language \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/__init__.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/__init__.py new file mode 100644 index 000000000..a8cecd5cf --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/__init__.py @@ -0,0 +1,4 @@ +from . import java +from . import javascript + +__all__ = ['java', 'javascript'] \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/java.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/java.py new file mode 100644 index 000000000..ab16ac310 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/java.py @@ -0,0 +1,408 @@ +import re +from typing import List, Dict, Union + +from bfcl.model_handler.constants import JAVA_TYPE_CONVERSION + + +def java_type_converter(value, expected_type, nested_type=None): + if expected_type not in JAVA_TYPE_CONVERSION: + raise ValueError(f"Unsupported type: {expected_type}") + if ( + expected_type == "byte" + or expected_type == "short" + or expected_type == "integer" + ): + if not re.match(r"^-?\d+$", value): + return str(value) # default to string + return int(value) + elif expected_type == "float": + if not re.match(r"^-?\d+(\.\d+)?([eE][+-]?\d+)?[fF]$", value): + return str(value) # default to string + return float(re.sub(r"[fF]$", "", value)) + elif expected_type == "double": + if not re.match(r"^-?\d+(\.\d+)?([eE][+-]?\d+)?$", value): + return str(value) # default to string + return float(value) + elif expected_type == "long": + if not re.match(r"^-?\d+[lL]$", value): + return str(value) # default to string + return int(re.sub(r"[lL]$", "", value)) + elif expected_type == "boolean": + if value not in ["true", "false"]: + return str(value) # default to string + return parse_java_boolean(value) + elif expected_type == "char": + if not re.match(r"^\'.$\'", value): + return str(value) # default to string + return value # Remove the single quotes + elif expected_type == "Array" or expected_type == "ArrayList": + return parse_java_collection(value, expected_type, nested_type) + elif expected_type == "Set": + raise NotImplementedError("Set 
conversion is not implemented") + elif expected_type == "HashMap": + return parse_java_collection(value, expected_type, nested_type) + elif expected_type == "Hashtable": + raise NotImplementedError("Set conversion is not implemented") + elif expected_type == "Queue" or expected_type == "Stack": + raise NotImplementedError(f"{expected_type} conversion is not implemented") + elif expected_type == "String" or expected_type == "any": + return str(value) # we output as string for `any` type + else: + raise ValueError(f"Unsupported type: {expected_type}") + + +def parse_java_boolean(value): + return value == "true" + + +def parse_java_collection( + input_str: str, type_str: str, nested_type=None +) -> Union[List, Dict]: + if type_str == "ArrayList": + return parse_arraylist(input_str, nested_type) + elif type_str == "Array": + return parse_array(input_str, nested_type) + elif type_str == "HashMap": + return parse_hashmap(input_str) + else: + raise ValueError(f"Unsupported type: {type_str}") + + +def parse_arraylist(input_str: str, nested_type=None) -> List: + match_asList = re.search( + r"new\s+ArrayList<\w*>\(Arrays\.asList\((.+?)\)\)", input_str + ) + if match_asList: + elements_str = match_asList.group(1) + elements = [] + for element_str in elements_str.split(","): + element_str = element_str.strip() + if nested_type == "char": + element = element_str[1:-1] # Remove the single quotes + elif nested_type == "String": + element = element_str[1:-1] # Remove the double quotes + else: + element = ( + java_type_converter(element_str, nested_type) + if nested_type + else parse_java_value(element_str) + ) + elements.append(element) + return elements + + match_add = re.search( + r"new\s+ArrayList<\w*>\(\)\s*\{\{\s*(.+?)\s*\}\}", input_str, re.DOTALL + ) + if match_add: + adds_str = match_add.group(1) + elements = [] + matches = re.findall(r"add\((.+?)\)", adds_str) + for match in matches: + value_str = match.strip() + if nested_type == "char": + value = value_str[1:-1] # Remove the single quotes + elif nested_type == "String": + value = value_str[1:-1] # Remove the double quotes + else: + value = ( + java_type_converter(value_str, nested_type) + if nested_type + else parse_java_value(value_str) + ) + elements.append(value) + return elements + + match_empty = re.search(r"new\s+ArrayList<\w*>\(\)", input_str) + if match_empty: + return [] # Return an empty list for an empty ArrayList + + return input_str # default to string + + +def parse_array(input_str: str, nested_type=None) -> List: + match = re.search(r"new\s+\w+\[\]\s*\{(.*?)\}", input_str) + if match: + elements_str = match.group(1) + if nested_type: + elements = [ + java_type_converter(x.strip(), nested_type) + for x in elements_str.split(",") + if x.strip() + ] + else: + elements = [ + parse_java_value(x.strip()) + for x in elements_str.split(",") + if x.strip() + ] + + return elements + else: + return input_str # default to string + + +def parse_hashmap(input_str: str) -> Dict: + elements = {} + match = re.search( + r"new\s+HashMap<.*?>\s*\(\)\s*\{\s*\{?\s*(.*?)\s*\}?\s*\}", input_str, re.DOTALL + ) + if match: + puts_str = match.group(1) + if puts_str.strip(): + matches = re.findall(r"put\(\"(.*?)\",\s*(.*?)\)", puts_str) + for match in matches: + key = match[0] + value = parse_java_value(match[1].strip()) + elements[key] = value + return elements + + match_empty = re.search(r"new\s+HashMap<.*?>\s*\(\)", input_str) + if match_empty: + return {} # Return an empty dictionary for an empty HashMap + + return input_str # default to string + + +# 
This method parses without the information of what each element type is, contrary of the previous +def parse_java_value(value_str: str): + # check if it's boolean + if value_str == "true": + return True + elif value_str == "false": + return False + # check if it's a string + elif value_str.startswith('"') and value_str.endswith('"'): + return value_str[1:-1] + # check if it's a long + elif re.match(r"^-?\d+[lL]$", value_str): + return int(value_str[:-1]) + # check if it's a float + elif re.match(r"^-?\d+(\.\d+)?([eE][+-]?\d+)?[fF]$", value_str): + return float(re.sub(r"[fF]$", "", value_str)) + # check if it's a integer-like and float-like types (including byte, short, integer, double, etc) + else: + try: + return int(value_str) + except ValueError: + try: + return float(value_str) + except ValueError: + # this assuming all other types are converted to string + return value_str + + +# Write tests for the `java_type_converter` function +def test_java_type_converter(): + # Test valid conversions + assert java_type_converter("true", "boolean") == True + assert java_type_converter("false", "boolean") == False + assert java_type_converter("123", "integer") == 123 + assert java_type_converter("-123", "integer") == -123 + assert java_type_converter("3.14f", "float") == 3.14 + assert java_type_converter("-3.14f", "float") == -3.14 + assert java_type_converter("3.14", "double") == 3.14 + assert java_type_converter("-3.14", "double") == -3.14 + assert java_type_converter("123L", "long") == 123 + assert java_type_converter("-123L", "long") == -123 + assert java_type_converter("a", "char") == "a" + assert java_type_converter("abc", "String") == "abc" + assert java_type_converter("new int[]{1, 2, 3}", "Array") == [1, 2, 3] + assert java_type_converter( + 'new ArrayList<>(Arrays.asList("a", "b"))', "ArrayList" + ) == ["a", "b"] + assert java_type_converter( + 'new HashMap() {{ put("key", "value"); }}', "HashMap" + ) == {"key": "value"} + assert java_type_converter("3f", "float") == 3.0 + assert java_type_converter("3e3F", "float") == 3e3 + assert java_type_converter("3e-3F", "float") == 3e-3 + assert java_type_converter("3.14e2", "double") == 3.14e2 + assert java_type_converter("3.14e-2", "double") == 3.14e-2 + assert java_type_converter("127", "byte") == 127 + assert java_type_converter("-128", "byte") == -128 + assert java_type_converter("32767", "short") == 32767 + assert java_type_converter("-32768", "short") == -32768 + assert java_type_converter("9223372036854775807L", "long") == 9223372036854775807 + assert java_type_converter("-9223372036854775808L", "long") == -9223372036854775808 + assert java_type_converter("123", "any") == "123" + assert java_type_converter("abc", "any") == "abc" + + # Test empty collections + assert java_type_converter("new int[]{}", "Array") == [] + assert java_type_converter("new ArrayList<>()", "ArrayList") == [] + assert java_type_converter("new HashMap<>()", "HashMap") == {} + + # Test collections with mixed types + assert java_type_converter('new Object[]{1, "abc", true}', "Array") == [ + 1, + "abc", + True, + ] + assert java_type_converter( + 'new ArrayList<>(Arrays.asList(1, "abc", true))', "ArrayList" + ) == [1, "abc", True] + assert java_type_converter( + 'new HashMap() {{ put("key1", 1); put("key2", "value"); put("key3", true); }}', + "HashMap", + ) == {"key1": 1, "key2": "value", "key3": True} + + # Test invalid values + try: + java_type_converter("true", "integer") + except ValueError as e: + assert str(e) == "Invalid integer value: true" + + try: + 
java_type_converter("abc", "integer") + except ValueError as e: + assert str(e) == "Invalid integer value: abc" + + try: + java_type_converter("abc", "long") + except ValueError as e: + assert str(e) == "Invalid long value: abc" + + try: + java_type_converter("3.14", "float") + except ValueError as e: + assert str(e) == "Invalid float value: 3.14" + + try: + java_type_converter("3.14f", "double") + except ValueError as e: + assert str(e) == "Invalid double value: 3.14f" + + try: + java_type_converter("128", "byte") + except ValueError as e: + assert str(e) == "Invalid byte value: 128" + + try: + java_type_converter("32768", "short") + except ValueError as e: + assert str(e) == "Invalid short value: 32768" + + try: + java_type_converter("invalid", "boolean") + except ValueError as e: + assert str(e) == "Invalid boolean value: invalid" + + try: + java_type_converter("abc", "char") + except ValueError as e: + assert str(e) == "Invalid char value: abc" + + # Test unsupported types + try: + java_type_converter("abc", "Set") + except NotImplementedError as e: + assert str(e) == "Set conversion is not implemented" + + try: + java_type_converter("abc", "Hashtable") + except NotImplementedError as e: + assert str(e) == "Set conversion is not implemented" + + try: + java_type_converter("abc", "Queue") + except NotImplementedError as e: + assert str(e) == "Queue conversion is not implemented" + + try: + java_type_converter("abc", "Stack") + except NotImplementedError as e: + assert str(e) == "Stack conversion is not implemented" + + # extra array testing + assert java_type_converter("new int[]{}", "Array") == [] + assert java_type_converter("new int[] {}", "Array") == [] + assert java_type_converter("new int[] { }", "Array") == [] + assert java_type_converter("new int[]{1,2,3}", "Array") == [1, 2, 3] + assert java_type_converter("new int[]{1, 2, 3}", "Array") == [1, 2, 3] + assert java_type_converter("new int[] {1, 2, 3}", "Array") == [1, 2, 3] + assert java_type_converter("new int[] { 1, 2, 3 }", "Array") == [1, 2, 3] + + # extra hashmap testing + assert java_type_converter("new HashMap<>()", "HashMap") == {} + assert java_type_converter("new HashMap<>() {}", "HashMap") == {} + assert java_type_converter("new HashMap<>() {{}}", "HashMap") == {} + assert java_type_converter("new HashMap<>() {{ }}", "HashMap") == {} + assert java_type_converter( + 'new HashMap() {{ put("key", "value"); }}', "HashMap" + ) == {"key": "value"} + assert java_type_converter( + 'new HashMap() {{put("key", "value");}}', "HashMap" + ) == {"key": "value"} + assert java_type_converter( + 'new HashMap() { { put("key", "value"); } }', "HashMap" + ) == {"key": "value"} + assert java_type_converter( + 'new HashMap() {{ put("key1", 123); put("key2", true); }}', + "HashMap", + ) == {"key1": 123, "key2": True} + assert java_type_converter( + 'new HashMap() {{ put("key1", "value 1"); put("key2", "value 2"); }}', + "HashMap", + ) == {"key1": "value 1", "key2": "value 2"} + + def test_parse_array_long(): + input_str = "new long[]{1L, 2L, 3L}" + expected_output = [1, 2, 3] + assert parse_array(input_str, nested_type="long") == expected_output + + def test_parse_array_mixed_long(): + input_str = "new long[]{1L, 2, 3L}" + expected_output = [1, "2", 3] + assert parse_array(input_str, nested_type="long") == expected_output + + def test_parse_array_invalid_long(): + input_str = "new long[]{1L, 2.0, 3L}" + expected_output = [1, "2.0", 3] + assert parse_array(input_str, nested_type="long") == expected_output + + def test_parse_arraylist_int(): + 
input_str = "new ArrayList(Arrays.asList(1, 2, 3))" + expected_output = [1, 2, 3] + assert parse_arraylist(input_str, nested_type="integer") == expected_output + + def test_parse_arraylist_float(): + input_str = "new ArrayList() {{ add(1.0f); add(2.0f); add(3.0f); }}" + expected_output = [1.0, 2.0, 3.0] + assert parse_arraylist(input_str, nested_type="float") == expected_output + + def test_parse_arraylist_double(): + input_str = "new ArrayList() {{ add(1.0); add(2.0); add(3.0); }}" + expected_output = [1.0, 2.0, 3.0] + assert parse_arraylist(input_str, nested_type="double") == expected_output + + def test_parse_arraylist_boolean(): + input_str = "new ArrayList(Arrays.asList(true, false, true))" + expected_output = [True, False, True] + assert parse_arraylist(input_str, nested_type="boolean") == expected_output + + def test_parse_arraylist_char(): + input_str = "new ArrayList() {{ add('a'); add('b'); add('c'); }}" + expected_output = ["a", "b", "c"] + print(parse_arraylist(input_str, nested_type="char")) + assert parse_arraylist(input_str, nested_type="char") == expected_output + + def test_parse_arraylist_string(): + input_str = 'new ArrayList() {{ add("aasdasd"); add("basdasd"); add("casdasd"); }}' + expected_output = ["aasdasd", "basdasd", "casdasd"] + print(parse_arraylist(input_str)) + assert parse_arraylist(input_str) == expected_output + + test_parse_array_long() + test_parse_array_mixed_long() + test_parse_array_invalid_long() + test_parse_arraylist_int() + test_parse_arraylist_float() + test_parse_arraylist_double() + test_parse_arraylist_boolean() + test_parse_arraylist_char() + test_parse_arraylist_string() + print("All tests passed successfully!") + + +if __name__ == "__main__": + test_java_type_converter() diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/javascript.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/javascript.py new file mode 100644 index 000000000..d85fc0a42 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/type_converter/javascript.py @@ -0,0 +1,294 @@ +import re + +from bfcl.model_handler.constants import JS_TYPE_CONVERSION + + +def js_type_converter(value, expected_type, nested_type=None): + if expected_type not in JS_TYPE_CONVERSION: + raise ValueError(f"Unsupported type: {expected_type}") + + if expected_type == "String": + if not (value.startswith('"') and value.endswith('"')) and not ( + value.startswith("'") and value.endswith("'") + ): + return str(value) + return value[1:-1] + + elif expected_type == "integer": + if not re.match(r"^-?\d+$", value): + return str(value) # default to string + return int(value) + elif expected_type == "float": + if not re.match(r"^-?\d+(\.\d+)?$", value): + return str(value) # default to string + return float(value) + elif expected_type == "Bigint": + if not re.match(r"^-?\d+n$", value): + return str(value) # default to string + return int(value[:-1]) + elif expected_type == "Boolean": + if value not in ["true", "false"]: + return str(value) # default to string + return value == "true" + elif expected_type == "dict": + return parse_js_collection(value, "dict", nested_type) + elif expected_type == "array": + return parse_js_collection(value, "array", nested_type) + elif expected_type == "any": + return str(value) + else: + raise ValueError(f"Unsupported type: {expected_type}") + + +def parse_js_collection(code, type_str, nested_type=None): + code = code.strip() + if type_str == "array": + # Regular expression patterns 
+ array_2d_pattern = r"\[\s*\[.*?\]\s*(,\s*\[.*?\]\s*)*\]|\bnew\s+Array\(\s*\[.*?\]\s*(,\s*\[.*?\]\s*)*\)" + array_pattern = r"\[(.*?)\]|\bnew\s+Array\((.*?)\)" + + # Check if the code is a 2D array + array_2d_match = re.match(array_2d_pattern, code) + try: + if array_2d_match: + elements_str = array_2d_match.group(0) + inner_arrays = re.findall(r"\[(.*?)\]", elements_str) + elements = [] + for idx, inner_array_str in enumerate(inner_arrays): + inner_array_str = inner_array_str.strip() + if idx == 0 and inner_array_str.startswith("["): + inner_array_str = inner_array_str[1:] + inner_array_elements = [ + e.strip() for e in inner_array_str.split(",") + ] + if nested_type: + inner_array = [parse_js_value(e) for e in inner_array_elements] + else: + inner_array = [parse_js_value(e) for e in inner_array_elements] + elements.append(inner_array) + return elements + + # Check if the code is a 1D array + array_match = re.match(array_pattern, code) + if array_match: + if array_match.group(1) is not None: + elements_str = array_match.group(1).strip() + if elements_str: + elements = elements_str.split(",") + else: + elements = [] + elif array_match.group(2) is not None: + elements_str = array_match.group(2).strip() + if elements_str: + elements = elements_str.split(",") + else: + elements = [] + else: + elements = [] + if nested_type: + elements = [ + ( + js_type_converter(e.strip(), nested_type, "String") + if (e.strip().startswith("'") or e.strip().startswith('"')) + else js_type_converter(e.strip(), nested_type) + ) + for e in elements + ] + else: + elements = [parse_js_value(e.strip()) for e in elements] + return elements + else: + return code + except: + return code + + elif type_str == "dict": + + if code == "{}": + return {} # Return an empty dictionary for an empty object + dict_pattern = r"\{(.*?)\}" + # Check if the code is a dictionary + dict_match = re.match(dict_pattern, code) + if dict_match: + try: + pairs = dict_match.group(1).split(",") + dictionary = {} + for pair in pairs: + key, value = pair.split(":") + key = parse_js_value(key.strip().strip("'")) + value = parse_js_value(value.strip().strip("'")) + dictionary[key] = value + return dictionary + except: + return code + else: + return code # default to string + else: + raise ValueError(f"Unsupported type: {type_str}") + + +def parse_js_value(value_str: str): + value_str = value_str.strip() + if value_str == "true": + return True + elif value_str == "false": + return False + elif (value_str.startswith('"') and value_str.endswith('"')) or ( + value_str.startswith("'") and value_str.endswith("'") + ): + return value_str[1:-1] + else: + try: + return int(value_str) + except ValueError: + try: + return float(value_str) + except ValueError: + return value_str + + +# Write tests for the `js_type_converter` function + + +def test_js_type_converter(): + assert js_type_converter("true", "Boolean") == True + assert js_type_converter("false", "Boolean") == False + assert js_type_converter("123", "integer") == 123 + assert js_type_converter("3.14", "float") == 3.14 + assert js_type_converter("123n", "Bigint") == 123 + assert js_type_converter("abc", "String") == "abc" + assert js_type_converter("[1, 2, 3]", "array") == [1, 2, 3] + assert js_type_converter("new Array(1, 2, 3)", "array") == [1, 2, 3] + assert js_type_converter("{'key': 'value'}", "dict") == {"key": "value"} + assert js_type_converter("{'key': 123}", "dict") == {"key": 123} + assert js_type_converter("{'key': true}", "dict") == {"key": True} + + # Additional test cases + # Test 
empty array and dictionary + assert js_type_converter("[]", "array") == [] + assert js_type_converter("{}", "dict") == {} + + # Test array with mixed types + assert js_type_converter("[1, 'two', true]", "array") == [1, "two", True] + + # Test dictionary with mixed types + assert js_type_converter( + "{'key1': 123, 'key2': 'value', 'key3': false}", "dict" + ) == {"key1": 123, "key2": "value", "key3": False} + + # Test string with special characters + + # Test negative integer and float values + assert js_type_converter("-123", "integer") == -123 + assert js_type_converter("-3.14", "float") == -3.14 + + # Test invalid type + try: + js_type_converter("123", "InvalidType") + except ValueError as e: + assert str(e) == "Unsupported type: InvalidType" + + # Test invalid integer value + try: + js_type_converter("123.45", "integer") + except ValueError as e: + assert str(e) == "Invalid integer value: 123.45" + + # Test invalid float value + try: + js_type_converter("3.14abc", "float") + except ValueError as e: + assert str(e) == "Invalid float value: 3.14abc" + + # Test invalid Bigint value + try: + js_type_converter("123", "Bigint") + except ValueError as e: + assert str(e) == "Invalid Bigint value: 123" + + # Test invalid boolean value + try: + js_type_converter("not_a_boolean", "Boolean") + except ValueError as e: + assert str(e) == "Invalid boolean value: not_a_boolean" + + print("All tests passed successfully!") + + +def test_js_type_converter_nested_array(): + # Test array with nested integers + assert js_type_converter("[1, 2, 3]", "array", "integer") == [1, 2, 3] + assert js_type_converter("new Array(4, 5, 6)", "array", "integer") == [4, 5, 6] + + # Test array with nested floats + assert js_type_converter("[1.1, 2.2, 3.3]", "array", "float") == [1.1, 2.2, 3.3] + assert js_type_converter("new Array(4.4, 5.5, 6.6)", "array", "float") == [ + 4.4, + 5.5, + 6.6, + ] + + # Test array with nested Bigints + assert js_type_converter("[1n, 2n, 3n]", "array", "Bigint") == [1, 2, 3] + assert js_type_converter("new Array(4n, 5n, 6n)", "array", "Bigint") == [4, 5, 6] + + # Test array with nested booleans + assert js_type_converter("[true, false, true]", "array", "Boolean") == [ + True, + False, + True, + ] + assert js_type_converter("new Array(false, true, false)", "array", "Boolean") == [ + False, + True, + False, + ] + + # Test array with nested strings + print(js_type_converter('["hello", "world", "!"]', "array", "String")) + assert js_type_converter('["hello", "world", "!"]', "array", "String") == [ + "hello", + "world", + "!", + ] + assert js_type_converter('new Array("foo", "bar", "baz")', "array", "String") == [ + "foo", + "bar", + "baz", + ] + + # Test array with mixed nested types + assert js_type_converter('[1, "two", true]', "array") == [1, "two", True] + assert js_type_converter('new Array(3.14, "pi", false)', "array") == [ + 3.14, + "pi", + False, + ] + + # Test array with nested arrays + print(js_type_converter(" [ [1, 2], [3, 4], [5, 6]]", "array", "array")) + assert js_type_converter(" [ [ 1, 2 ], [ 3, 4], [5, 6]]", "array", "array") == [ + [1, 2], + [3, 4], + [5, 6], + ] # this example has many weird spacings + assert js_type_converter("new Array([1, 2], [3, 4], [5, 6])", "array", "array") == [ + [1, 2], + [3, 4], + [5, 6], + ] + + # Test array with nested dictionaries + assert js_type_converter( + '[{"key1": 1}, {"key2": 2}, {"key3": 3}]', "array", "dict" + ) == [{"key1": 1}, {"key2": 2}, {"key3": 3}] + assert js_type_converter( + 'new Array({"key1": 1}, {"key2": 2}, {"key3": 3})', 
"array", "dict" + ) == [{"key1": 1}, {"key2": 2}, {"key3": 3}] + + print("All nested array tests passed successfully!") + + +if __name__ == "__main__": + test_js_type_converter() + test_js_type_converter_nested_array() diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/utils.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/utils.py new file mode 100644 index 000000000..15b703dfa --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/utils.py @@ -0,0 +1,31 @@ +import re + +from bfcl.model_handler.constants import UNDERSCORE_TO_DOT + + +def convert_func_name(function_name, model_name: str): + model_name_escaped = model_name.replace("_", "/") + if "." in function_name: + if model_name_escaped in UNDERSCORE_TO_DOT: + # OAI does not support "." in the function name so we replace it with "_". + # ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name. + # This happens for OpenAI, Mistral, and Google models + return re.sub(r"\.", "_", function_name) + return function_name + +def find_description(func_descriptions, name): + # If func_descriptions is a list, this is the multiple or multiple_parallel case + if type(func_descriptions) == list: + for func_description in func_descriptions: + if func_description["name"] in name: + return func_description + return None + else: + # This is the parallel case, there is no need to loop through the list, as there is only one function + return func_descriptions + +def get_possible_answer_type(possible_answer: list): + for answer in possible_answer: + if answer != "": # Optional parameter + return type(answer) + return None \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py index 242c10646..2ed503325 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py @@ -19,7 +19,7 @@ class FailedResult(BaseModel): error_type: str error_message: str llm_response: str - decoded_result: Any + decoded_result: Any | None = None class Config: extra = 'allow' @@ -53,7 +53,7 @@ def __call__(self, file_path: Path, test_category) -> None: self._model_metrics(model_responses) result = None - if test_category == types.LeaderboardCategory.RELEVANCE: + if test_category.value == types.LeaderboardCategory.RELEVANCE.value: result = self.run_relevance_evaluator(model_responses) elif test_category.value in types.LeaderboardExecutableCategory: if self._executable_checker is None: @@ -62,7 +62,9 @@ def __call__(self, file_path: Path, test_category) -> None: self._executable_checker.perform_api_sanity_checks() result = self.run_executable_evaluator(test_category, model_responses) elif test_category.value in types.LeaderboardAstCategory: - pass + if self._ast_checker is None: + self._ast_checker = checker.AstChecker(self.model_name, self.leaderboard) + result = self.run_ast_evaluator(test_category, model_responses) if result: accuracy = result['accuracy'] @@ -265,6 +267,93 @@ def run_executable_evaluator( self._save_scores(test_category, result) return result + def run_ast_evaluator( + self, + test_category: types.LeaderboardCategory, + model_responses: List[Dict] + ) -> Dict: + + self._ast_checker.load_possible_answers(test_category) + test_data = self.test_category_to_data[test_category] + possible_answers = self._ast_checker.test_category_to_possible_ans[test_category] + language = self._ast_checker.get_language(test_category) 
+ assert len(model_responses) == len(test_data) == len(possible_answers), ( + "No. of the model responses does not match the no. of test data or " + "no. of possible answers. Please check the input files for completeness." + ) + + test_example_id_to_data = {data['id']: data for data in test_data} + failed_model_responses = [] + correct_count = 0 + for idx, response in tqdm(enumerate(model_responses), total=len(model_responses), desc="Evaluating"): + model_result_item = response['response'] + possible_answer_item = possible_answers[idx] + + try: + model_result_item_raw = model_result_item + model_result_item = self.model_handler.decode_ast(model_result_item, language) + except Exception as e: + failed_model_responses.append( + FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_message=f"Invalid syntax. Failed to decode AST. {str(e)}", + error_type="ast_decoder:decoder_failed", + llm_response=model_result_item_raw, + possible_answer=possible_answer_item, + ) + ) + continue + + decoder_output_valid = evaluator_utils.is_function_calling_format_output(model_result_item) + if not decoder_output_valid: + failed_model_responses.append( + FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=False, + error_message="Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability.", + error_type="ast_decoder:decoder_wrong_output_format", + llm_response=str(model_result_item_raw), + decoded_result=str(model_result_item), + possible_answer=possible_answer_item, + ) + ) + continue + + checker_result = self._ast_checker( + idx, + test_example_id_to_data[response['id']]['function'], + model_result_item, + test_category, + ) + + if checker_result.is_valid: + correct_count += 1 + else: + failed_model_responses.append( + FailedResult( + example_id=response['id'], + test_category=test_category.value, + is_valid=checker_result.is_valid, + error_message=checker_result.error_message, + error_type=checker_result.error_type, + llm_response=model_result_item_raw, + decoded_result=model_result_item, + possible_answer=possible_answer_item, + ) + ) + + result = dict( + accuracy=correct_count / len(model_responses), + correct_count=correct_count, + total_count=len(model_responses), + failed_model_responses=failed_model_responses, + ) + self._save_scores(test_category, result) + return result + def _save_scores(self, test_category, result) -> None: if ( (failed_model_responses := result.get('failed_model_responses')) From 7c6549571be450d35cd5b0d720ba3c12be7c8863 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Fri, 12 Jul 2024 21:47:32 -0400 Subject: [PATCH 29/35] Remove `eval_checker` dir --- .../bfcl/eval_checker/__init__.py | 0 .../bfcl/eval_checker/checker.py | 971 ---------------- .../bfcl/eval_checker/constants.py | 18 - .../bfcl/eval_checker/custom_exception.py | 10 - .../bfcl/eval_checker/eval_runner.py | 535 --------- .../bfcl/eval_checker/eval_runner_helper.py | 1030 ----------------- .../executable_python_function.py | 883 -------------- .../bfcl/eval_checker/java_type_converter.py | 407 ------- .../eval_checker/javascript_type_converter.py | 293 ----- 9 files changed, 4147 deletions(-) delete mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/__init__.py delete mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py delete mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/constants.py delete mode 100644 
berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py delete mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py delete mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py delete mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/executable_python_function.py delete mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/java_type_converter.py delete mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/javascript_type_converter.py diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/__init__.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py deleted file mode 100644 index ee9562145..000000000 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py +++ /dev/null @@ -1,971 +0,0 @@ -from model_handler.constant import ( - UNDERSCORE_TO_DOT, - JAVA_TYPE_CONVERSION, - JS_TYPE_CONVERSION, -) -from eval_checker_constant import REAL_TIME_MATCH_ALLOWED_DIFFERENCE -from custom_exception import NoAPIKeyError -import re -import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. -import time -import json - -# We switch to conditional import for the following two imports to avoid unnecessary installations. -# User doesn't need to setup the tree-sitter packages if they are not running the test for that language. -# from js_type_converter import js_type_converter -# from java_type_converter import java_type_converter - -PYTHON_TYPE_MAPPING = { - "string": str, - "integer": int, - "float": float, - "boolean": bool, - "array": list, - "tuple": list, - "dict": dict, - "any": str, -} - -# This is the list of types that we need to recursively check its values -PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"] - - -NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"] - - -EVAL_GROUND_TRUTH_PATH = ( - "./rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution -) -with open(EVAL_GROUND_TRUTH_PATH, "r") as f: - EVAL_GROUND_TRUTH = f.readlines() - - -#### Helper functions for AST #### -def find_description(func_descriptions, name): - # If func_descriptions is a list, this is the multiple or multiple_parallel case - if type(func_descriptions) == list: - for func_description in func_descriptions: - if func_description["name"] in name: - return func_description - return None - else: - # This is the parallel case, there is no need to loop through the list, as there is only one function - return func_descriptions - - -def get_possible_answer_type(possible_answer: list): - for answer in possible_answer: - if answer != "": # Optional parameter - return type(answer) - return None - - -def convert_func_name(function_name, model_name: str): - model_name_escaped = model_name.replace("_", "/") - if "." in function_name: - if model_name_escaped in UNDERSCORE_TO_DOT: - # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name. 
- # This happens for OpenAI, Mistral, and Google models - return re.sub(r"\.", "_", function_name) - return function_name - - -def type_checker( - param: str, - value, - possible_answer: list, - expected_type_description: str, - expected_type_converted, - nested_type_converted, -): - # NOTE: This type checker only supports nested type checking for one level deep. - # We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex. - - result = { - "valid": True, - "error": [], - "is_variable": False, - "error_type": "type_error:simple", - } - - is_variable = False - # check for the case where a variable is used instead of a actual value. - # use the type in possible_answer as the expected type - possible_answer_type = get_possible_answer_type(possible_answer) - # if possible_answer only contains optional parameters, we can't determine the type - if possible_answer_type != None: - # we are being precise here. - # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer - if possible_answer_type != expected_type_converted: - is_variable = True - - # value is the same type as in function description - if type(value) == expected_type_converted: - # We don't need to do recursive check for simple types - if nested_type_converted == None: - result["is_variable"] = is_variable - return result - else: - for possible_answer_item in possible_answer: - flag = True # Each parameter should match to at least one possible answer type. - # Here, we assume that each item should be the same type. We could also relax it. - if type(possible_answer_item) == list: - for value_item in value: - checker_result = type_checker( - param, - value_item, - possible_answer_item, - str(nested_type_converted), - nested_type_converted, - None, - ) - if not checker_result["valid"]: - flag = False - break - - if flag: - return {"valid": True, "error": [], "is_variable": is_variable} - - result["valid"] = False - result["error"] = [ - f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}." - ] - result["error_type"] = "type_error:nested" - - # value is not as expected, check for the case where a variable is used instead of a actual value - # use the type in possible_answer as the expected type - possible_answer_type = get_possible_answer_type(possible_answer) - # if possible_answer only contains optional parameters, we can't determine the type - if possible_answer_type != None: - # we are being precise here. - # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer - if type(value) == possible_answer_type: - result["is_variable"] = True - return result - - result["valid"] = False - result["error"].append( - f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}." 
- ) - result["error_type"] = "type_error:simple" - return result - - -def standardize_string(input_string: str): - # This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase - # It will also convert all the single quotes to double quotes - # This is used to compare the model output with the possible answers - # We don't want to punish model for answer like April 1, 2024 vs April 1,2024, vs April 1 2024 - regex_string = r"[ \,\.\/\-\_\*\^]" - return re.sub(regex_string, "", input_string).lower().replace("'", '"') - - -def string_checker(param: str, model_output: str, possible_answer: list): - standardize_possible_answer = [] - standardize_model_output = standardize_string(model_output) - for i in range(len(possible_answer)): - if type(possible_answer[i]) == str: - standardize_possible_answer.append(standardize_string(possible_answer[i])) - - if standardize_model_output not in standardize_possible_answer: - return { - "valid": False, - "error": [ - f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive." - ], - "error_type": "value_error:string", - } - - return {"valid": True, "error": []} - - -def list_checker(param: str, model_output: list, possible_answer: list): - # Convert the tuple to a list - - standardize_model_output = list(model_output) - - # If the element in the list is a string, we need to standardize it - for i in range(len(standardize_model_output)): - if type(standardize_model_output[i]) == str: - standardize_model_output[i] = standardize_string(model_output[i]) - - standardize_possible_answer = [] - # We also need to standardize the possible answers - for i in range(len(possible_answer)): - standardize_possible_answer.append([]) - for j in range(len(possible_answer[i])): - if type(possible_answer[i][j]) == str: - standardize_possible_answer[i].append( - standardize_string(possible_answer[i][j]) - ) - else: - standardize_possible_answer[i].append(possible_answer[i][j]) - - if standardize_model_output not in standardize_possible_answer: - return { - "valid": False, - "error": [ - f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}." 
- ], - "error_type": "value_error:list/tuple", - } - - return {"valid": True, "error": []} - - -def dict_checker(param: str, model_output: dict, possible_answers: list): - # This function works for simple dictionaries, as well as dictionaries with nested dictionaries - - result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} - for i in range(len(possible_answers)): - - if possible_answers[i] == "": - continue - - result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} - - flag = True - - possible_answer = possible_answers[i] - # possible_anwer is a single dictionary - if len(model_output.keys()) != len(possible_answer.keys()): - result["valid"] = False - result["error"].append("Wrong number of parameters for dictionary.") - result["error_type"] = "value_error:dict_items" - flag = False - continue - - for key, value in model_output.items(): - if key not in possible_answer: - result["valid"] = False - result["error"].append(f"Unexpected parameter: '{key}'.") - result["error_type"] = "value_error:dict_key" - flag = False - break - - expected_values = possible_answer[key] - if isinstance(expected_values, dict): - result = dict_checker(param, value, [expected_values]) - if not result["valid"]: - flag = False - break - else: - standardize_value = value - # If the value is a string, we need to standardize it - if type(value) == str: - standardize_value = standardize_string(value) - # We also need to standardize the possible answers - standardize_possible_answer = [] - for i in range(len(possible_answer[key])): - if type(possible_answer[key][i]) == str: - standardize_possible_answer.append( - standardize_string(possible_answer[key][i]) - ) - else: - standardize_possible_answer.append(possible_answer[key][i]) - - if standardize_value not in standardize_possible_answer: - result["valid"] = False - result["error"].append( - f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}." 
- ) - result["error_type"] = "value_error:dict_value" - flag = False - break - if flag: - return {"valid": True, "error": []} - - return result - - -def list_dict_checker(param: str, model_output: list, possible_answers: list): - # This function takes in a list of dictionaries and checks if each dictionary is valid - # The order of the dictionaries in the list must match the order of the possible answers - - result = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"} - - for answer_index in range(len(possible_answers)): - flag = True # True means so far, all dictionaries are valid - - # Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers - if len(model_output) != len(possible_answers[answer_index]): - result["valid"] = False - result["error"] = ["Wrong number of dictionaries in the list."] - result["error_type"] = "value_error:list_dict_count" - flag = False - continue - - for dict_index in range(len(model_output)): - result = dict_checker( - param, - model_output[dict_index], - [possible_answers[answer_index][dict_index]], - ) - if not result["valid"]: - flag = False - break - if flag: - return {"valid": True, "error": []} - - return result - - -def simple_function_checker( - func_description: dict, - model_output: dict, - possible_answer: dict, - language: str, - model_name: str, -): - possible_answer = list(possible_answer.values())[0] - # Extract function name and parameters details - func_name = func_description["name"] - param_details = func_description["parameters"]["properties"] - required_params = func_description["parameters"]["required"] - - # Initialize a result dictionary - result = { - "valid": True, - "error": [], - "error_type": "simple_function_checker:unclear", - } - - func_name = convert_func_name(func_name, model_name) - - # Check if function name matches - if func_name not in model_output: - result["valid"] = False - result["error"].append( - f"Function name {repr(func_name)} not found in model output." - ) - result["error_type"] = "simple_function_checker:wrong_func_name" - return result - - model_params = model_output[func_name] - - # Check for required parameters in model output - for param in required_params: - if param not in model_params: - result["valid"] = False - result["error"].append(f"Missing required parameter: {repr(param)}.") - result["error_type"] = "simple_function_checker:missing_required" - return result - - # Validate types and values for each parameter in model output - for param, value in model_params.items(): - if param not in param_details or param not in possible_answer: - result["valid"] = False - result["error"].append(f"Unexpected parameter: {repr(param)}.") - result["error_type"] = "simple_function_checker:unexpected_param" - return result - - full_param_details = param_details[param] - expected_type_description = full_param_details["type"] # This is a string - is_variable = False - nested_type_converted = None - - if language == "Java": - from java_type_converter import java_type_converter - - expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description] - - if expected_type_description in JAVA_TYPE_CONVERSION: - if type(value) != str: - result["valid"] = False - result["error"].append( - f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." 
- ) - result["error_type"] = "type_error:java" - return result - - if expected_type_description in NESTED_CONVERSION_TYPE_LIST: - nested_type = param_details[param]["items"]["type"] - nested_type_converted = JAVA_TYPE_CONVERSION[nested_type] - value = java_type_converter( - value, expected_type_description, nested_type - ) - else: - value = java_type_converter(value, expected_type_description) - - elif language == "JavaScript": - from js_type_converter import js_type_converter - - expected_type_converted = JS_TYPE_CONVERSION[expected_type_description] - - if expected_type_description in JS_TYPE_CONVERSION: - if type(value) != str: - result["valid"] = False - result["error"].append( - f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." - ) - result["error_type"] = "type_error:js" - return result - - if expected_type_description in NESTED_CONVERSION_TYPE_LIST: - nested_type = param_details[param]["items"]["type"] - nested_type_converted = JS_TYPE_CONVERSION[nested_type] - value = js_type_converter( - value, expected_type_description, nested_type - ) - else: - value = js_type_converter(value, expected_type_description) - - elif language == "Python": - expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description] - if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST: - nested_type = param_details[param]["items"]["type"] - nested_type_converted = PYTHON_TYPE_MAPPING[nested_type] - - # We convert all tuple value to list when the expected type is tuple. - # The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load(). - # This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future. - if expected_type_description == "tuple" and type(value) == tuple: - value = list(value) - - # Allow python auto conversion from int to float - if ( - language == "Python" - and expected_type_description == "float" - and type(value) == int - ): - value = float(value) - - # Type checking - # In fact, we only check for Python here. - # Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct. - type_check_result = type_checker( - param, - value, - possible_answer[param], - expected_type_description, - expected_type_converted, - nested_type_converted, - ) - is_variable = type_check_result["is_variable"] - if not type_check_result["valid"]: - return type_check_result - - # It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable. - # We can just treat the variable as a string and use the normal flow. 
- if not is_variable: - # Special handle for dictionaries - if expected_type_converted == dict: - result = dict_checker(param, value, possible_answer[param]) - if not result["valid"]: - return result - continue - - # Special handle for list of dictionaries - elif expected_type_converted == list and nested_type_converted == dict: - result = list_dict_checker(param, value, possible_answer[param]) - if not result["valid"]: - return result - continue - - # Special handle for strings - elif expected_type_converted == str: - # We don't check for case sensitivity for string, as long as it's not a variable - result = string_checker(param, value, possible_answer[param]) - if not result["valid"]: - return result - continue - - elif expected_type_converted == list: - result = list_checker(param, value, possible_answer[param]) - if not result["valid"]: - return result - continue - - # Check if the value is within the possible answers - if value not in possible_answer[param]: - result["valid"] = False - result["error"].append( - f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}." - ) - result["error_type"] = "value_error:others" - return result - - # Check for optional parameters not provided but allowed - for param in possible_answer: - if param not in model_params and "" not in possible_answer[param]: - result["valid"] = False - result["error"].append( - f"Optional parameter {repr(param)} not provided and not marked as optional." - ) - result["error_type"] = "simple_function_checker:missing_optional" - return result - - return result - - -def parallel_function_checker_enforce_order( - func_descriptions: list, - model_output: list, - possible_answers: dict, - language: str, - model_name: str, -): - if len(model_output) != len(possible_answers): - return { - "valid": False, - "error": ["Wrong number of functions."], - "error_type": "parallel_function_checker_enforce_order:wrong_count", - } - - func_name_list = list(possible_answers.keys()) - possible_answers_list = [] - - for key, value in possible_answers.items(): - possible_answers_list.append({key: value}) - - for i in range(len(possible_answers_list)): - func_description = find_description(func_descriptions, func_name_list[i]) - if func_description is None: - return { - "valid": False, - "error": [ - f"Function doc description not found for function name: {repr(func_name_list[i])}." 
- ], - "error_type": "parallel_function_checker_enforce_order:cannot_find_description", - } - result = simple_function_checker( - func_description, - model_output[i], - possible_answers_list[i], - language, - model_name, - ) - if not result["valid"]: - return result - - return {"valid": True, "error": []} - - -def parallel_function_checker_no_order( - func_descriptions: list, - model_output: list, - possible_answers: dict, - language: str, - model_name: str, -): - if len(model_output) != len(possible_answers): - return { - "valid": False, - "error": ["Wrong number of functions."], - "error_type": "parallel_function_checker_no_order:wrong_count", - } - - func_name_list = list(possible_answers.keys()) - possible_answers_list = [] - - for key, value in possible_answers.items(): - possible_answers_list.append({key: value}) - - matched_indices = [] - - # We go throught the possible answers one by one, and eliminate the model output that matches the possible answer - # It must be this way because we need ground truth to fetch the correct function description - for i in range(len(possible_answers_list)): - func_description = find_description(func_descriptions, func_name_list[i]) - - # This should not happen. As possible_answers is the ground truth, and it should have the correct function name. - if func_description is None: - return { - "valid": False, - "error": [ - f"Function doc description not found for function name: {repr(func_name_list[i])}." - ], - "error_type": "parallel_function_checker_no_order:cannot_find_description", - } - - all_errors = [] - - for index in range(len(model_output)): - if index in matched_indices: - continue - - result = simple_function_checker( - func_description, - model_output[index], - possible_answers_list[i], - language, - model_name, - ) - - if result["valid"]: - matched_indices.append(index) - break - else: - all_errors.append( - { - f"Model Result Index {index}": { - "sub_error": result["error"], - "sub_error_type": result["error_type"], - "model_output_item": model_output[index], - "possible_answer_item": possible_answers_list[i], - } - } - ) - - if not result["valid"]: - considered_indices = [ - i for i in range(len(model_output)) if i not in matched_indices - ] - all_errors.insert( - 0, - f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", - ) - return { - "valid": False, - "error": all_errors, - "error_type": "parallel_function_checker_no_order:cannot_find_match", - } - - return {"valid": True, "error": []} - - -def patten_matcher(exec_output, expected_result, function_call, is_sanity_check): - result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} - - if type(exec_output) != type(expected_result): - return { - "valid": False, - "error": [ - f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}." - ], - "error_type": "executable_checker:wrong_result_type", - "model_executed_output": exec_output, - } - if type(exec_output) == dict: - # We loose the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one. - # This happens when the key is a timestamp or a random number. - if is_sanity_check: - if len(exec_output) != len(expected_result): - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. 
Expected length: {len(expected_result)}, but got: {len(exec_output)}." - ], - "error_type": "executable_checker:wrong_result_type:dict_length", - "model_executed_output": exec_output, - } - else: - return result - - for key, value in expected_result.items(): - if key not in exec_output: - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output." - ], - "error_type": "executable_checker:wrong_result_type:dict_key_not_found", - "model_executed_output": exec_output, - } - for key, value in exec_output.items(): - if key not in expected_result: - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output." - ], - "error_type": "executable_checker:wrong_result_type:dict_extra_key", - "model_executed_output": exec_output, - } - if type(exec_output) == list: - if len(exec_output) != len(expected_result): - return { - "valid": False, - "error": [ - f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." - ], - "error_type": "executable_checker:wrong_result_type:list_length", - "model_executed_output": exec_output, - } - return result - - -#### Helper functions for Exec #### -def executable_checker_simple( - function_call: str, - expected_result, - expected_result_type: str, - is_sanity_check=False, -): - result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} - - exec_dict = {} - - try: - exec( - "from executable_python_function import *" + "\nresult=" + function_call, - exec_dict, - ) - exec_output = exec_dict["result"] - except NoAPIKeyError as e: - raise e - except Exception as e: - result["valid"] = False - result["error"].append( - f"Error in execution: {repr(function_call)}. Error: {str(e)}" - ) - result["error_type"] = "executable_checker:execution_error" - return result - - # We need to special handle the case where the execution result is a tuple and convert it to a list - # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json - if isinstance(exec_output, tuple): - exec_output = list(exec_output) - - if expected_result_type == "exact_match": - if exec_output != expected_result: - result["valid"] = False - result["error"].append( - f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}." - ) - result["error_type"] = "executable_checker:wrong_result" - result["model_executed_output"] = exec_output - return result - - elif expected_result_type == "real_time_match": - # Allow for 5% difference - if (type(expected_result) == float or type(expected_result) == int) and ( - type(exec_output) == float or type(exec_output) == int - ): - if not ( - expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE) - <= exec_output - <= expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE) - ): - result["valid"] = False - result["error"].append( - f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed." 
- ) - result["error_type"] = "executable_checker:wrong_result_real_time" - result["model_executed_output"] = exec_output - return result - else: - result["valid"] = False - result["error"].append( - f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria." - ) - result["error_type"] = "executable_checker:wrong_result_real_time" - result["model_executed_output"] = exec_output - return result - - else: - # structural match - pattern_match_result = patten_matcher( - exec_output, expected_result, function_call, is_sanity_check - ) - if not pattern_match_result["valid"]: - return pattern_match_result - - return result - - -def executable_checker_parallel_no_order( - decoded_result: list, expected_exec_result: list, expected_exec_result_type: list -): - - if len(decoded_result) != len(expected_exec_result): - return { - "valid": False, - "error": [ - f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}." - ], - "error_type": "value_error:exec_result_count", - } - - matched_indices = [] - for i in range(len(expected_exec_result)): - all_errors = [] - for index in range(len(decoded_result)): - if index in matched_indices: - continue - - result = executable_checker_simple( - decoded_result[index], - expected_exec_result[i], - expected_exec_result_type[i], - False, - ) - - if result["valid"]: - matched_indices.append(index) - break - else: - all_errors.append( - { - f"Model Result Index {index}": { - "sub_error": result["error"], - "sub_error_type": result["error_type"], - "model_executed_output": ( - result["model_executed_output"] - if "model_executed_output" in result - else None - ), - } - } - ) - - if not result["valid"]: - considered_indices = [ - i for i in range(len(decoded_result)) if i not in matched_indices - ] - all_errors.insert( - 0, - f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", - ) - return { - "valid": False, - "error": all_errors, - "error_type": "executable_checker:cannot_find_match", - } - - return {"valid": True, "error": [], "error_type": "executable_checker:unclear"} - - -#### Main function #### -def executable_checker_rest(func_call, idx): - if "https://geocode.maps.co" in func_call: - time.sleep(2) - if "requests_get" in func_call: - func_call = func_call.replace("requests_get", "requests.get") - try: - response = eval(func_call) - except Exception as e: - return { - "valid": False, - "error": [f"Execution failed. 
{str(e)}"], - "error_type": "executable_checker_rest:execution_error", - } - - try: - if response.status_code == 200: - - eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx]) - try: - if isinstance(eval_GT_json, dict): - if isinstance(response.json(), dict): - if set(eval_GT_json.keys()) == set(response.json().keys()): - return {"valid": True, "error": [], "error_type": ""} - return { - "valid": False, - "error": ["Key inconsistency"], - "error_type": "executable_checker_rest:wrong_key", - } - return { - "valid": False, - "error": [ - f"Expected dictionary, but got {type(response.json())}" - ], - "error_type": "executable_checker_rest:wrong_type", - } - - elif isinstance(eval_GT_json, list): - if isinstance(response.json(), list): - if len(eval_GT_json) != len(response.json()): - return { - "valid": False, - "error": [f"Response list length inconsistency."], - "error_type": "value_error:exec_result_rest_count", - } - - else: - for i in range(len(eval_GT_json)): - if set(eval_GT_json[i].keys()) != set( - response.json()[i].keys() - ): - return { - "valid": False, - "error": [f"Key inconsistency"], - "error_type": "executable_checker_rest:wrong_key", - } - - return {"valid": True, "error": []} - else: - return { - "valid": False, - "error": [ - f"Expected list, but got {type(response.json())}" - ], - "error_type": "executable_checker_rest:wrong_type", - } - return { - "valid": False, - "error": [ - f"Expected dict or list, but got {type(response.json())}" - ], - "error_type": "executable_checker_rest:wrong_type", - } - except Exception as e: - return { - "valid": False, - "error": [ - f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}" - ], - "error_type": "executable_checker_rest:response_format_error", - } - else: - return { - "valid": False, - "error": [ - f"Execution result status code is not 200, got {response.status_code}" - ], - "error_type": "executable_checker_rest:wrong_status_code", - } - except Exception as e: - return { - "valid": False, - "error": [f"Cannot get status code of the response. 
Error: {str(e)}"], - "error_type": "executable_checker_rest:cannot_get_status_code", - } - - -def ast_checker( - func_description, model_output, possible_answer, language, test_category, model_name -): - if "multiple" in test_category or "parallel" in test_category: - # Some formatting issues that needs to be handled - if test_category == "parallel_function": - func_description = [func_description] - - return parallel_function_checker_no_order( - func_description, model_output, possible_answer, language, model_name - ) - - else: - if len(model_output) != 1: - return { - "valid": False, - "error": ["Wrong number of functions."], - "error_type": "simple_function_checker:wrong_count", - } - model_output = model_output[0] - return simple_function_checker( - func_description, model_output, possible_answer, language, model_name - ) - - -def exec_checker(decoded_result: list, func_description: dict, test_category: str): - if "multiple" in test_category or "parallel" in test_category: - return executable_checker_parallel_no_order( - decoded_result, - func_description["execution_result"], - func_description["execution_result_type"], - ) - - else: - if len(decoded_result) != 1: - return { - "valid": False, - "error": ["Wrong number of functions."], - "error_type": "simple_exec_checker:wrong_count", - } - return executable_checker_simple( - decoded_result[0], - func_description["execution_result"][0], - func_description["execution_result_type"][0], - False, - ) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/constants.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/constants.py deleted file mode 100644 index fe11bcead..000000000 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/constants.py +++ /dev/null @@ -1,18 +0,0 @@ -REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 - -FILENAME_INDEX_MAPPING = { - "executable_parallel_function": (0, 49), - "parallel_multiple_function": (50, 249), - "executable_simple": (250, 349), - "rest": (350, 419), - "sql": (420, 519), - "parallel_function": (520, 719), - "chatable": (720, 919), - "java": (920, 1019), - "javascript": (1020, 1069), - "executable_multiple_function": (1070, 1119), - "simple": (1120, 1519), - "relevance": (1520, 1759), - "executable_parallel_multiple_function": (1760, 1799), - "multiple_function": (1800, 1999), -} diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py deleted file mode 100644 index 3504862d8..000000000 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py +++ /dev/null @@ -1,10 +0,0 @@ -class NoAPIKeyError(Exception): - def __init__(self): - self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." 
- super().__init__(self.message) - - -class BadAPIStatusError(Exception): - def __init__(self, errors, error_rate): - self.errors = errors - self.error_rate = error_rate \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py deleted file mode 100644 index dd45c5dd4..000000000 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ /dev/null @@ -1,535 +0,0 @@ -import sys - -sys.path.append("../") - -from checker import ast_checker, exec_checker, executable_checker_rest -from custom_exception import BadAPIStatusError -from eval_runner_helper import * -from tqdm import tqdm -import argparse - - -# NOTE: This file should be run in the `eval_checker` directory - - -def single_executable_file_runner( - handler, model_result, prompt, model_name, test_category -): - assert len(model_result) == len(prompt) - - result = [] - correct_count = 0 - for i in tqdm(range(len(model_result)), desc="Running tests"): - raw_result = model_result[i]["result"] - try: - decoded_result = handler.decode_execute(raw_result) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Failed to decode executable. {str(e)}"], - "error_type": "executable_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": raw_result, - } - ) - continue - - if "rest" in test_category: - # REST is always single-functioned. Therefore we take the first one and pass it to the REST checker. - if not is_rest_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "executable_decoder:rest_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - checker_result = executable_checker_rest(decoded_result[0], i) - - else: - if not is_executable_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." 
- ], - "error_type": "executable_decoder:wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - prompt_item = prompt[i] - checker_result = exec_checker(decoded_result, prompt_item, test_category) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = raw_result - temp["model_result_decoded"] = decoded_result - if "model_executed_output" in checker_result: - temp["model_executed_output"] = checker_result["model_executed_output"] - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_relevance_file_runner(handler, model_result, model_name, test_category): - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - success = False - decoded_result = None - - try: - decoded_result = handler.decode_ast(model_result_item, language="Python") - success = False - if is_empty_output(decoded_result): - success = True - - except Exception as e: - success = True - - if success: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = success - temp["error"] = [ - f"Valid syntax. Successfully decode AST when it should not." - ] - temp["error_type"] = "relevance_error:decoder_success" - temp["model_result"] = model_result_item - temp["decoded_result"] = decoded_result - - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_ast_file_runner( - handler, model_result, prompt, possible_answer, language, test_category, model_name -): - assert ( - len(model_result) == len(prompt) == len(possible_answer) - ), "The length of the model result does not match the length of the prompt or possible answer. Please check the input files for completeness." - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - prompt_item = prompt[i]["function"] - possible_answer_item = possible_answer[i] - - try: - model_result_item_raw = model_result_item - model_result_item = handler.decode_ast(model_result_item, language) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Invalid syntax. Failed to decode AST. 
{str(e)}"], - "error_type": "ast_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": model_result_item_raw, - "possible_answer": possible_answer_item, - } - ) - continue - - decoder_output_valid = is_function_calling_format_output(model_result_item) - if not decoder_output_valid: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "ast_decoder:decoder_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(model_result_item_raw), - "model_result_decoded": str(model_result_item), - "possible_answer": possible_answer_item, - } - ) - continue - - checker_result = ast_checker( - prompt_item, - model_result_item, - possible_answer_item, - language, - test_category, - model_name, - ) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = model_result_item_raw - temp["model_result_decoded"] = model_result_item - temp["possible_answer"] = possible_answer_item - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -#### Main runner function #### -def runner(model_names, test_categories, api_sanity_check): - - # A flag to indicate if the API has been tested. - # We should always test the API with ground truth first before running the executable tests. - # Sometimes the API may not be working as expected and we want to catch that before running the evaluation to ensure the results are accurate. - API_TESTED = False - API_STATUS_ERROR_REST = None - API_STATUS_ERROR_EXECUTABLE = None - - # Before running the executable evaluation, we need to get the expected output from the ground truth. - # So we need a list of all the test categories that we have ran the ground truth evaluation on. - # We only get the expected output once for each test category. - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] - - # Get a list of all entries in the folder - entries = os.scandir(INPUT_PATH) - - # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] - - # Traverse each subdirectory - for subdir in subdirs: - - model_name = subdir.split(INPUT_PATH)[1] - if model_names is not None and model_name not in model_names: - continue - - model_name_escaped = model_name.replace("_", "/") - - files = [ - f - for f in os.listdir(subdir) - if os.path.isfile(os.path.join(subdir, f)) and not f.startswith(".") - ] - # Check if there is only one file and that file is 'result.json' - # If so, this is an OSS model result file and we need to special process it first - if len(files) == 1 and files[0] == "result.json": - result_json_file_path = os.path.join(subdir, "result.json") - oss_file_formatter(result_json_file_path, subdir) - print( - f"Detected OSS model: {model_name}. 
result.json has been split into individual test category files." - ) - - # Pattern to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - - print(f"🦍 Model: {model_name}") - - # Find and process all JSON files in the subdirectory - for model_result_json in glob.glob(json_files_pattern): - - if os.path.basename(model_result_json) == "result.json": - continue - - test_category = extract_after_test(model_result_json) - if test_categories is not None and test_category not in test_categories: - continue - - handler = get_handler(model_name_escaped) - - # We don't evaluate chatable and SQL models in our current leaderboard - if is_chatable(test_category) or is_sql(test_category): - continue - - language = "Python" - if is_java(test_category): - language = "Java" - if is_js(test_category): - language = "JavaScript" - - print(f"🔍 Running test: {test_category}") - - model_result = load_file(model_result_json) - record_cost_latency(LEADERBOARD_TABLE, model_name, model_result) - - if is_relevance(test_category): - accuracy, total_count = single_relevance_file_runner( - handler, model_result, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - continue - - # Find the corresponding test file - prompt_file = find_file_with_suffix(PROMPT_PATH, test_category) - prompt = load_file(prompt_file) - - if is_executable(test_category): - # We only test the API with ground truth once - if not API_TESTED and api_sanity_check: - print("---- Sanity checking API status ----") - try: - api_status_sanity_check_rest() - except BadAPIStatusError as e: - API_STATUS_ERROR_REST = e - - try: - api_status_sanity_check_executable() - except BadAPIStatusError as e: - API_STATUS_ERROR_EXECUTABLE = e - - display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=True) - print("Continuing evaluation...") - - API_TESTED = True - - if ( - test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - and not is_rest(test_category) - ): - print( - f"---- Getting real-time execution result from ground truth for {test_category} ----" - ) - get_executable_expected_output(prompt_file) - print( - f"---- Ground truth real-time execution result obtained for {test_category} 🌟 ----" - ) - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN.append(test_category) - # Need to re-load the prompt file after getting the expected output, as the prompt file has been updated - prompt = load_file(prompt_file) - - accuracy, total_count = single_executable_file_runner( - handler, model_result, prompt, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - continue - - # Find the corresponding possible answer file - possible_answer_file = find_file_with_suffix( - POSSIBLE_ANSWER_PATH, test_category - ) - possible_answer = load_file(possible_answer_file) - accuracy, total_count = single_ast_file_runner( - handler, - model_result, - prompt, - possible_answer, - language, - test_category, - model_name, - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - # This function reads all the score files from local folder and updates the leaderboard table. 
- # This is helpful when you only want to run the evaluation for a subset of models and test categories. - update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) - # Write the leaderboard table to a file - generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH) - - # Clean up the executable expected output files - # They should be re-generated the next time the evaluation is run - clean_up_executable_expected_output( - PROMPT_PATH, EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - ) - - display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=False) - - print(f"🏁 Evaluation completed. See {os.path.abspath(OUTPUT_PATH + 'data.csv')} for evaluation results.") - - -ARG_PARSE_MAPPING = { - "ast": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - ], - "executable": [ - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "all": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "non-python": [ - "java", - "javascript", - ], - "python": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], -} - - -INPUT_PATH = "../result/" -PROMPT_PATH = "../data/" -POSSIBLE_ANSWER_PATH = "../data/possible_answer/" -OUTPUT_PATH = "../score/" - -# A dictionary to store the results -# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count -LEADERBOARD_TABLE = {} - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Process two lists of strings.") - - # Add arguments for two lists of strings - parser.add_argument( - "--model", nargs="+", type=str, help="A list of model names to evaluate" - ) - parser.add_argument( - "--test-category", - nargs="+", - type=str, - help="A list of test categories to run the evaluation on", - ) - parser.add_argument( - "-c", - "--api-sanity-check", - action="store_true", - default=False, # Default value is False, meaning the sanity check is skipped unless the flag is specified - help="Perform the REST API status sanity check before running the evaluation. By default, the sanity check is skipped.", - ) - - args = parser.parse_args() - - api_sanity_check = args.api_sanity_check - test_categories = None - if args.test_category is not None: - test_categories = [] - for test_category in args.test_category: - if test_category in ARG_PARSE_MAPPING: - test_categories.extend(ARG_PARSE_MAPPING[test_category]) - else: - test_categories.append(test_category) - - model_names = args.model - if args.model is not None: - model_names = [] - for model_name in args.model: - # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. - # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). - # We patch it here to avoid confusing the user. 
- model_names.append(model_name.replace("/", "_")) - - runner(model_names, test_categories, api_sanity_check) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py deleted file mode 100644 index 83e1e8917..000000000 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ /dev/null @@ -1,1030 +0,0 @@ -import glob -import json -import os -import statistics -import subprocess - -import numpy as np -from custom_exception import BadAPIStatusError -from model_handler.handler_map import handler_map -from tqdm import tqdm -from eval_checker_constant import FILENAME_INDEX_MAPPING - -REST_API_GROUND_TRUTH_FILE_PATH = "api_status_check_ground_truth_REST.json" -EXECTUABLE_API_GROUND_TRUTH_FILE_PATH = "api_status_check_ground_truth_executable.json" - -COLUMNS = [ - "Rank", - "Overall Acc", - "Model", - "Model Link", - "Organization", - "License", - "AST Summary", - "Exec Summary", - "Simple Function AST", - "Python Simple Function AST", - "Java Simple Function AST", - "JavaScript Simple Function AST", - "Multiple Functions AST", - "Parallel Functions AST", - "Parallel Multiple AST", - "Simple Function Exec", - "Python Simple Function Exec", - "REST Simple Function Exec", - "Multiple Functions Exec", - "Parallel Functions Exec", - "Parallel Multiple Exec", - "Relevance Detection", - "Cost ($ Per 1k Function Calls)", - "Latency Mean (s)", - "Latency Standard Deviation (s)", - "Latency 95th Percentile (s)", -] - -MODEL_METADATA_MAPPING = { - "gpt-4o-2024-05-13-FC": [ - "GPT-4o-2024-05-13 (FC)", - "https://openai.com/index/hello-gpt-4o/", - "OpenAI", - "Proprietary", - ], - "gpt-4o-2024-05-13": [ - "GPT-4o-2024-05-13 (Prompt)", - "https://openai.com/index/hello-gpt-4o/", - "OpenAI", - "Proprietary", - ], - "gpt-4-1106-preview-FC": [ - "GPT-4-1106-Preview (FC)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-1106-preview": [ - "GPT-4-1106-Preview (Prompt)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-0125-preview-FC": [ - "GPT-4-0125-Preview (FC)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-0125-preview": [ - "GPT-4-0125-Preview (Prompt)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-turbo-2024-04-09-FC": [ - "GPT-4-turbo-2024-04-09 (FC)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-turbo-2024-04-09": [ - "GPT-4-turbo-2024-04-09 (Prompt)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gorilla-openfunctions-v2": [ - "Gorilla-OpenFunctions-v2 (FC)", - "https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html", - "Gorilla LLM", - "Apache 2.0", - ], - "claude-3-opus-20240229-FC": [ - "Claude-3-Opus-20240229 (FC tools-2024-04-04)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-opus-20240229": [ - "Claude-3-Opus-20240229 (Prompt)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "mistral-medium-2312": [ - "Mistral-Medium-2312 (Prompt)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-small-2402": [ - 
"Mistral-Small-2402 (Prompt)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-large-2402": [ - "Mistral-Large-2402 (Prompt)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "claude-3-sonnet-20240229-FC": [ - "Claude-3-Sonnet-20240229 (FC tools-2024-04-04)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-sonnet-20240229": [ - "Claude-3-Sonnet-20240229 (Prompt)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-haiku-20240307-FC": [ - "Claude-3-Haiku-20240307 (FC tools-2024-04-04)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-haiku-20240307": [ - "Claude-3-Haiku-20240307 (Prompt)", - "https://www.anthropic.com/news/claude-3-family", - "Anthropic", - "Proprietary", - ], - "claude-3-5-sonnet-20240620-FC": [ - "Claude-3.5-Sonnet-20240620 (FC)", - "https://www.anthropic.com/news/claude-3-5-sonnet", - "Anthropic", - "Proprietary", - ], - "claude-3-5-sonnet-20240620": [ - "Claude-3.5-Sonnet-20240620 (Prompt)", - "https://www.anthropic.com/news/claude-3-5-sonnet", - "Anthropic", - "Proprietary", - ], - "gpt-3.5-turbo-0125-FC": [ - "GPT-3.5-Turbo-0125 (FC)", - "https://platform.openai.com/docs/models/gpt-3-5-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-3.5-turbo-0125": [ - "GPT-3.5-Turbo-0125 (Prompting)", - "https://platform.openai.com/docs/models/gpt-3-5-turbo", - "OpenAI", - "Proprietary", - ], - "meetkai/functionary-small-v2.2-FC": [ - "Functionary-Small-v2.2 (FC)", - "https://huggingface.co/meetkai/functionary-small-v2.2", - "MeetKai", - "MIT", - ], - "meetkai/functionary-medium-v2.2-FC": [ - "Functionary-Medium-v2.2 (FC)", - "https://huggingface.co/meetkai/functionary-medium-v2.2", - "MeetKai", - "MIT", - ], - "meetkai/functionary-small-v2.4-FC": [ - "Functionary-Small-v2.4 (FC)", - "https://huggingface.co/meetkai/functionary-small-v2.4", - "MeetKai", - "MIT", - ], - "meetkai/functionary-medium-v2.4-FC": [ - "Functionary-Medium-v2.4 (FC)", - "https://huggingface.co/meetkai/functionary-medium-v2.4", - "MeetKai", - "MIT", - ], - "claude-2.1": [ - "Claude-2.1 (Prompt)", - "https://www.anthropic.com/news/claude-2-1", - "Anthropic", - "Proprietary", - ], - "mistral-tiny-2312": [ - "Mistral-tiny-2312 (Prompt)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "claude-instant-1.2": [ - "Claude-instant-1.2 (Prompt)", - "https://www.anthropic.com/news/releasing-claude-instant-1-2", - "Anthropic", - "Proprietary", - ], - "mistral-small-2402-FC-Auto": [ - "Mistral-small-2402 (FC Auto)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-large-2402-FC-Any": [ - "Mistral-large-2402 (FC Any)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-small-2402-FC-Any": [ - "Mistral-small-2402 (FC Any)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "mistral-large-2402-FC-Auto": [ - "Mistral-large-2402 (FC Auto)", - "https://docs.mistral.ai/guides/model-selection/", - "Mistral AI", - "Proprietary", - ], - "Nexusflow-Raven-v2": [ - "Nexusflow-Raven-v2 (FC)", - "https://huggingface.co/Nexusflow/NexusRaven-V2-13B", - "Nexusflow", - "Apache 2.0", - ], - "firefunction-v1-FC": [ - "FireFunction-v1 (FC)", - "https://huggingface.co/fireworks-ai/firefunction-v1", - 
"Fireworks", - "Apache 2.0", - ], - "firefunction-v2-FC": [ - "FireFunction-v2 (FC)", - "https://huggingface.co/fireworks-ai/firefunction-v2", - "Fireworks", - "Apache 2.0", - ], - "gemini-1.5-pro-preview-0514": [ - "Gemini-1.5-Pro-Preview-0514 (FC)", - "https://deepmind.google/technologies/gemini/pro/", - "Google", - "Proprietary", - ], - "gemini-1.5-flash-preview-0514": [ - "Gemini-1.5-Flash-Preview-0514 (FC)", - "https://deepmind.google/technologies/gemini/flash/", - "Google", - "Proprietary", - ], - "gemini-1.5-pro-preview-0409": [ - "Gemini-1.5-Pro-Preview-0409 (FC)", - "https://deepmind.google/technologies/gemini/#introduction", - "Google", - "Proprietary", - ], - "gemini-1.0-pro": [ - "Gemini-1.0-Pro-001 (FC)", - "https://deepmind.google/technologies/gemini/#introduction", - "Google", - "Proprietary", - ], - "gpt-4-0613-FC": [ - "GPT-4-0613 (FC)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "gpt-4-0613": [ - "GPT-4-0613 (Prompt)", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "OpenAI", - "Proprietary", - ], - "deepseek-ai/deepseek-coder-6.7b-instruct": [ - "Deepseek-v1.5 (Prompt)", - "https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5", - "Deepseek", - "Deepseek License", - ], - "google/gemma-7b-it": [ - "Gemma-7b-it (Prompt)", - "https://blog.google/technology/developers/gemma-open-models/", - "Google", - "gemma-terms-of-use", - ], - "glaiveai/glaive-function-calling-v1": [ - "Glaive-v1 (FC)", - "https://huggingface.co/glaiveai/glaive-function-calling-v1", - "Glaive", - "cc-by-sa-4.0", - ], - "databricks-dbrx-instruct": [ - "DBRX-Instruct (Prompt)", - "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm", - "Databricks", - "Databricks Open Model", - ], - "NousResearch/Hermes-2-Pro-Mistral-7B": [ - "Hermes-2-Pro-Mistral-7B (FC)", - "https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B", - "NousResearch", - "apache-2.0", - ], - "meta-llama/Meta-Llama-3-8B-Instruct": [ - "Meta-Llama-3-8B-Instruct (Prompt)", - "https://llama.meta.com/llama3", - "Meta", - "Meta Llama 3 Community", - ], - "meta-llama/Meta-Llama-3-70B-Instruct": [ - "Meta-Llama-3-70B-Instruct (Prompt)", - "https://llama.meta.com/llama3", - "Meta", - "Meta Llama 3 Community", - ], - "command-r-plus-FC": [ - "Command-R-Plus (FC) (Original)", - "https://txt.cohere.com/command-r-plus-microsoft-azure", - "Cohere For AI", - "cc-by-nc-4.0", - ], - "command-r-plus": [ - "Command-R-Plus (Prompt) (Original)", - "https://txt.cohere.com/command-r-plus-microsoft-azure", - "Cohere For AI", - "cc-by-nc-4.0", - ], - "command-r-plus-FC-optimized": [ - "Command-R-Plus (FC) (Optimized)", - "https://txt.cohere.com/command-r-plus-microsoft-azure", - "Cohere For AI", - "cc-by-nc-4.0", - ], - "command-r-plus-optimized": [ - "Command-R-Plus (Prompt) (Optimized)", - "https://txt.cohere.com/command-r-plus-microsoft-azure", - "Cohere For AI", - "cc-by-nc-4.0", - ], - "snowflake/arctic": [ - "Snowflake/snowflake-arctic-instruct (Prompt)", - "https://huggingface.co/Snowflake/snowflake-arctic-instruct", - "Snowflake", - "apache-2.0", - ], - "nvidia/nemotron-4-340b-instruct": [ - "Nemotron-4-340b-instruct (Prompt)", - "https://huggingface.co/nvidia/nemotron-4-340b-instruct", - "NVIDIA", - "nvidia-open-model-license" - ], - "THUDM/glm-4-9b-chat": [ - "GLM-4-9b-Chat (FC)", - "https://huggingface.co/THUDM/glm-4-9b-chat", - "THUDM", - "glm-4" - ] -} - -INPUT_PRICE_PER_MILLION_TOKEN = { - "claude-3-opus-20240229-FC": 15, - 
"claude-3-opus-20240229": 15, - "claude-3-sonnet-20240229-FC": 3, - "claude-3-sonnet-20240229": 3, - "claude-3-haiku-20240307-FC": 0.25, - "claude-3-haiku-20240307": 0.25, - "claude-3-5-sonnet-20240620-FC": 3, - "claude-3-5-sonnet-20240620": 3, - "claude-2.1": 8, - "claude-instant-1.2": 0.8, - "mistral-large-2402-FC-Any": 4, - "mistral-large-2402-FC-Auto": 4, - "mistral-medium-2312": 2.7, - "mistral-small-2402-FC-Any": 1, - "mistral-small-2402-FC-Auto": 1, - "mistral-small-2402": 1, - "mistral-tiny-2312": 0.25, - "gpt-4o-2024-05-13-FC": 5, - "gpt-4o-2024-05-13": 5, - "gpt-4-1106-preview-FC": 10, - "gpt-4-1106-preview": 10, - "gpt-4-0125-preview": 10, - "gpt-4-0125-preview-FC": 10, - "gpt-4-turbo-2024-04-09-FC": 10, - "gpt-4-turbo-2024-04-09": 10, - "gpt-4-0613": 30, - "gpt-4-0613-FC": 30, - "gpt-3.5-turbo-0125": 0.5, - "gpt-3.5-turbo-0125-FC": 0.5, - "gemini-1.0-pro": 0.5, - "gemini-1.5-pro-preview-0409": 3.5, - "gemini-1.5-pro-preview-0514": 3.5, - "gemini-1.5-flash-preview-0514": 0.35, - "databricks-dbrx-instruct": 2.25, - "command-r-plus-FC": 3, - "command-r-plus": 3, - "command-r-plus-FC-optimized": 3, - "command-r-plus-optimized": 3, -} - -OUTPUT_PRICE_PER_MILLION_TOKEN = { - "claude-3-opus-20240229-FC": 75, - "claude-3-opus-20240229": 75, - "claude-3-sonnet-20240229-FC": 15, - "claude-3-sonnet-20240229": 15, - "claude-3-5-sonnet-20240620-FC": 15, - "claude-3-5-sonnet-20240620": 15, - "claude-3-haiku-20240307-FC": 1.25, - "claude-3-haiku-20240307": 1.25, - "claude-2.1": 24, - "claude-instant-1.2": 2.4, - "mistral-large-2402-FC-Any": 12, - "mistral-large-2402-FC-Auto": 12, - "mistral-small-2402": 3, - "mistral-medium-2312": 8.1, - "mistral-small-2402-FC-Any": 3, - "mistral-small-2402-FC-Auto": 3, - "mistral-tiny-2312": 0.25, - "gpt-4o-2024-05-13-FC": 15, - "gpt-4o-2024-05-13": 15, - "gpt-4-turbo-2024-04-09-FC": 30, - "gpt-4-turbo-2024-04-09": 30, - "gpt-4-1106-preview": 30, - "gpt-4-1106-preview-FC": 30, - "gpt-4-0125-preview-FC": 30, - "gpt-4-0125-preview": 30, - "gpt-4-0613": 60, - "gpt-4-0613-FC": 60, - "gpt-3.5-turbo-0125": 1.5, - "gpt-3.5-turbo-0125-FC": 1.5, - "gemini-1.0-pro": 1.5, - "gemini-1.5-pro-preview-0409": 10.50, - "gemini-1.5-pro-preview-0514": 10.50, - "gemini-1.5-flash-preview-0514": 0.53, - "databricks-dbrx-instruct": 6.75, - "command-r-plus-FC": 15, - "command-r-plus": 15, - "command-r-plus-FC-optimized": 15, - "command-r-plus-optimized": 15, -} - -# The latency of the open-source models are hardcoded here. -# Because we do batching when generating the data, so the latency is not accurate from the result data. -# This is the latency for the whole batch of data, when using 8 V100 GPUs. 
-OSS_LATENCY = { - "deepseek-ai/deepseek-coder-6.7b-instruct": 909, - "google/gemma-7b-it": 95, - "NousResearch/Hermes-2-Pro-Mistral-7B": 135, - "meta-llama/Meta-Llama-3-8B-Instruct": 73, - "meta-llama/Meta-Llama-3-70B-Instruct": 307, - "gorilla-openfunctions-v2": 83, - "THUDM/glm-4-9b-chat": 223 -} - - -NO_COST_MODELS = [ - "Nexusflow-Raven-v2", - "firefunction-v1-FC", - "firefunction-v2-FC", - "meetkai/functionary-medium-v2.4-FC", - "meetkai/functionary-small-v2.2-FC", - "meetkai/functionary-small-v2.4-FC", - "snowflake/arctic", - "nvidia/nemotron-4-340b-instruct", - "THUDM/glm-4-9b-chat", -] - -# Price got from AZure, 22.032 per hour for 8 V100, Pay As You Go Total Price -# Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/ -V100_x8_PRICE_PER_HOUR = 22.032 - - -def extract_after_test(input_string): - parts = input_string.split("_test_")[1].split("_result")[0].split(".json")[0] - return parts - - -def find_file_with_suffix(folder_path, suffix): - json_files_pattern = os.path.join(folder_path, "*.json") - for json_file in glob.glob(json_files_pattern): - if extract_after_test(json_file) == suffix: - return json_file - - -def is_executable(test_category): - return "executable" in test_category or "rest" in test_category - - -def is_rest(test_category): - return "rest" in test_category - - -def is_relevance(test_category): - return "relevance" in test_category - - -def is_chatable(test_category): - return "chatable" in test_category - - -def is_java(test_category): - return "java" in test_category - - -def is_js(test_category): - return "javascript" in test_category - - -def is_sql(test_category): - return "sql" in test_category - - -def load_file(file_path): - result = [] - with open(file_path) as f: - file = f.readlines() - for line in file: - result.append(json.loads(line)) - return result - - -def get_handler(model_name): - return handler_map[model_name](model_name) - - -def write_list_of_dicts_to_file(filename, data, subdir=None): - if subdir: - # Ensure the subdirectory exists - os.makedirs(subdir, exist_ok=True) - - # Construct the full path to the file - filename = os.path.join(subdir, filename) - - # Write the list of dictionaries to the file in JSON format - with open(filename, "w") as f: - for i, entry in enumerate(data): - json_str = json.dumps(entry) - f.write(json_str) - if i < len(data) - 1: - f.write("\n") - - -def is_function_calling_format_output(decoded_output): - # Ensure the output is a list of dictionaries - if type(decoded_output) == list: - for item in decoded_output: - if type(item) != dict: - return False - return True - return False - - -def is_executable_format_output(decoded_output): - # Ensure the output is a list of strings (one or more strings) - if type(decoded_output) == list: - if len(decoded_output) == 0: - return False - for item in decoded_output: - if type(item) != str: - return False - return True - return False - - -def is_rest_format_output(decoded_output): - # Ensure the output is a list of one string - if type(decoded_output) == list: - if len(decoded_output) == 1 and type(decoded_output[0]) == str: - return True - return False - - -def is_empty_output(decoded_output): - # This function is a patch to the ast decoder for relevance detection - # Sometimes the ast decoder will parse successfully, but the input doens't really have a function call - # [], [{}], and anything that is not in function calling format is considered empty (and thus should be marked as correct) - if not 
is_function_calling_format_output(decoded_output): - return True - if len(decoded_output) == 0: - return True - if len(decoded_output) == 1 and len(decoded_output[0]) == 0: - return True - - -def api_status_sanity_check_rest(): - - # We only need to import the executable_checker_rest in this function. So a local import is used. - from checker import executable_checker_rest - - ground_truth_dummy = load_file(REST_API_GROUND_TRUTH_FILE_PATH) - - # Use the ground truth data to make sure the API is working correctly - command = f"cd .. ; python apply_function_credential_config.py --input-file ./eval_checker/{REST_API_GROUND_TRUTH_FILE_PATH};" - try: - subprocess.run(command, shell=True, capture_output=True, text=True, check=True) - except subprocess.CalledProcessError as e: - write_list_of_dicts_to_file(REST_API_GROUND_TRUTH_FILE_PATH, ground_truth_dummy) - raise RuntimeError(e.stderr) from e - - ground_truth_replaced = load_file(REST_API_GROUND_TRUTH_FILE_PATH) - write_list_of_dicts_to_file(REST_API_GROUND_TRUTH_FILE_PATH, ground_truth_dummy) - - correct_count = 0 - errors = [] - for idx, data in tqdm( - enumerate(ground_truth_replaced), - total=len(ground_truth_replaced), - desc="API Status Test (REST)", - ): - status = executable_checker_rest(data["ground_truth"], idx) - if status["valid"]: - correct_count += 1 - else: - errors.append((data, status)) - - if correct_count != len(ground_truth_replaced): - raise BadAPIStatusError(errors, f"{len(ground_truth_replaced) - correct_count} / {len(ground_truth_replaced)}") - - -def api_status_sanity_check_executable(): - from checker import executable_checker_simple - - ground_truth = load_file(EXECTUABLE_API_GROUND_TRUTH_FILE_PATH) - correct_count = 0 - errors = [] - for data in tqdm( - ground_truth, total=len(ground_truth), desc="API Status Test (Non-REST)" - ): - status = executable_checker_simple( - data["ground_truth"][0], - data["execution_result"][0], - data["execution_result_type"][0], - True, - ) - if status["valid"]: - correct_count += 1 - else: - errors.append((data, status)) - - if correct_count != len(ground_truth): - raise BadAPIStatusError(errors, f"{len(ground_truth) - correct_count} / {len(ground_truth)}") - - -def display_api_status_error(rest_error, executable_error, display_success=False): - if not rest_error and not executable_error: - if display_success: - print("🟢 All API Status Test Passed!") - return None - - RED_FONT = "\033[91m" - RESET = "\033[0m" - - print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n") - - if rest_error: - print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test category (REST). Please contact API provider.\n") - print(f"{rest_error.error_rate} APIs affected:\n") - for data, status in rest_error.errors: - print(f" - Test Case: {data['ground_truth']}") - print(f" Error Type: {status['error_type']}\n") - - if executable_error: - print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test categories (Non-REST). Please contact API provider.\n") - print(f"{executable_error.error_rate} APIs affected:\n") - for data, status in executable_error.errors: - print(f" - Test Case: {data['ground_truth'][0]}") - print(f" Error Type: {status['error_type']}\n") - - print(f"{RED_FONT}{'-' * 100}\n{RESET}") - - -def get_executable_expected_output(prompt_file_path): - # Before we run the evaluation, we need to add the "execution_result" field to the prompt file, using the ground truth data. 
- prompt_content = load_file(prompt_file_path) - exec_dict = {} - for item in tqdm(prompt_content, desc="Getting Executable Expected Output"): - execution_result = [] - ground_truth = item["ground_truth"] - for i in range(len(ground_truth)): - exec( - "from executable_python_function import *" - + "\nresult=" - + ground_truth[i], - exec_dict, - ) - execution_result.append(exec_dict["result"]) - item["execution_result"] = execution_result - - write_list_of_dicts_to_file(prompt_file_path, prompt_content) - - -def clean_up_executable_expected_output(prompt_path, categories): - for category in categories: - prompt_file = find_file_with_suffix(prompt_path, category) - prompt_content = load_file(prompt_file) - for item in prompt_content: - del item["execution_result"] - write_list_of_dicts_to_file(prompt_file, prompt_content) - - -def calculate_weighted_accuracy(accuracy_dict_list): - total_count = 0 - total_accuracy = 0 - for accuracy_dict in accuracy_dict_list: - total_count += accuracy_dict["total_count"] - total_accuracy += accuracy_dict["accuracy"] * accuracy_dict["total_count"] - - if total_count == 0: - return {"accuracy": 0, "total_count": 0} - - return {"accuracy": total_accuracy / total_count, "total_count": total_count} - - -def calculate_unweighted_accuracy(accuracy_dict_list): - total_accuracy = 0 - for accuracy_dict in accuracy_dict_list: - total_accuracy += accuracy_dict["accuracy"] - - if len(accuracy_dict_list) == 0: - return {"accuracy": 0, "total_count": 0} - - return {"accuracy": total_accuracy / len(accuracy_dict_list), "total_count": 0} - - -def record_result(leaderboard_table, model_name, test_category, accuracy, total_count): - if model_name not in leaderboard_table: - leaderboard_table[model_name] = {} - leaderboard_table[model_name][test_category] = { - "accuracy": accuracy, - "total_count": total_count, - } - - -def record_cost_latency(leaderboard_table, model_name, model_output_data): - if model_name not in leaderboard_table: - leaderboard_table[model_name] = {} - leaderboard_table[model_name]["cost"] = {"input_data": [], "output_data": []} - leaderboard_table[model_name]["latency"] = {"data": []} - - input_token = [] - output_token = [] - latency = [] - for data in model_output_data: - if "latency" in data: - latency.append(data["latency"]) - if data["latency"] > 60: - print("*" * 100) - print( - f"❗️Warning: Latency for one of {model_name} response is {data['latency']}." 
- ) - print("*" * 100) - if "input_token_count" in data: - if data["input_token_count"] != 0: - input_token.append(data["input_token_count"]) - if "output_token_count" in data: - if data["output_token_count"] != 0: - output_token.append(data["output_token_count"]) - - leaderboard_table[model_name]["cost"]["input_data"].extend(input_token) - leaderboard_table[model_name]["cost"]["output_data"].extend(output_token) - leaderboard_table[model_name]["latency"]["data"].extend(latency) - - -def get_metric(model_name, cost_data, latency_data): - - cost, mean_latency, std_latency, percentile_95_latency = "N/A", "N/A", "N/A", "N/A" - - if ( - model_name in INPUT_PRICE_PER_MILLION_TOKEN - and len(cost_data["input_data"]) > 0 - and len(cost_data["output_data"]) > 0 - ): - - mean_input_token = statistics.mean(cost_data["input_data"]) - mean_output_token = statistics.mean(cost_data["output_data"]) - cost = ( - mean_input_token * INPUT_PRICE_PER_MILLION_TOKEN[model_name] - + mean_output_token * OUTPUT_PRICE_PER_MILLION_TOKEN[model_name] - ) / 1000 - cost = round(cost, 2) - - if model_name in OSS_LATENCY: - mean_latency, std_latency, percentile_95_latency = ( - OSS_LATENCY[model_name] / 1700, - "N/A", - "N/A", - ) - mean_latency = round(mean_latency, 2) - cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600 - cost = round(cost, 2) - - elif len(latency_data["data"]) != 0: - mean_latency = statistics.mean(latency_data["data"]) - std_latency = statistics.stdev(latency_data["data"]) - percentile_95_latency = np.percentile(latency_data["data"], 95) - mean_latency = round(mean_latency, 2) - std_latency = round(std_latency, 2) - percentile_95_latency = round(percentile_95_latency, 2) - - if model_name not in INPUT_PRICE_PER_MILLION_TOKEN: - cost = sum(latency_data["data"]) * V100_x8_PRICE_PER_HOUR / 3600 - cost = round(cost, 2) - - if model_name in NO_COST_MODELS: - cost = "N/A" - - return cost, mean_latency, std_latency, percentile_95_latency - - -def generate_leaderboard_csv(leaderboard_table, output_path): - data = [] - for model_name, value in leaderboard_table.items(): - model_name_escaped = model_name.replace("_", "/") - - python_simple_ast = value.get("simple", {"accuracy": 0, "total_count": 0}) - python_multiple_ast = value.get( - "multiple_function", {"accuracy": 0, "total_count": 0} - ) - python_parallel_ast = value.get( - "parallel_function", {"accuracy": 0, "total_count": 0} - ) - python_parallel_multiple_ast = value.get( - "parallel_multiple_function", {"accuracy": 0, "total_count": 0} - ) - python_simple_exec = value.get( - "executable_simple", {"accuracy": 0, "total_count": 0} - ) - python_multiple_exec = value.get( - "executable_multiple_function", {"accuracy": 0, "total_count": 0} - ) - python_parallel_exec = value.get( - "executable_parallel_function", {"accuracy": 0, "total_count": 0} - ) - python_parallel_multiple_exec = value.get( - "executable_parallel_multiple_function", {"accuracy": 0, "total_count": 0} - ) - java_simple_ast = value.get("java", {"accuracy": 0, "total_count": 0}) - javascript_simple_ast = value.get( - "javascript", {"accuracy": 0, "total_count": 0} - ) - rest_simple_exec = value.get("rest", {"accuracy": 0, "total_count": 0}) - relevance = value.get("relevance", {"accuracy": 0, "total_count": 0}) - - cost_data = value.get("cost", {"input_data": [], "output_data": []}) - latency_data = value.get("latency", {"data": []}) - - simple_ast = calculate_weighted_accuracy( - [python_simple_ast, java_simple_ast, javascript_simple_ast] - ) - multiple_ast = python_multiple_ast - 
parallel_ast = python_parallel_ast - parallel_multiple_ast = python_parallel_multiple_ast - simple_exec = calculate_weighted_accuracy( - [python_simple_exec, rest_simple_exec] - ) - multiple_exec = python_multiple_exec - parallel_exec = python_parallel_exec - parallel_multiple_exec = python_parallel_multiple_exec - - summary_ast = calculate_unweighted_accuracy( - [simple_ast, multiple_ast, parallel_ast, parallel_multiple_ast] - ) - summary_exec = calculate_unweighted_accuracy( - [simple_exec, multiple_exec, parallel_exec, parallel_multiple_exec] - ) - overall_accuracy = calculate_weighted_accuracy( - [ - simple_ast, - multiple_ast, - parallel_ast, - parallel_multiple_ast, - simple_exec, - multiple_exec, - parallel_exec, - parallel_multiple_exec, - relevance, - ] - ) - - cost, latency_mean, latency_std, percentile_95_latency = get_metric( - model_name_escaped, cost_data, latency_data - ) - - if overall_accuracy["total_count"] != 1700: - print("-" * 100) - print( - f"❗️Warning: Total count for {model_name} is {overall_accuracy['total_count']}" - ) - - data.append( - [ - "N/A", - overall_accuracy["accuracy"], - MODEL_METADATA_MAPPING[model_name_escaped][0], - MODEL_METADATA_MAPPING[model_name_escaped][1], - MODEL_METADATA_MAPPING[model_name_escaped][2], - MODEL_METADATA_MAPPING[model_name_escaped][3], - summary_ast["accuracy"], - summary_exec["accuracy"], - simple_ast["accuracy"], - python_simple_ast["accuracy"], - java_simple_ast["accuracy"], - javascript_simple_ast["accuracy"], - multiple_ast["accuracy"], - parallel_ast["accuracy"], - parallel_multiple_ast["accuracy"], - simple_exec["accuracy"], - python_simple_exec["accuracy"], - rest_simple_exec["accuracy"], - multiple_exec["accuracy"], - parallel_exec["accuracy"], - parallel_multiple_exec["accuracy"], - relevance["accuracy"], - cost, - latency_mean, - latency_std, - percentile_95_latency, - ] - ) - - data.sort(key=lambda x: x[1], reverse=True) - for i in range(len(data)): - data[i][0] = str(i + 1) - data[i][1] = "{:.2f}%".format(data[i][1] * 100) - for j in range(6, len(data[i]) - 4): - data[i][j] = "{:.2f}%".format(data[i][j] * 100) - for j in range(len(data[i]) - 4, len(data[i])): - data[i][j] = str(data[i][j]) - - data.insert(0, COLUMNS) - - filepath = os.path.join(output_path, "data.csv") - with open(filepath, "w") as f: - for i, row in enumerate(data): - if i < len(data) - 1: - f.write(",".join(row) + "\n") - else: - f.write(",".join(row)) - - -def update_leaderboard_table_with_score_file(leaderboard_table, score_path): - - entries = os.scandir(score_path) - - # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] - - # Traverse each subdirectory - for subdir in subdirs: - # Pattern to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - model_name = subdir.split(score_path)[1] - # Find and process all JSON files in the subdirectory - for model_score_json in glob.glob(json_files_pattern): - metadata = load_file(model_score_json)[0] - accuracy, total_count = metadata["accuracy"], metadata["total_count"] - test_category = model_score_json.split("_score.json")[0].split("/")[-1] - if model_name not in leaderboard_table: - leaderboard_table[model_name] = {} - if test_category not in leaderboard_table[model_name]: - leaderboard_table[model_name][test_category] = { - "accuracy": accuracy, - "total_count": total_count, - } - - -def oss_file_formatter(input_file_path, output_dir): - data = load_file(input_file_path) - assert len(data) == 2000, "OSS result.json 
file should have 2000 entries." - - for key, value in FILENAME_INDEX_MAPPING.items(): - start, end = value - output_file = os.path.join( - output_dir, f"gorilla_openfunctions_v1_test_{key}_result.json" - ) - with open(output_file, "w") as f: - original_idx = 0 - for i in range(start, end + 1): - new_json = {"id": original_idx, "result": data[i]["text"]} - f.write(json.dumps(new_json) + "\n") - original_idx += 1 - - -def collapse_json_objects(file_path): - with open(file_path, "r") as file: - content = file.read() - - objects = [] - depth = 0 - obj_start = 0 - for i, char in enumerate(content): - if char == "{": - if depth == 0: - obj_start = i - depth += 1 - elif char == "}": - depth -= 1 - if depth == 0: - obj = content[obj_start : i + 1] - objects.append(obj) - - with open(file_path, "w") as out_file: - for obj in objects: - json_obj = json.loads(obj) - compact_json = json.dumps(json_obj, separators=(",", ":")) - out_file.write(compact_json + "\n") \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_python_function.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_python_function.py deleted file mode 100644 index e1f5a4665..000000000 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_python_function.py +++ /dev/null @@ -1,883 +0,0 @@ -import json -import math -import requests -from custom_exception import NoAPIKeyError -import time - -api_key = {} -with open("../function_credential_config.json") as f: - data = json.loads(f.read()) - for item in data: - for k, v in item.items(): - if v == "": - raise NoAPIKeyError() - api_key[k] = v - - -def calculate_triangle_area(base, height): - """ - Calculates the area of a triangle. - Args: - base (integer): The base of the triangle. - height (integer): The height of the triangle. - """ - return base * height / 2 - - -def get_distance(pointA, pointB): - """ - Calculates the distance between two 2D points. - Args: - pointA (tuple): The first point. - pointB (tuple): The second point. - """ - return ((pointA[0] - pointB[0]) ** 2 + (pointA[1] - pointB[1]) ** 2) ** 0.5 - - -def math_factorial(n): - """ - Calculates the factorial of a number. - Args: - n (integer): The number to calculate the factorial of. - """ - result = 1 - for i in range(1, n + 1): - result *= i - return result - - -def quadratic_roots(a, b, c): - """ - Calculates the roots of a quadratic equation. - Args: - a (integer): The first coefficient. - b (integer): The second coefficient. - c (integer): The third coefficient. - Returns: - A list of roots, where each root is either a float or a dictionary - with 'real' and 'imaginary' parts for complex roots. - """ - discriminant = b**2 - 4 * a * c - if discriminant >= 0: - root1 = (-b + discriminant**0.5) / (2 * a) - root2 = (-b - discriminant**0.5) / (2 * a) - roots = [root1, root2] - else: - real_part = -b / (2 * a) - imaginary_part = (abs(discriminant) ** 0.5) / (2 * a) - roots = [ - {"real": real_part, "imaginary": imaginary_part}, - {"real": real_part, "imaginary": -imaginary_part}, - ] - - return roots - - -def geometry_area_circle(radius): - """ - Calculates the area of a circle. - Args: - radius (integer): The radius of the circle. - """ - return math.pi * radius**2 - - -def get_prime_factors(number): - """ - Calculates the prime factors of a number. - Args: - number (integer): The number to calculate the prime factors of. 
- """ - factors = [] - divisor = 2 - while number > 1: - while number % divisor == 0: - factors.append(divisor) - number /= divisor - divisor += 1 - return factors - - -def math_gcd(a, b): - """ - Calculates the greatest common divisor of two numbers. - Args: - a (integer): The first number. This should be the larger number. - b (integer): The second number. - """ - if b == 0: - return a - else: - return math_gcd(b, a % b) - - -def math_lcm(a, b): - """ - Calculates the least common multiple of two numbers. - Args: - a (integer): The first number. This should be the larger number. - b (integer): The second number. - """ - return a * b / math_gcd(a, b) - - -def calculate_final_velocity(initial_velocity, acceleration, time): - """ - Calculates the final velocity of an object. - Args: - initial_velocity (integer): The initial velocity of the object. - acceleration (integer): The acceleration of the object. - time (integer): The time the object has been moving. - """ - return initial_velocity + acceleration * time - - -def calculate_displacement(initial_velocity, acceleration, time): - """ - Calculates the displacement of an object. - Args: - initial_velocity (integer): The initial velocity of the object. - acceleration (integer): The acceleration of the object. - time (integer): The time the object has been moving. - """ - return initial_velocity * time + 0.5 * acceleration * time**2 - - -def calculate_electrostatic_potential_energy(charge, voltage): - """ - Calculates the electrostatic potential energy. - Args: - charge (integer): The charge of the object. - voltage (integer): The voltage of the object. - """ - return charge * voltage - - -def calculate_density(mass, volume): - """ - Calculates the density of an object. - Args: - mass (integer): The mass of the object. - volume (integer): The volume of the object. - """ - return mass / volume - - -def mat_mul(matA, matB): - """ - Multiplies two matrices. - Args: - matA (list): The first matrix. - matB (list): The second matrix. - """ - result = [[0 for i in range(len(matB[0]))] for j in range(len(matA))] - for i in range(len(matA)): - for j in range(len(matB[0])): - for k in range(len(matB)): - result[i][j] += matA[i][k] * matB[k][j] - return result - - -def calculate_mean(numbers): - """ - Calculates the mean of a list of numbers. - Args: - numbers (list): The list of numbers. - """ - return sum(numbers) / len(numbers) - - -def calculate_standard_deviation(numbers): - """ - Calculates the standard deviation of a list of numbers. - Args: - numbers (list): The list of numbers. - """ - mean = calculate_mean(numbers) - variance = sum((number - mean) ** 2 for number in numbers) / len(numbers) - return variance**0.5 - - -def calc_binomial_probability(n, k, p): - """ - Calculates the probability of getting k successes in n trials. - Args: - n (integer): The number of trials. - k (integer): The number of successes. - p (integer): The probability of success. - """ - return ( - math_factorial(n) - / (math_factorial(k) * math_factorial(n - k)) - * (p**k * (1 - p) ** (n - k)) - ) - - -def calculate_permutations(n, k): - """ - Calculates the number of permutations of k elements from a set of n elements. - Args: - n (integer): The number of elements in the set. - k (integer): The number of elements to choose. - """ - return math_factorial(n) / math_factorial(n - k) - - -def get_fibonacci_sequence(n): - """ - Calculates the n numbers of the Fibonacci. - Args: - n (integer): The number of Fibonacci numbers to calculate. 
- """ - sequence = [0, 1] - for i in range(2, n): - sequence.append(sequence[i - 1] + sequence[i - 2]) - return sequence - - -def estimate_derivative(function, x): - """ - Estimate the derivative of a function at a given point. - Args: - function (function): The function to calculate the derivative of. - x (integer): The point to calculate the derivative at. - """ - func = eval(function) - h = 0.0000000001 - return (func(x + h) - func(x)) / h - - -def calculate_cosine_similarity(vectorA, vectorB): - """ - Calculates the cosine similarity of two vectors. - Args: - vectorA (list): The first vector. - vectorB (list): The second vector. - """ - dot_product = sum(vectorA[i] * vectorB[i] for i in range(len(vectorA))) - magnitudeA = (sum(vectorA[i] ** 2 for i in range(len(vectorA)))) ** 0.5 - magnitudeB = (sum(vectorB[i] ** 2 for i in range(len(vectorB)))) ** 0.5 - return dot_product / (magnitudeA * magnitudeB) - - -def mortgage_calculator(loan_amount, interest_rate, loan_period): - """ - Calculates the monthly mortgage payment. - Args: - loan_amount (integer): The amount of the loan. - interest_rate (integer): The interest rate of the loan. - loan_period (integer): The period of the loan. - """ - monthly_interest_rate = interest_rate / 12 - number_of_payments = loan_period * 12 - monthly_payment = ( - loan_amount - * monthly_interest_rate - * (1 + monthly_interest_rate) ** number_of_payments - / ((1 + monthly_interest_rate) ** number_of_payments - 1) - ) - return monthly_payment - - -def calculate_future_value(present_value, interest_rate, periods): - """ - Calculates the future value of an investment. - Args: - present_value (integer): The present value of the investment. - interest_rate (integer): The interest rate of the investment. - periods (integer): The number of periods. - """ - return present_value * (1 + interest_rate) ** periods - - -def sort_array(array, reverse=False): - """ - Sorts an array of numbers. - Args: - array (list): The array of numbers. - reverse (optional bool): Whether to sort the array in reverse order, i.e., descending order. - """ - return sorted(array, reverse=reverse) - - -def get_weather_data(coordinates): - """ - Fetches weather data from the Open-Meteo API for the given latitude and longitude. - - Args: - coordinates (tuple): The latitude of the location. - - Returns: - float: The current temperature in the coordinates you've asked for - """ - lat, long = coordinates - url = "https://api.open-meteo.com/v1/forecast" - params = { - "latitude": lat, - "longitude": long, - "current": "temperature_2m", - "temperature_unit": "fahrenheit", - } - - response = requests.get(url, params=params) - if response.status_code == 200: - return response.json()["current"]["temperature_2m"] - else: - return "Failed to fetch data with status code: {}".format(response.status_code) - - -def get_coordinates_from_city(city_name): - """ - Fetches the latitude and longitude of a given city name using the Maps.co Geocoding API. - - Args: - city_name (str): The name of the city. - - Returns: - tuple: The latitude and longitude of the city. - """ - time.sleep(2) # To avoid rate limiting - url = "https://geocode.maps.co/search" - params = {"q": city_name, "api_key": api_key["GEOCODE-API-KEY"]} - - response = requests.get(url, params=params) - if response.status_code == 200: - data = response.json() - if data: - return data[0]["lat"], data[0]["lon"] - else: - return "No data found for the given city name." 
- else: - return "Failed to fetch data with status code: {}".format(response.status_code) - - -def convert_currency(amount, from_currency, to_currency): - """ - Converts a given amount from one currency to another using the ExchangeRate-API. - - Args: - amount (float): The amount of money to convert. - from_currency (str): The ISO currency code for the base currency. - to_currency (str): The ISO currency code for the target currency. - - Returns: - float: The converted amount in the target currency. - """ - key = api_key["EXCHANGERATE-API-KEY"] - base_url = f"https://v6.exchangerate-api.com/v6/{key}/latest/{from_currency}" - response = requests.get(base_url) - - if response.status_code == 200: - data = response.json() - rates = data.get("conversion_rates", {}) - if to_currency in rates: - converted_amount = amount * rates[to_currency] - return converted_amount - else: - return "Target currency code not found." - else: - return "Failed to fetch data with status code: {}".format(response.status_code) - - -def find_term_on_urban_dictionary(term): - """ - Finds the definition of a term on Urban Dictionary. - Args: - term (str): The term to find the definition of. - """ - url = "https://mashape-community-urban-dictionary.p.rapidapi.com/define" - - querystring = {"term": term} - - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "mashape-community-urban-dictionary.p.rapidapi.com", - } - - response = requests.get(url, headers=headers, params=querystring) - - return response.json()["list"][0]["definition"] - - -def get_coordinate_by_ip_address(ip_address): - """ - Finds the latitude and longitude of an IP address. - Args: - ip_address (str): The IP address to find the location of. - """ - url = f"http://ip-api.com/json/{ip_address}" - response = requests.get(url) - try: - return (response.json()["lat"], response.json()["lon"]) - except: - return response.json()["message"] - - -def get_zipcode_by_ip_address(ip_address): - """ - Finds the zipcode of an IP address. - Args: - ip_address (str): The IP address to find the location of. - """ - url = f"http://ip-api.com/json/{ip_address}" - response = requests.get(url) - try: - return response.json()["zip"] - except: - return response.json()["message"] - - -def get_covid_death_by_country(country): - """ - Finds the most up to date total deaths of a country result from COVID. - Args: - country (str): The country to find the total deaths of, in the format of the country's full name. - """ - url = "https://covid-193.p.rapidapi.com/statistics" - - querystring = {"country": country} - - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "covid-193.p.rapidapi.com", - } - - response = requests.get(url, headers=headers, params=querystring) - try: - return response.json()["response"][0]["deaths"]["total"] - except: - return response.json() - - -def get_active_covid_case_by_country(country): - """ - Finds the most up to date active cases of a country result from COVID. - Args: - country (str): The country to find the active cases of. 
- """ - url = "https://covid-193.p.rapidapi.com/statistics" - - querystring = {"country": country} - - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "covid-193.p.rapidapi.com", - } - - response = requests.get(url, headers=headers, params=querystring) - try: - return response.json()["response"][0]["cases"]["active"] - except: - return response.json() - - -def get_rating_by_amazon_ASIN(ASIN): - url = "https://real-time-amazon-data.p.rapidapi.com/product-details" - querystring = {"asin": ASIN, "country": "US"} - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "real-time-amazon-data.p.rapidapi.com", - } - - retries = 0 - max_retries = 5 - while retries < max_retries: - response = requests.get(url, headers=headers, params=querystring) - try: - return response.json()["data"]["product_star_rating"] - except KeyError: - wait_time = 2**retries # Exponential backoff: 1, 2, 4 seconds - time.sleep(wait_time) - retries += 1 - - return None - - -def get_price_by_amazon_ASIN(ASIN): - url = "https://real-time-amazon-data.p.rapidapi.com/product-details" - querystring = {"asin": ASIN, "country": "US"} - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "real-time-amazon-data.p.rapidapi.com", - } - - retries = 0 - max_retries = 5 - while retries < max_retries: - response = requests.get(url, headers=headers, params=querystring) - try: - return response.json()["data"]["product_price"] - except KeyError: - wait_time = 2**retries # Exponential backoff: 1, 2, 4 seconds - time.sleep(wait_time) - retries += 1 - - return None - - -def get_product_name_by_amazon_ASIN(ASIN): - url = "https://real-time-amazon-data.p.rapidapi.com/product-details" - querystring = {"asin": ASIN, "country": "US"} - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "real-time-amazon-data.p.rapidapi.com", - } - - retries = 0 - max_retries = 5 - while retries < max_retries: - response = requests.get(url, headers=headers, params=querystring) - try: - return response.json()["data"]["product_title"] - except KeyError: - wait_time = 2**retries # Exponential backoff: 1, 2, 4 seconds - time.sleep(wait_time) - retries += 1 - - return None - - -def get_company_name_by_stock_name(stock_name): - """ - Finds the company name of a stock by its stock name. - Args: - stock_name (str): The stock name of the product. - """ - url = "https://yahoo-finance15.p.rapidapi.com/api/v1/markets/search" - - querystring = {"search": stock_name} - - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "yahoo-finance15.p.rapidapi.com", - } - - response = requests.get(url, headers=headers, params=querystring) - try: - return response.json()["body"][0]["name"] - except: - return response.json() - - -def get_stock_price_by_stock_name(stock_name): - """ - Finds the price of a stock by its stock name. - Args: - stock_name (str): The stock name of the product. - """ - url = "https://yahoo-finance15.p.rapidapi.com/api/v1/markets/stock/quotes" - - querystring = {"ticker": stock_name} - - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "yahoo-finance15.p.rapidapi.com", - } - - response = requests.get(url, headers=headers, params=querystring) - try: - return float(response.json()["body"][0]["regularMarketPrice"]) - except: - return response.json() - - -def get_stock_history(stock_name, interval, diffandsplits="true"): - """ - Finds the price of a stock by its stock name. 
- Args: - stock_name (str): The stock name of the product. - interval (str): The interval of the stock history. Allows one of following : 5m|15m|30m|1h|1d|1wk|1mo|3mo - diffandsplits (optional str): The diff and splits of the stock history. Allows one of following : 'true'|'false' - """ - url = "https://yahoo-finance15.p.rapidapi.com/api/v1/markets/stock/history" - - querystring = { - "symbol": stock_name, - "interval": interval, - "diffandsplits": diffandsplits, - } - - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "yahoo-finance15.p.rapidapi.com", - } - - response = requests.get(url, headers=headers, params=querystring) - try: - data = response.json()["body"] - return {key: data[key] for key in list(data)[-10:]} - except: - return response.json() - - -def retrieve_city_based_on_zipcode(zipcode): - """ - Finds the city of a zipcode. - Args: - zipcode (str): The zipcode of the city. - """ - url = f"http://ziptasticapi.com/{zipcode}" - response = requests.get(url) - try: - return response.json()["city"] - except: - return response.json() - - -def retrieve_holiday_by_year(country, year): - """ - Finds the holidays of a year. - Args: - year (str): The year of the holidays. - country (str): The country of the holidays. Possible options: US, AT, DE, ES, FR, GB, IT, NL, PL, RO, SK, UA. - """ - url = f"https://date.nager.at/api/v3/publicholidays/{year}/{country}" - response = requests.get(url) - return response.json() - - -def get_time_zone_by_coord(long, lat): - """ - Finds the timezone of a coordinate. - Args: - long (str): The longitude of the coordinate. - lat (str): The latitude of the coordinate. - """ - url = "https://timezone-by-location.p.rapidapi.com/timezone" - - querystring = {"lat": lat, "lon": long, "c": "1", "s": "0"} - - headers = { - "X-RapidAPI-Key": api_key["RAPID-API-KEY"], - "X-RapidAPI-Host": "timezone-by-location.p.rapidapi.com", - } - - response = requests.get(url, headers=headers, params=querystring) - try: - return response.json()["Zones"][0]["TimezoneId"] - except: - return response.json() - - -def linear_regression(x, y, point): - """ - Finds the linear regression of a set of points. - Args: - x (list): The x coordinates of the points. - y (list): The y coordinates of the points. - point (int): The point to calculate the linear regression at. - """ - n = len(x) - sum_x = sum(x) - sum_y = sum(y) - sum_x_squared = sum(x_i**2 for x_i in x) - sum_xy = sum(x[i] * y[i] for i in range(n)) - slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x_squared - sum_x**2) - intercept = (sum_y - slope * sum_x) / n - return slope * point + intercept - - -def add_binary_numbers(a, b): - """ - Adds two binary numbers. - Args: - a (str): The first binary number. - b (str): The second binary number. - """ - return bin(int(a, 2) + int(b, 2))[2:] - - -def maxPoints(points) -> int: - """ - Finds the maximum number of points on a line. - Args: - points (list): The list of points. points are 2 element lists. 
- """ - counter = 1 - if len(points) < 2: - return 1 - for i in range(len(points)): - lst = {} - for j in range(i + 1, len(points)): - y = points[j][1] - points[i][1] - x = points[j][0] - points[i][0] - if x != 0: - lst[y / x] = 1 + lst.get(y / x, 0) - else: - lst["inf"] = 1 + lst.get("inf", 0) - for key, value in lst.items(): - counter = max(counter, value) - return counter + 1 - - -def calculate_investment_value( - initial_investment, - annual_contribution, - years, - annual_return, - inflation_rate, - adjust_for_inflation=True, -): - """ - Calculates the value of an investment over time. - Args: - initial_investment (integer): The initial investment amount. - annual_contribution (integer): The annual contribution amount. - years (integer): The number of years to calculate the investment value for. - annual_return (float): The annual return rate, ranging from 0 to 1. - inflation_rate (list): The inflation rate for each year in percentage, ranging from 0 to 1. - adjust_for_inflation (optional bool): Whether to adjust the investment value for inflation. - """ - current_value = initial_investment - real_value = initial_investment # Adjusted for inflation - - for year in range(1, years + 1): - # Apply annual return - current_value = current_value * (1 + annual_return) + annual_contribution - - # Adjust for inflation if requested - if adjust_for_inflation: - inflation_adjustment = ( - 1 - inflation_rate[year - 1] - if year <= len(inflation_rate) - else 1 - inflation_rate[-1] - ) - real_value = ( - real_value * (1 + annual_return - inflation_rate[year - 1]) - + annual_contribution * inflation_adjustment - ) - else: - real_value = current_value - - final_value = real_value if adjust_for_inflation else current_value - return final_value - - -def calculate_nutritional_needs(weight, height, age, gender, activity_level, goal): - """ - Calculates the nutritional needs of a person based on their weight, height - Args: - weight (integer): The weight of the person. - height (integer): The height of the person. - age (integer): The age of the person - gender (str): The gender of the person. Possible options [male,female,other] - activity_level (integer): The activity level of the person. Possible options [1,2,3,4,5] - goal (str): The goal of the person. Possible options [lose,gain,maintain] - """ - if gender == "male": - bmr = 88.362 + (13.397 * weight) + (4.799 * height) - (5.677 * age) - else: - bmr = 447.593 + (9.247 * weight) + (3.098 * height) - (4.330 * age) - - # Total Daily Energy Expenditure (TDEE) Calculation - activity_multipliers = [1.2, 1.375, 1.55, 1.725, 1.9] - tdee = bmr * activity_multipliers[activity_level - 1] - - # Adjust TDEE based on goal - if goal == "lose": - tdee -= 500 # Creating a deficit to lose weight - elif goal == "gain": - tdee += 500 # Creating a surplus to gain weight - - # Macronutrient Distribution - proteins = (tdee * 0.30) / 4 # 30% of calories from protein, 4 calories per gram - fats = (tdee * 0.25) / 9 # 25% of calories from fat, 9 calories per gram - carbohydrates = (tdee * 0.45) / 4 # 45% of calories from carbs, 4 calories per gram - - return { - "calories": tdee, - "proteins_g": proteins, - "fats_g": fats, - "carbohydrates_g": carbohydrates, - } - - -def book_room( - room_type, price, check_in_date, check_out_date, customer_id, discount_code=None -): - """ - Books a room for a customer. - Args: - room_type (dict): The room type to book. - check_in_date (str): The check-in date. - check_out_date (str): The check-out date. - customer_id (str): The customer ID. 
- discount_code (str): The discount code (if any). - """ - # Assume the first available room is booked (for simplicity) - booked_room = room_type - - # Calculate price and apply discount if applicable - if discount_code and discount_code == "DISCOUNT10": - price *= 0.9 # Apply 10% discount - - booking_details = { - "customer_id": customer_id, - "room_number": room_type, - "check_in_date": check_in_date, - "check_out_date": check_out_date, - "total_price": price, - } - - return booking_details - - -def order_food(item, quantity, price): - """ - Orders food for a customer. - Args: - item (list): The item to order. - quantity (list): The quantity of the item. - price (list): The price of the item. - """ - # Calculate total price - total_price = sum([quantity[i] * price[i] for i in range(len(item))]) - return total_price - - -def get_movie_rating(movie_name): - """ - Fetches the age rating of a movie from the OMDB API. - Args: - movie_name (str): The name of the movie. - """ - url = "http://www.omdbapi.com/" - params = {"t": movie_name, "apikey": api_key["OMDB-API-KEY"]} - response = requests.get(url, params=params) - return response.json()["Rated"] - - -def get_movie_director(movie_name): - """ - Fetches the director of a movie from the OMDB API. - Args: - movie_name (str): The name of the movie. - """ - url = "http://www.omdbapi.com/" - params = {"t": movie_name, "apikey": api_key["OMDB-API-KEY"]} - response = requests.get(url, params=params) - return response.json()["Director"] - - -def polygon_area(vertices): - """ - Calculate the area of a polygon given its vertices using the shoelace formula. - Args: - vertices (list): The vertices of the polygon. Vertices are 2 element lists. - """ - n = len(vertices) - if n < 3: - raise ValueError("A polygon must have at least 3 vertices.") - - # Append the first vertex to the end to complete the loop - vertices.append(vertices[0]) - - # Apply the shoelace formula - area = 0 - for i in range(n): - area += (vertices[i][0] * vertices[i + 1][1]) - ( - vertices[i + 1][0] * vertices[i][1] - ) - - area = abs(area) / 2.0 - return area diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/java_type_converter.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/java_type_converter.py deleted file mode 100644 index 973aaa0e2..000000000 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/java_type_converter.py +++ /dev/null @@ -1,407 +0,0 @@ -import re -from typing import List, Dict, Union -from model_handler.constant import JAVA_TYPE_CONVERSION - - -def java_type_converter(value, expected_type, nested_type=None): - if expected_type not in JAVA_TYPE_CONVERSION: - raise ValueError(f"Unsupported type: {expected_type}") - if ( - expected_type == "byte" - or expected_type == "short" - or expected_type == "integer" - ): - if not re.match(r"^-?\d+$", value): - return str(value) # default to string - return int(value) - elif expected_type == "float": - if not re.match(r"^-?\d+(\.\d+)?([eE][+-]?\d+)?[fF]$", value): - return str(value) # default to string - return float(re.sub(r"[fF]$", "", value)) - elif expected_type == "double": - if not re.match(r"^-?\d+(\.\d+)?([eE][+-]?\d+)?$", value): - return str(value) # default to string - return float(value) - elif expected_type == "long": - if not re.match(r"^-?\d+[lL]$", value): - return str(value) # default to string - return int(re.sub(r"[lL]$", "", value)) - elif expected_type == "boolean": - if value not in ["true", "false"]: - return str(value) # default to string - return 
parse_java_boolean(value) - elif expected_type == "char": - if not re.match(r"^\'.$\'", value): - return str(value) # default to string - return value # Remove the single quotes - elif expected_type == "Array" or expected_type == "ArrayList": - return parse_java_collection(value, expected_type, nested_type) - elif expected_type == "Set": - raise NotImplementedError("Set conversion is not implemented") - elif expected_type == "HashMap": - return parse_java_collection(value, expected_type, nested_type) - elif expected_type == "Hashtable": - raise NotImplementedError("Set conversion is not implemented") - elif expected_type == "Queue" or expected_type == "Stack": - raise NotImplementedError(f"{expected_type} conversion is not implemented") - elif expected_type == "String" or expected_type == "any": - return str(value) # we output as string for `any` type - else: - raise ValueError(f"Unsupported type: {expected_type}") - - -def parse_java_boolean(value): - return value == "true" - - -def parse_java_collection( - input_str: str, type_str: str, nested_type=None -) -> Union[List, Dict]: - if type_str == "ArrayList": - return parse_arraylist(input_str, nested_type) - elif type_str == "Array": - return parse_array(input_str, nested_type) - elif type_str == "HashMap": - return parse_hashmap(input_str) - else: - raise ValueError(f"Unsupported type: {type_str}") - - -def parse_arraylist(input_str: str, nested_type=None) -> List: - match_asList = re.search( - r"new\s+ArrayList<\w*>\(Arrays\.asList\((.+?)\)\)", input_str - ) - if match_asList: - elements_str = match_asList.group(1) - elements = [] - for element_str in elements_str.split(","): - element_str = element_str.strip() - if nested_type == "char": - element = element_str[1:-1] # Remove the single quotes - elif nested_type == "String": - element = element_str[1:-1] # Remove the double quotes - else: - element = ( - java_type_converter(element_str, nested_type) - if nested_type - else parse_java_value(element_str) - ) - elements.append(element) - return elements - - match_add = re.search( - r"new\s+ArrayList<\w*>\(\)\s*\{\{\s*(.+?)\s*\}\}", input_str, re.DOTALL - ) - if match_add: - adds_str = match_add.group(1) - elements = [] - matches = re.findall(r"add\((.+?)\)", adds_str) - for match in matches: - value_str = match.strip() - if nested_type == "char": - value = value_str[1:-1] # Remove the single quotes - elif nested_type == "String": - value = value_str[1:-1] # Remove the double quotes - else: - value = ( - java_type_converter(value_str, nested_type) - if nested_type - else parse_java_value(value_str) - ) - elements.append(value) - return elements - - match_empty = re.search(r"new\s+ArrayList<\w*>\(\)", input_str) - if match_empty: - return [] # Return an empty list for an empty ArrayList - - return input_str # default to string - - -def parse_array(input_str: str, nested_type=None) -> List: - match = re.search(r"new\s+\w+\[\]\s*\{(.*?)\}", input_str) - if match: - elements_str = match.group(1) - if nested_type: - elements = [ - java_type_converter(x.strip(), nested_type) - for x in elements_str.split(",") - if x.strip() - ] - else: - elements = [ - parse_java_value(x.strip()) - for x in elements_str.split(",") - if x.strip() - ] - - return elements - else: - return input_str # default to string - - -def parse_hashmap(input_str: str) -> Dict: - elements = {} - match = re.search( - r"new\s+HashMap<.*?>\s*\(\)\s*\{\s*\{?\s*(.*?)\s*\}?\s*\}", input_str, re.DOTALL - ) - if match: - puts_str = match.group(1) - if puts_str.strip(): - matches = 
re.findall(r"put\(\"(.*?)\",\s*(.*?)\)", puts_str) - for match in matches: - key = match[0] - value = parse_java_value(match[1].strip()) - elements[key] = value - return elements - - match_empty = re.search(r"new\s+HashMap<.*?>\s*\(\)", input_str) - if match_empty: - return {} # Return an empty dictionary for an empty HashMap - - return input_str # default to string - - -# This method parses without the information of what each element type is, contrary of the previous -def parse_java_value(value_str: str): - # check if it's boolean - if value_str == "true": - return True - elif value_str == "false": - return False - # check if it's a string - elif value_str.startswith('"') and value_str.endswith('"'): - return value_str[1:-1] - # check if it's a long - elif re.match(r"^-?\d+[lL]$", value_str): - return int(value_str[:-1]) - # check if it's a float - elif re.match(r"^-?\d+(\.\d+)?([eE][+-]?\d+)?[fF]$", value_str): - return float(re.sub(r"[fF]$", "", value_str)) - # check if it's a integer-like and float-like types (including byte, short, integer, double, etc) - else: - try: - return int(value_str) - except ValueError: - try: - return float(value_str) - except ValueError: - # this assuming all other types are converted to string - return value_str - - -# Write tests for the `java_type_converter` function -def test_java_type_converter(): - # Test valid conversions - assert java_type_converter("true", "boolean") == True - assert java_type_converter("false", "boolean") == False - assert java_type_converter("123", "integer") == 123 - assert java_type_converter("-123", "integer") == -123 - assert java_type_converter("3.14f", "float") == 3.14 - assert java_type_converter("-3.14f", "float") == -3.14 - assert java_type_converter("3.14", "double") == 3.14 - assert java_type_converter("-3.14", "double") == -3.14 - assert java_type_converter("123L", "long") == 123 - assert java_type_converter("-123L", "long") == -123 - assert java_type_converter("a", "char") == "a" - assert java_type_converter("abc", "String") == "abc" - assert java_type_converter("new int[]{1, 2, 3}", "Array") == [1, 2, 3] - assert java_type_converter( - 'new ArrayList<>(Arrays.asList("a", "b"))', "ArrayList" - ) == ["a", "b"] - assert java_type_converter( - 'new HashMap() {{ put("key", "value"); }}', "HashMap" - ) == {"key": "value"} - assert java_type_converter("3f", "float") == 3.0 - assert java_type_converter("3e3F", "float") == 3e3 - assert java_type_converter("3e-3F", "float") == 3e-3 - assert java_type_converter("3.14e2", "double") == 3.14e2 - assert java_type_converter("3.14e-2", "double") == 3.14e-2 - assert java_type_converter("127", "byte") == 127 - assert java_type_converter("-128", "byte") == -128 - assert java_type_converter("32767", "short") == 32767 - assert java_type_converter("-32768", "short") == -32768 - assert java_type_converter("9223372036854775807L", "long") == 9223372036854775807 - assert java_type_converter("-9223372036854775808L", "long") == -9223372036854775808 - assert java_type_converter("123", "any") == "123" - assert java_type_converter("abc", "any") == "abc" - - # Test empty collections - assert java_type_converter("new int[]{}", "Array") == [] - assert java_type_converter("new ArrayList<>()", "ArrayList") == [] - assert java_type_converter("new HashMap<>()", "HashMap") == {} - - # Test collections with mixed types - assert java_type_converter('new Object[]{1, "abc", true}', "Array") == [ - 1, - "abc", - True, - ] - assert java_type_converter( - 'new ArrayList<>(Arrays.asList(1, "abc", true))', 
"ArrayList" - ) == [1, "abc", True] - assert java_type_converter( - 'new HashMap() {{ put("key1", 1); put("key2", "value"); put("key3", true); }}', - "HashMap", - ) == {"key1": 1, "key2": "value", "key3": True} - - # Test invalid values - try: - java_type_converter("true", "integer") - except ValueError as e: - assert str(e) == "Invalid integer value: true" - - try: - java_type_converter("abc", "integer") - except ValueError as e: - assert str(e) == "Invalid integer value: abc" - - try: - java_type_converter("abc", "long") - except ValueError as e: - assert str(e) == "Invalid long value: abc" - - try: - java_type_converter("3.14", "float") - except ValueError as e: - assert str(e) == "Invalid float value: 3.14" - - try: - java_type_converter("3.14f", "double") - except ValueError as e: - assert str(e) == "Invalid double value: 3.14f" - - try: - java_type_converter("128", "byte") - except ValueError as e: - assert str(e) == "Invalid byte value: 128" - - try: - java_type_converter("32768", "short") - except ValueError as e: - assert str(e) == "Invalid short value: 32768" - - try: - java_type_converter("invalid", "boolean") - except ValueError as e: - assert str(e) == "Invalid boolean value: invalid" - - try: - java_type_converter("abc", "char") - except ValueError as e: - assert str(e) == "Invalid char value: abc" - - # Test unsupported types - try: - java_type_converter("abc", "Set") - except NotImplementedError as e: - assert str(e) == "Set conversion is not implemented" - - try: - java_type_converter("abc", "Hashtable") - except NotImplementedError as e: - assert str(e) == "Set conversion is not implemented" - - try: - java_type_converter("abc", "Queue") - except NotImplementedError as e: - assert str(e) == "Queue conversion is not implemented" - - try: - java_type_converter("abc", "Stack") - except NotImplementedError as e: - assert str(e) == "Stack conversion is not implemented" - - # extra array testing - assert java_type_converter("new int[]{}", "Array") == [] - assert java_type_converter("new int[] {}", "Array") == [] - assert java_type_converter("new int[] { }", "Array") == [] - assert java_type_converter("new int[]{1,2,3}", "Array") == [1, 2, 3] - assert java_type_converter("new int[]{1, 2, 3}", "Array") == [1, 2, 3] - assert java_type_converter("new int[] {1, 2, 3}", "Array") == [1, 2, 3] - assert java_type_converter("new int[] { 1, 2, 3 }", "Array") == [1, 2, 3] - - # extra hashmap testing - assert java_type_converter("new HashMap<>()", "HashMap") == {} - assert java_type_converter("new HashMap<>() {}", "HashMap") == {} - assert java_type_converter("new HashMap<>() {{}}", "HashMap") == {} - assert java_type_converter("new HashMap<>() {{ }}", "HashMap") == {} - assert java_type_converter( - 'new HashMap() {{ put("key", "value"); }}', "HashMap" - ) == {"key": "value"} - assert java_type_converter( - 'new HashMap() {{put("key", "value");}}', "HashMap" - ) == {"key": "value"} - assert java_type_converter( - 'new HashMap() { { put("key", "value"); } }', "HashMap" - ) == {"key": "value"} - assert java_type_converter( - 'new HashMap() {{ put("key1", 123); put("key2", true); }}', - "HashMap", - ) == {"key1": 123, "key2": True} - assert java_type_converter( - 'new HashMap() {{ put("key1", "value 1"); put("key2", "value 2"); }}', - "HashMap", - ) == {"key1": "value 1", "key2": "value 2"} - - def test_parse_array_long(): - input_str = "new long[]{1L, 2L, 3L}" - expected_output = [1, 2, 3] - assert parse_array(input_str, nested_type="long") == expected_output - - def 
test_parse_array_mixed_long(): - input_str = "new long[]{1L, 2, 3L}" - expected_output = [1, "2", 3] - assert parse_array(input_str, nested_type="long") == expected_output - - def test_parse_array_invalid_long(): - input_str = "new long[]{1L, 2.0, 3L}" - expected_output = [1, "2.0", 3] - assert parse_array(input_str, nested_type="long") == expected_output - - def test_parse_arraylist_int(): - input_str = "new ArrayList(Arrays.asList(1, 2, 3))" - expected_output = [1, 2, 3] - assert parse_arraylist(input_str, nested_type="integer") == expected_output - - def test_parse_arraylist_float(): - input_str = "new ArrayList() {{ add(1.0f); add(2.0f); add(3.0f); }}" - expected_output = [1.0, 2.0, 3.0] - assert parse_arraylist(input_str, nested_type="float") == expected_output - - def test_parse_arraylist_double(): - input_str = "new ArrayList() {{ add(1.0); add(2.0); add(3.0); }}" - expected_output = [1.0, 2.0, 3.0] - assert parse_arraylist(input_str, nested_type="double") == expected_output - - def test_parse_arraylist_boolean(): - input_str = "new ArrayList(Arrays.asList(true, false, true))" - expected_output = [True, False, True] - assert parse_arraylist(input_str, nested_type="boolean") == expected_output - - def test_parse_arraylist_char(): - input_str = "new ArrayList() {{ add('a'); add('b'); add('c'); }}" - expected_output = ["a", "b", "c"] - print(parse_arraylist(input_str, nested_type="char")) - assert parse_arraylist(input_str, nested_type="char") == expected_output - - def test_parse_arraylist_string(): - input_str = 'new ArrayList() {{ add("aasdasd"); add("basdasd"); add("casdasd"); }}' - expected_output = ["aasdasd", "basdasd", "casdasd"] - print(parse_arraylist(input_str)) - assert parse_arraylist(input_str) == expected_output - - test_parse_array_long() - test_parse_array_mixed_long() - test_parse_array_invalid_long() - test_parse_arraylist_int() - test_parse_arraylist_float() - test_parse_arraylist_double() - test_parse_arraylist_boolean() - test_parse_arraylist_char() - test_parse_arraylist_string() - print("All tests passed successfully!") - - -if __name__ == "__main__": - test_java_type_converter() diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/javascript_type_converter.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/javascript_type_converter.py deleted file mode 100644 index 93a4de6c0..000000000 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/javascript_type_converter.py +++ /dev/null @@ -1,293 +0,0 @@ -import re -from model_handler.constant import JS_TYPE_CONVERSION - - -def js_type_converter(value, expected_type, nested_type=None): - if expected_type not in JS_TYPE_CONVERSION: - raise ValueError(f"Unsupported type: {expected_type}") - - if expected_type == "String": - if not (value.startswith('"') and value.endswith('"')) and not ( - value.startswith("'") and value.endswith("'") - ): - return str(value) - return value[1:-1] - - elif expected_type == "integer": - if not re.match(r"^-?\d+$", value): - return str(value) # default to string - return int(value) - elif expected_type == "float": - if not re.match(r"^-?\d+(\.\d+)?$", value): - return str(value) # default to string - return float(value) - elif expected_type == "Bigint": - if not re.match(r"^-?\d+n$", value): - return str(value) # default to string - return int(value[:-1]) - elif expected_type == "Boolean": - if value not in ["true", "false"]: - return str(value) # default to string - return value == "true" - elif expected_type == "dict": - return parse_js_collection(value, "dict", 
nested_type) - elif expected_type == "array": - return parse_js_collection(value, "array", nested_type) - elif expected_type == "any": - return str(value) - else: - raise ValueError(f"Unsupported type: {expected_type}") - - -def parse_js_collection(code, type_str, nested_type=None): - code = code.strip() - if type_str == "array": - # Regular expression patterns - array_2d_pattern = r"\[\s*\[.*?\]\s*(,\s*\[.*?\]\s*)*\]|\bnew\s+Array\(\s*\[.*?\]\s*(,\s*\[.*?\]\s*)*\)" - array_pattern = r"\[(.*?)\]|\bnew\s+Array\((.*?)\)" - - # Check if the code is a 2D array - array_2d_match = re.match(array_2d_pattern, code) - try: - if array_2d_match: - elements_str = array_2d_match.group(0) - inner_arrays = re.findall(r"\[(.*?)\]", elements_str) - elements = [] - for idx, inner_array_str in enumerate(inner_arrays): - inner_array_str = inner_array_str.strip() - if idx == 0 and inner_array_str.startswith("["): - inner_array_str = inner_array_str[1:] - inner_array_elements = [ - e.strip() for e in inner_array_str.split(",") - ] - if nested_type: - inner_array = [parse_js_value(e) for e in inner_array_elements] - else: - inner_array = [parse_js_value(e) for e in inner_array_elements] - elements.append(inner_array) - return elements - - # Check if the code is a 1D array - array_match = re.match(array_pattern, code) - if array_match: - if array_match.group(1) is not None: - elements_str = array_match.group(1).strip() - if elements_str: - elements = elements_str.split(",") - else: - elements = [] - elif array_match.group(2) is not None: - elements_str = array_match.group(2).strip() - if elements_str: - elements = elements_str.split(",") - else: - elements = [] - else: - elements = [] - if nested_type: - elements = [ - ( - js_type_converter(e.strip(), nested_type, "String") - if (e.strip().startswith("'") or e.strip().startswith('"')) - else js_type_converter(e.strip(), nested_type) - ) - for e in elements - ] - else: - elements = [parse_js_value(e.strip()) for e in elements] - return elements - else: - return code - except: - return code - - elif type_str == "dict": - - if code == "{}": - return {} # Return an empty dictionary for an empty object - dict_pattern = r"\{(.*?)\}" - # Check if the code is a dictionary - dict_match = re.match(dict_pattern, code) - if dict_match: - try: - pairs = dict_match.group(1).split(",") - dictionary = {} - for pair in pairs: - key, value = pair.split(":") - key = parse_js_value(key.strip().strip("'")) - value = parse_js_value(value.strip().strip("'")) - dictionary[key] = value - return dictionary - except: - return code - else: - return code # default to string - else: - raise ValueError(f"Unsupported type: {type_str}") - - -def parse_js_value(value_str: str): - value_str = value_str.strip() - if value_str == "true": - return True - elif value_str == "false": - return False - elif (value_str.startswith('"') and value_str.endswith('"')) or ( - value_str.startswith("'") and value_str.endswith("'") - ): - return value_str[1:-1] - else: - try: - return int(value_str) - except ValueError: - try: - return float(value_str) - except ValueError: - return value_str - - -# Write tests for the `js_type_converter` function - - -def test_js_type_converter(): - assert js_type_converter("true", "Boolean") == True - assert js_type_converter("false", "Boolean") == False - assert js_type_converter("123", "integer") == 123 - assert js_type_converter("3.14", "float") == 3.14 - assert js_type_converter("123n", "Bigint") == 123 - assert js_type_converter("abc", "String") == "abc" - assert 
js_type_converter("[1, 2, 3]", "array") == [1, 2, 3] - assert js_type_converter("new Array(1, 2, 3)", "array") == [1, 2, 3] - assert js_type_converter("{'key': 'value'}", "dict") == {"key": "value"} - assert js_type_converter("{'key': 123}", "dict") == {"key": 123} - assert js_type_converter("{'key': true}", "dict") == {"key": True} - - # Additional test cases - # Test empty array and dictionary - assert js_type_converter("[]", "array") == [] - assert js_type_converter("{}", "dict") == {} - - # Test array with mixed types - assert js_type_converter("[1, 'two', true]", "array") == [1, "two", True] - - # Test dictionary with mixed types - assert js_type_converter( - "{'key1': 123, 'key2': 'value', 'key3': false}", "dict" - ) == {"key1": 123, "key2": "value", "key3": False} - - # Test string with special characters - - # Test negative integer and float values - assert js_type_converter("-123", "integer") == -123 - assert js_type_converter("-3.14", "float") == -3.14 - - # Test invalid type - try: - js_type_converter("123", "InvalidType") - except ValueError as e: - assert str(e) == "Unsupported type: InvalidType" - - # Test invalid integer value - try: - js_type_converter("123.45", "integer") - except ValueError as e: - assert str(e) == "Invalid integer value: 123.45" - - # Test invalid float value - try: - js_type_converter("3.14abc", "float") - except ValueError as e: - assert str(e) == "Invalid float value: 3.14abc" - - # Test invalid Bigint value - try: - js_type_converter("123", "Bigint") - except ValueError as e: - assert str(e) == "Invalid Bigint value: 123" - - # Test invalid boolean value - try: - js_type_converter("not_a_boolean", "Boolean") - except ValueError as e: - assert str(e) == "Invalid boolean value: not_a_boolean" - - print("All tests passed successfully!") - - -def test_js_type_converter_nested_array(): - # Test array with nested integers - assert js_type_converter("[1, 2, 3]", "array", "integer") == [1, 2, 3] - assert js_type_converter("new Array(4, 5, 6)", "array", "integer") == [4, 5, 6] - - # Test array with nested floats - assert js_type_converter("[1.1, 2.2, 3.3]", "array", "float") == [1.1, 2.2, 3.3] - assert js_type_converter("new Array(4.4, 5.5, 6.6)", "array", "float") == [ - 4.4, - 5.5, - 6.6, - ] - - # Test array with nested Bigints - assert js_type_converter("[1n, 2n, 3n]", "array", "Bigint") == [1, 2, 3] - assert js_type_converter("new Array(4n, 5n, 6n)", "array", "Bigint") == [4, 5, 6] - - # Test array with nested booleans - assert js_type_converter("[true, false, true]", "array", "Boolean") == [ - True, - False, - True, - ] - assert js_type_converter("new Array(false, true, false)", "array", "Boolean") == [ - False, - True, - False, - ] - - # Test array with nested strings - print(js_type_converter('["hello", "world", "!"]', "array", "String")) - assert js_type_converter('["hello", "world", "!"]', "array", "String") == [ - "hello", - "world", - "!", - ] - assert js_type_converter('new Array("foo", "bar", "baz")', "array", "String") == [ - "foo", - "bar", - "baz", - ] - - # Test array with mixed nested types - assert js_type_converter('[1, "two", true]', "array") == [1, "two", True] - assert js_type_converter('new Array(3.14, "pi", false)', "array") == [ - 3.14, - "pi", - False, - ] - - # Test array with nested arrays - print(js_type_converter(" [ [1, 2], [3, 4], [5, 6]]", "array", "array")) - assert js_type_converter(" [ [ 1, 2 ], [ 3, 4], [5, 6]]", "array", "array") == [ - [1, 2], - [3, 4], - [5, 6], - ] # this example has many weird spacings - assert 
js_type_converter("new Array([1, 2], [3, 4], [5, 6])", "array", "array") == [ - [1, 2], - [3, 4], - [5, 6], - ] - - # Test array with nested dictionaries - assert js_type_converter( - '[{"key1": 1}, {"key2": 2}, {"key3": 3}]', "array", "dict" - ) == [{"key1": 1}, {"key2": 2}, {"key3": 3}] - assert js_type_converter( - 'new Array({"key1": 1}, {"key2": 2}, {"key3": 3})', "array", "dict" - ) == [{"key1": 1}, {"key2": 2}, {"key3": 3}] - - print("All nested array tests passed successfully!") - - -if __name__ == "__main__": - test_js_type_converter() - test_js_type_converter_nested_array() From 159039ded959a5ed9daaff61105a9847286e3e88 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sat, 13 Jul 2024 13:18:14 -0400 Subject: [PATCH 30/35] Generate bfcl leaderboard result csv file --- .../bfcl/evaluation.py | 12 +- .../bfcl/evaluator/checker/ast/ast.py | 4 +- .../bfcl/evaluator/constants.py | 341 +++++++++++++++++- .../bfcl/evaluator/evaluator.py | 143 +++++++- .../bfcl/evaluator/utils.py | 24 +- .../pyproject.toml | 1 + 6 files changed, 494 insertions(+), 31 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/evaluation.py b/berkeley-function-call-leaderboard/bfcl/evaluation.py index e43dfda04..c315a1fab 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluation.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluation.py @@ -1,4 +1,3 @@ -import json import argparse from pathlib import Path @@ -21,7 +20,7 @@ def evaluate( ) file_name_to_test_category = {} for test_category in leaderboard.test_categories: - if test_category in (LeaderboardCategory.SQL, LeaderboardCategory.CHATABLE): + if test_category.value in (LeaderboardCategory.SQL.value, LeaderboardCategory.CHATABLE.value): print(f'Evaluation for test category "{test_category.value}" is not currently supported!') else: file_name = leaderboard.get_file_name(test_category) @@ -33,10 +32,5 @@ def evaluate( continue evaluator(file_path, test_category) - metrics = evaluator.get_leaderboard_metrics() - metrics_json = json.dumps(metrics, indent=2) - file_path = model_handler.model_dir / 'leaderboard_evaluation_result.json' - file_path.write_text(metrics_json) - print(f'Saved leaderboard evaluation result at "{file_path}"') - print('🏁 Evaluation completed.') - print(metrics_json) \ No newline at end of file + evaluator.generate_leaderboard_csv() + print('🏁 Evaluation completed.') \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py index 23dcdfaa1..dbf257666 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/checker/ast/ast.py @@ -542,9 +542,9 @@ def standardize_string(input_string: str): @staticmethod def get_language(test_category: LeaderboardAstCategory) -> str: - if test_category == LeaderboardAstCategory.JAVA: + if test_category.value == LeaderboardAstCategory.JAVA.value: language = 'java' - elif test_category == LeaderboardAstCategory.JAVASCRIPT: + elif test_category.value == LeaderboardAstCategory.JAVASCRIPT.value: language = 'javascript' else: language = 'python' diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py b/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py index 82367595b..bde259c15 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/constants.py @@ -108,4 +108,343 @@ "snowflake/arctic", 
"nvidia/nemotron-4-340b-instruct", "THUDM/glm-4-9b-chat", -] \ No newline at end of file +] + +MODEL_METADATA_MAPPING = { + "gpt-4o-2024-05-13-FC": [ + "GPT-4o-2024-05-13 (FC)", + "https://openai.com/index/hello-gpt-4o/", + "OpenAI", + "Proprietary", + ], + "gpt-4o-2024-05-13": [ + "GPT-4o-2024-05-13 (Prompt)", + "https://openai.com/index/hello-gpt-4o/", + "OpenAI", + "Proprietary", + ], + "gpt-4-1106-preview-FC": [ + "GPT-4-1106-Preview (FC)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-1106-preview": [ + "GPT-4-1106-Preview (Prompt)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-0125-preview-FC": [ + "GPT-4-0125-Preview (FC)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-0125-preview": [ + "GPT-4-0125-Preview (Prompt)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-turbo-2024-04-09-FC": [ + "GPT-4-turbo-2024-04-09 (FC)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-turbo-2024-04-09": [ + "GPT-4-turbo-2024-04-09 (Prompt)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gorilla-openfunctions-v2": [ + "Gorilla-OpenFunctions-v2 (FC)", + "https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html", + "Gorilla LLM", + "Apache 2.0", + ], + "claude-3-opus-20240229-FC": [ + "Claude-3-Opus-20240229 (FC tools-2024-04-04)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-opus-20240229": [ + "Claude-3-Opus-20240229 (Prompt)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "mistral-medium-2312": [ + "Mistral-Medium-2312 (Prompt)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-small-2402": [ + "Mistral-Small-2402 (Prompt)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-large-2402": [ + "Mistral-Large-2402 (Prompt)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "claude-3-sonnet-20240229-FC": [ + "Claude-3-Sonnet-20240229 (FC tools-2024-04-04)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-sonnet-20240229": [ + "Claude-3-Sonnet-20240229 (Prompt)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-haiku-20240307-FC": [ + "Claude-3-Haiku-20240307 (FC tools-2024-04-04)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-haiku-20240307": [ + "Claude-3-Haiku-20240307 (Prompt)", + "https://www.anthropic.com/news/claude-3-family", + "Anthropic", + "Proprietary", + ], + "claude-3-5-sonnet-20240620-FC": [ + "Claude-3.5-Sonnet-20240620 (FC)", + "https://www.anthropic.com/news/claude-3-5-sonnet", + "Anthropic", + "Proprietary", + ], + "claude-3-5-sonnet-20240620": [ + "Claude-3.5-Sonnet-20240620 (Prompt)", + "https://www.anthropic.com/news/claude-3-5-sonnet", + "Anthropic", + "Proprietary", + ], + "gpt-3.5-turbo-0125-FC": [ + "GPT-3.5-Turbo-0125 (FC)", + "https://platform.openai.com/docs/models/gpt-3-5-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-3.5-turbo-0125": [ + "GPT-3.5-Turbo-0125 (Prompting)", + 
"https://platform.openai.com/docs/models/gpt-3-5-turbo", + "OpenAI", + "Proprietary", + ], + "meetkai/functionary-small-v2.2-FC": [ + "Functionary-Small-v2.2 (FC)", + "https://huggingface.co/meetkai/functionary-small-v2.2", + "MeetKai", + "MIT", + ], + "meetkai/functionary-medium-v2.2-FC": [ + "Functionary-Medium-v2.2 (FC)", + "https://huggingface.co/meetkai/functionary-medium-v2.2", + "MeetKai", + "MIT", + ], + "meetkai/functionary-small-v2.4-FC": [ + "Functionary-Small-v2.4 (FC)", + "https://huggingface.co/meetkai/functionary-small-v2.4", + "MeetKai", + "MIT", + ], + "meetkai/functionary-medium-v2.4-FC": [ + "Functionary-Medium-v2.4 (FC)", + "https://huggingface.co/meetkai/functionary-medium-v2.4", + "MeetKai", + "MIT", + ], + "claude-2.1": [ + "Claude-2.1 (Prompt)", + "https://www.anthropic.com/news/claude-2-1", + "Anthropic", + "Proprietary", + ], + "mistral-tiny-2312": [ + "Mistral-tiny-2312 (Prompt)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "claude-instant-1.2": [ + "Claude-instant-1.2 (Prompt)", + "https://www.anthropic.com/news/releasing-claude-instant-1-2", + "Anthropic", + "Proprietary", + ], + "mistral-small-2402-FC-Auto": [ + "Mistral-small-2402 (FC Auto)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-large-2402-FC-Any": [ + "Mistral-large-2402 (FC Any)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-small-2402-FC-Any": [ + "Mistral-small-2402 (FC Any)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "mistral-large-2402-FC-Auto": [ + "Mistral-large-2402 (FC Auto)", + "https://docs.mistral.ai/guides/model-selection/", + "Mistral AI", + "Proprietary", + ], + "Nexusflow-Raven-v2": [ + "Nexusflow-Raven-v2 (FC)", + "https://huggingface.co/Nexusflow/NexusRaven-V2-13B", + "Nexusflow", + "Apache 2.0", + ], + "firefunction-v1-FC": [ + "FireFunction-v1 (FC)", + "https://huggingface.co/fireworks-ai/firefunction-v1", + "Fireworks", + "Apache 2.0", + ], + "firefunction-v2-FC": [ + "FireFunction-v2 (FC)", + "https://huggingface.co/fireworks-ai/firefunction-v2", + "Fireworks", + "Apache 2.0", + ], + "gemini-1.5-pro-preview-0514": [ + "Gemini-1.5-Pro-Preview-0514 (FC)", + "https://deepmind.google/technologies/gemini/pro/", + "Google", + "Proprietary", + ], + "gemini-1.5-flash-preview-0514": [ + "Gemini-1.5-Flash-Preview-0514 (FC)", + "https://deepmind.google/technologies/gemini/flash/", + "Google", + "Proprietary", + ], + "gemini-1.5-pro-preview-0409": [ + "Gemini-1.5-Pro-Preview-0409 (FC)", + "https://deepmind.google/technologies/gemini/#introduction", + "Google", + "Proprietary", + ], + "gemini-1.0-pro": [ + "Gemini-1.0-Pro-001 (FC)", + "https://deepmind.google/technologies/gemini/#introduction", + "Google", + "Proprietary", + ], + "gpt-4-0613-FC": [ + "GPT-4-0613 (FC)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "gpt-4-0613": [ + "GPT-4-0613 (Prompt)", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "OpenAI", + "Proprietary", + ], + "deepseek-ai/deepseek-coder-6.7b-instruct": [ + "Deepseek-v1.5 (Prompt)", + "https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5", + "Deepseek", + "Deepseek License", + ], + "google/gemma-7b-it": [ + "Gemma-7b-it (Prompt)", + "https://blog.google/technology/developers/gemma-open-models/", + "Google", + "gemma-terms-of-use", + ], + 
"glaiveai/glaive-function-calling-v1": [ + "Glaive-v1 (FC)", + "https://huggingface.co/glaiveai/glaive-function-calling-v1", + "Glaive", + "cc-by-sa-4.0", + ], + "databricks-dbrx-instruct": [ + "DBRX-Instruct (Prompt)", + "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm", + "Databricks", + "Databricks Open Model", + ], + "NousResearch/Hermes-2-Pro-Mistral-7B": [ + "Hermes-2-Pro-Mistral-7B (FC)", + "https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B", + "NousResearch", + "apache-2.0", + ], + "meta-llama/Meta-Llama-3-8B-Instruct": [ + "Meta-Llama-3-8B-Instruct (Prompt)", + "https://llama.meta.com/llama3", + "Meta", + "Meta Llama 3 Community", + ], + "meta-llama/Meta-Llama-3-70B-Instruct": [ + "Meta-Llama-3-70B-Instruct (Prompt)", + "https://llama.meta.com/llama3", + "Meta", + "Meta Llama 3 Community", + ], + "command-r-plus-FC": [ + "Command-R-Plus (FC) (Original)", + "https://txt.cohere.com/command-r-plus-microsoft-azure", + "Cohere For AI", + "cc-by-nc-4.0", + ], + "command-r-plus": [ + "Command-R-Plus (Prompt) (Original)", + "https://txt.cohere.com/command-r-plus-microsoft-azure", + "Cohere For AI", + "cc-by-nc-4.0", + ], + "command-r-plus-FC-optimized": [ + "Command-R-Plus (FC) (Optimized)", + "https://txt.cohere.com/command-r-plus-microsoft-azure", + "Cohere For AI", + "cc-by-nc-4.0", + ], + "command-r-plus-optimized": [ + "Command-R-Plus (Prompt) (Optimized)", + "https://txt.cohere.com/command-r-plus-microsoft-azure", + "Cohere For AI", + "cc-by-nc-4.0", + ], + "snowflake/arctic": [ + "Snowflake/snowflake-arctic-instruct (Prompt)", + "https://huggingface.co/Snowflake/snowflake-arctic-instruct", + "Snowflake", + "apache-2.0", + ], + "nvidia/nemotron-4-340b-instruct": [ + "Nemotron-4-340b-instruct (Prompt)", + "https://huggingface.co/nvidia/nemotron-4-340b-instruct", + "NVIDIA", + "nvidia-open-model-license" + ], + "THUDM/glm-4-9b-chat": [ + "GLM-4-9b-Chat (FC)", + "https://huggingface.co/THUDM/glm-4-9b-chat", + "THUDM", + "glm-4" + ] +} \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py index 2ed503325..8b37ea6ee 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py @@ -1,15 +1,17 @@ import json +import warnings from pathlib import Path from typing import List, Dict, Any +import pandas as pd from tqdm import tqdm from pydantic import BaseModel import bfcl.types as types from bfcl.model_handler.base import BaseHandler from bfcl.evaluator.metrics import LeaderboardModelMetrics -from bfcl.evaluator import checker -from bfcl.evaluator import utils as evaluator_utils +from bfcl.evaluator import checker, utils as evaluator_utils +from bfcl.evaluator.constants import MODEL_METADATA_MAPPING class FailedResult(BaseModel): @@ -68,28 +70,133 @@ def __call__(self, file_path: Path, test_category) -> None: if result: accuracy = result['accuracy'] - self._test_category_to_metrics[test_category] = dict( + self._test_category_to_metrics[test_category.value] = dict( accuracy=accuracy, total_count=result['total_count'] ) print(f"✅ Test completed: {test_category.value} | 🎯 Accuracy: {accuracy:.4f}") + + def generate_leaderboard_csv(self) -> None: + metrics = self._test_category_to_metrics + C = types.LeaderboardCategory + + python_simple_ast = metrics.get(C.SIMPLE.value, dict(accuracy=0, total_count=0)) + python_multiple_ast = metrics.get(C.MULTIPLE_FUNCTION.value, 
dict(accuracy=0, total_count=0)) + python_parallel_ast = metrics.get(C.PARALLEL_FUNCTION.value, dict(accuracy=0, total_count=0)) + python_parallel_multiple_ast = metrics.get(C.PARALLEL_MULTIPLE_FUNCTION.value, dict(accuracy=0, total_count=0)) + python_simple_exec = metrics.get(C.EXECUTABLE_SIMPLE.value, dict(accuracy=0, total_count=0)) + python_multiple_exec = metrics.get(C.EXECUTABLE_MULTIPLE_FUNCTION.value, dict(accuracy=0, total_count=0)) + python_parallel_exec = metrics.get(C.EXECUTABLE_PARALLEL_FUNCTION.value, dict(accuracy=0, total_count=0)) + python_parallel_multiple_exec = metrics.get(C.EXECUTABLE_PARALLEL_MULTIPLE_FUNCTION.value, dict(accuracy=0, total_count=0)) + java_simple_ast = metrics.get(C.JAVA.value, dict(accuracy=0, total_count=0)) + javascript_simple_ast = metrics.get(C.JAVASCRIPT.value, dict(accuracy=0, total_count=0)) + rest_simple_exec = metrics.get(C.REST.value, dict(accuracy=0, total_count=0)) + relevance = metrics.get(C.RELEVANCE.value, dict(accuracy=0, total_count=0)) + + simple_ast = evaluator_utils.calculate_weighted_accuracy( + [python_simple_ast, java_simple_ast, javascript_simple_ast] + ) + multiple_ast = python_multiple_ast + parallel_ast = python_parallel_ast + parallel_multiple_ast = python_parallel_multiple_ast + simple_exec = evaluator_utils.calculate_weighted_accuracy( + [python_simple_exec, rest_simple_exec] + ) + multiple_exec = python_multiple_exec + parallel_exec = python_parallel_exec + parallel_multiple_exec = python_parallel_multiple_exec - def get_leaderboard_metrics(self) -> Dict: - model_metrics = self._model_metrics.compute() - total_count = 0 - weighted_total_accuracy = unweighted_total_accuracy = 0 - test_category_to_accuracy = {} - for test_category, metrics in self._test_category_to_metrics.items(): - test_category_to_accuracy[test_category.value] = metrics['accuracy'] - total_count += metrics['total_count'] - weighted_total_accuracy += metrics['accuracy'] * metrics['total_count'] - unweighted_total_accuracy += metrics['accuracy'] - return dict( - overall_accuracy_weighted=weighted_total_accuracy / total_count, - overall_accuracy_unweighted=unweighted_total_accuracy / len(self._test_category_to_metrics), - **test_category_to_accuracy, - **model_metrics, + summary_ast = evaluator_utils.calculate_unweighted_accuracy( + [simple_ast, multiple_ast, parallel_ast, parallel_multiple_ast] ) + summary_exec = evaluator_utils.calculate_unweighted_accuracy( + [simple_exec, multiple_exec, parallel_exec, parallel_multiple_exec] + ) + overall_accuracy = evaluator_utils.calculate_weighted_accuracy( + [ + simple_ast, + multiple_ast, + parallel_ast, + parallel_multiple_ast, + simple_exec, + multiple_exec, + parallel_exec, + parallel_multiple_exec, + relevance, + ] + ) + + # if overall_accuracy["total_count"] != 1700: + # print("-" * 100) + # print(f"❗️Warning: Total count for {self.model_name} is {overall_accuracy['total_count']}") + + # Model metrics - cost, mean_latency, std_latency, p95_latency + model_metrics = self._model_metrics.compute() + model_metadata = MODEL_METADATA_MAPPING.get(self.model_name) + if model_metadata is None: + warnings.warn( + f'Metadata not found for the model "{self.model_name}"! ' + 'Please add your model metadata in the `MODEL_METADATA_MAPPING` variable ' + 'in the `bfcl/evaluator/constants.py` file.' + ) + + f_acc = lambda acc: "{:.2f}%".format(acc * 100) + rv_f_acc = lambda acc_str: float(acc_str.replace('%', '')) / 100 + + row = { + "Rank": 0, # Temporary value of 0. Updated below. 
+ "Overall Acc": f_acc(overall_accuracy["accuracy"]), + "Model": model_metadata[0] if model_metadata else self.model_name, + "Model Link": model_metadata[1] if model_metadata else "N/A", + "Organization": model_metadata[2] if model_metadata else "N/A", + "License": model_metadata[3] if model_metadata else "N/A", + "AST Summary": f_acc(summary_ast["accuracy"]), + "Exec Summary": f_acc(summary_exec["accuracy"]), + "Simple Function AST": f_acc(simple_ast["accuracy"]), + "Python Simple Function AST": f_acc(python_simple_ast["accuracy"]), + "Java Simple Function AST": f_acc(java_simple_ast["accuracy"]), + "JavaScript Simple Function AST": f_acc(javascript_simple_ast["accuracy"]), + "Multiple Functions AST": f_acc(multiple_ast["accuracy"]), + "Parallel Functions AST": f_acc(parallel_ast["accuracy"]), + "Parallel Multiple AST": f_acc(parallel_multiple_ast["accuracy"]), + "Simple Function Exec": f_acc(simple_exec["accuracy"]), + "Python Simple Function Exec": f_acc(python_simple_exec["accuracy"]), + "REST Simple Function Exec": f_acc(rest_simple_exec["accuracy"]), + "Multiple Functions Exec": f_acc(multiple_exec["accuracy"]), + "Parallel Functions Exec": f_acc(parallel_exec["accuracy"]), + "Parallel Multiple Exec": f_acc(parallel_multiple_exec["accuracy"]), + "Relevance Detection": f_acc(relevance["accuracy"]), + "Cost ($ Per 1k Function Calls)": str(model_metrics['cost']), + "Latency Mean (s)": str(model_metrics['mean_latency']), + "Latency Standard Deviation (s)": str(model_metrics['std_latency']), + "Latency 95th Percentile (s)": str(model_metrics['p95_latency']), + } + + df_new = pd.DataFrame([row]) + file_path = self.model_handler.result_dir / 'BFCL_leaderboard_result.csv' + if file_path.exists(): + print('Found existing BFCL leaderboard file! Loading...') + existing_df = pd.read_csv(file_path, dtype=str) + + # Check if model name already exists + if df_new["Model"].iloc[0] in existing_df["Model"].values: + print('Model already exists. Overwriting the row...') + existing_df.loc[existing_df["Model"] == df_new["Model"].iloc[0], :] = df_new.values + else: + print('Appending new model to the existing dataframe...') + existing_df = pd.concat((existing_df, df_new), ignore_index=True) + df = existing_df + else: + print('No existing BFCL leaderboard file found. Creating a new one...') + df = df_new + + df["Overall Acc"] = df["Overall Acc"].apply(rv_f_acc) + df.sort_values("Overall Acc", ascending=False, inplace=True) + df["Overall Acc"] = df["Overall Acc"].apply(f_acc) + df['Rank'] = list(range(1, len(df) + 1)) + + df.to_csv(file_path, index=False) + print(f'🔒 Saved BFCL leaderboard result at "{file_path}".') def run_relevance_evaluator(self, model_responses: List[Dict]) -> Dict: """Run function relevance detection. 
diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py index ff443246d..54c7b09f9 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/utils.py @@ -63,4 +63,26 @@ def is_executable_format_output(decoded_output): if type(item) != str: return False return True - return False \ No newline at end of file + return False + +def calculate_weighted_accuracy(accuracy_dict_list): + total_count = 0 + total_accuracy = 0 + for accuracy_dict in accuracy_dict_list: + total_count += accuracy_dict["total_count"] + total_accuracy += accuracy_dict["accuracy"] * accuracy_dict["total_count"] + + if total_count == 0: + return {"accuracy": 0, "total_count": 0} + + return {"accuracy": total_accuracy / total_count, "total_count": total_count} + +def calculate_unweighted_accuracy(accuracy_dict_list): + total_accuracy = 0 + for accuracy_dict in accuracy_dict_list: + total_accuracy += accuracy_dict["accuracy"] + + if len(accuracy_dict_list) == 0: + return {"accuracy": 0, "total_count": 0} + + return {"accuracy": total_accuracy / len(accuracy_dict_list), "total_count": 0} \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index bde5f28ca..78b5b7cf1 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "requests", "tqdm", "numpy", + "pandas", "huggingface_hub", "pydantic>=2.8.2", "python-dotenv>=1.0.1", From 707e2bddb141c2f8af7b2913a9bfb8bb111cf8ad Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sat, 13 Jul 2024 14:45:17 -0400 Subject: [PATCH 31/35] Fix issue of incorrect test category comparison --- .../bfcl/evaluator/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py index 8b37ea6ee..1a1036558 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluator/evaluator.py @@ -254,7 +254,7 @@ def run_executable_evaluator( test_data = self.test_category_to_data[test_category] assert len(model_responses) == len(test_data) test_example_id_to_data = {} - if test_category != types.LeaderboardExecutableCategory.REST: + if test_category.value != types.LeaderboardExecutableCategory.REST.value: print(f"---- Getting real-time execution result from ground truth for '{test_category.value}' ----") exec_dict = {} for item in tqdm(test_data, desc="Getting Executable Expected Output"): @@ -303,7 +303,7 @@ def run_executable_evaluator( failed_model_responses.append(result) continue - if test_category == types.LeaderboardExecutableCategory.REST: + if test_category.value == types.LeaderboardExecutableCategory.REST.value: # REST is always single-functioned. Therefore we take the first one and pass # it to the REST checker. 
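The `calculate_weighted_accuracy` / `calculate_unweighted_accuracy` helpers added to `bfcl/evaluator/utils.py` above feed the leaderboard summary columns: the weighted variant pools categories by sample count, while the unweighted variant is a plain mean of category accuracies. A minimal usage sketch with made-up numbers, assuming the helpers exactly as defined in the patch:

```python
# Illustrative only: hypothetical per-category results in the shape the evaluator produces.
from bfcl.evaluator.utils import calculate_unweighted_accuracy, calculate_weighted_accuracy

python_simple_ast = {"accuracy": 0.75, "total_count": 100}
java_simple_ast = {"accuracy": 0.50, "total_count": 300}

# Weighted by sample count: (0.75 * 100 + 0.50 * 300) / 400 = 0.5625
print(calculate_weighted_accuracy([python_simple_ast, java_simple_ast]))
# -> {'accuracy': 0.5625, 'total_count': 400}

# Plain mean of category accuracies: (0.75 + 0.50) / 2 = 0.625; total_count is 0 by design.
print(calculate_unweighted_accuracy([python_simple_ast, java_simple_ast]))
# -> {'accuracy': 0.625, 'total_count': 0}
```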
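The comparison fix above switches from comparing enum members to comparing their `.value` strings. A minimal sketch of the underlying Python behaviour, using hypothetical stand-in enums rather than the real `bfcl.types` classes, shows why member comparison can fail when equivalent categories are defined on separate `Enum` classes:

```python
from enum import Enum

# Hypothetical stand-ins; the actual category enums live in bfcl/types.py.
class CategoryA(Enum):
    REST = "rest"

class CategoryB(Enum):
    REST = "rest"

# Members of two distinct Enum classes never compare equal, even with identical values.
print(CategoryA.REST == CategoryB.REST)              # False
# Comparing the underlying values sidesteps the class mismatch.
print(CategoryA.REST.value == CategoryB.REST.value)  # True
```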
if not evaluator_utils.is_rest_format_output(decoded_result): From e85ca864868009645616b69d43018335c9a379f8 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sat, 13 Jul 2024 14:45:35 -0400 Subject: [PATCH 32/35] Update comments --- berkeley-function-call-leaderboard/.env.example | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/berkeley-function-call-leaderboard/.env.example b/berkeley-function-call-leaderboard/.env.example index a940c6642..b1b0a437f 100644 --- a/berkeley-function-call-leaderboard/.env.example +++ b/berkeley-function-call-leaderboard/.env.example @@ -1,7 +1,8 @@ -# [OPTIONAL] Only required for downloading gated hugging face models +# [OPTIONAL] Required for downloading gated hugging face models HUGGING_FACE_HUB_TOKEN= -# [OPTIONAL] Only required for respective proprietary model evaluation +# [OPTIONAL] Required for LLM generation step +# Provide the API key for the model(s) you intend to use OPENAI_API_KEY=sk-XXXXXX MISTRAL_API_KEY= FIREWORKS_API_KEY= @@ -15,7 +16,7 @@ USE_COHERE_OPTIMIZATION=False # True/False DATABRICKS_API_KEY= DATABRICKS_AZURE_ENDPOINT_URL= -# [OPTIONAL] Only required for evaluating executable test categories +# [OPTIONAL] Required for evaluation of `executable` test group RAPID_API_KEY= EXCHANGERATE_API_KEY= OMDB_API_KEY= From 3f732012bdf2bc9034edfd5fdf8706db8add18be Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sat, 13 Jul 2024 14:45:58 -0400 Subject: [PATCH 33/35] Add new readme --- berkeley-function-call-leaderboard/_README.md | 218 ++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 berkeley-function-call-leaderboard/_README.md diff --git a/berkeley-function-call-leaderboard/_README.md b/berkeley-function-call-leaderboard/_README.md new file mode 100644 index 000000000..2cfd340ce --- /dev/null +++ b/berkeley-function-call-leaderboard/_README.md @@ -0,0 +1,218 @@ +
+# Berkeley Function Calling Leaderboard (BFCL)
+
+🤗 Dataset • 🏆 Leaderboard • 📰 Blog
+
+ + +## Introduction +We present Berkeley Function Leaderboard, the **first comprehensive and executable function calling evaluation for LLMs function calling**. Different from prior function calling evaluations (e.g. Anyscale function calling blog), we consider function callings of various forms, different function calling scenarios, and the executability of function calls. We also release our model [gorilla-openfunctions-v2](https://huggingface.co/gorilla-llm/gorilla-openfunctions-v2), the best open-source models so far to handle multiple languages of function calls, parallel function calls and multiple function calls. We also provide a specific debugging feature that when the provided function is not suitable for your task, the model will output an “Error Message”. + +Read more about the technical details and interesting insights in our blog post! + +![image](./architecture_diagram.png) + +## Get started + +Create a `.env` file similar to the [.env.example](.env.example) file, and fill out the values for the variables you wish to use for either open-source or proprietary LLM generation and evaluation. + +### 🚀 Installation + +> [!Tip] +> Ensure that you are using the latest versions of `setuptools`, `wheel`, and `pip` to avoid any installation issues. Run: +> ```bash +> pip install --upgrade setuptools wheel pip +> ``` + +To install the `bfcl` package from the GitHub repository, run: +```bash +$ git clone https://github.com/ShishirPatil/gorilla +$ cd berkeley-function-call-leaderboard +$ pip install -e . +``` + +Extras dependencies can be installed via: +```bash +pip install -e ".[NAME]" +``` +| Name | Use | +|-------------------|----------------------------------------------------------| +| oss_eval | For LLM generation and evaluation using open source models | +| proprietary_eval | For LLM generation and evaluation using proprietary models | +| all | Loads all extras (not recommended) | + +#### OSS eval + +We use [vllm](https://docs.vllm.ai/en/latest/index.html) to perform offline LLM inference. Installation of [vllm](https://docs.vllm.ai/en/latest/getting_started/installation.html#requirements) requires installing a CUDA-compatible PyTorch version. You can run the following command: +```bash +# Replace the CUDA version "cu118" according to your system. +# See available CUDA versions at https://pytorch.org/get-started/locally/ +# bfcl currently uses `v0.5.1` of vllm and it requires torch `v2.3.0` +$ pip install torch==2.3.0 --index-url https://download.pytorch.org/whl/cu118 +$ pip install -e ".[oss_eval]" +``` + +#### Proprietary eval + +To install dependencies for proprietary model evaluation, run: +```bash +pip install -e ".[proprietary_eval]" +``` + +## User Guide + +A comprehensive user guide detailing the full list of supported arguments is available [here](./bfcl/cli.py) and can also be accessed on the terminal by calling: +```bash +bfcl -h +``` +```text +usage: bfcl [-h] {llm_generation,evaluation} ... + +Berkeley Function Calling Leaderboard (BFCL) + +positional arguments: + {llm_generation,evaluation} + Sub-command to run + llm_generation Collect LLM responses + evaluation Run evaluation + +options: + -h, --help show this help message and exit +``` + +### LLM Generation + +To view the full list of arguments for the LLM generation sub-command, call: +```bash +bfcl llm_generation -h +``` + +#### Open Source Models + +To perform generation on an open-weights model (e.g. 
[google/gemma-7b-it](https://huggingface.co/google/gemma-7b-it)) for the `ast` test group, use the following command: +```bash +bfcl llm_generation \ + --model google/gemma-7b-it \ + --model_type oss \ + --test-group ast +``` + +To provide sampling parameters, you can use: +```bash +bfcl llm_generation \ + --model google/gemma-7b-it \ + --model_type oss \ + --test-group ast \ + --temperature 0.7 \ + --top-p 1 \ + --max-tokens 1000 +``` + +To specify multiple test categories: +```bash +bfcl llm_generation \ + --model google/gemma-7b-it \ + --model_type oss \ + --test-categories rest,java,javascript +``` + +#### Proprietary Models + +To perform generation on a proprietary/hosted model (e.g. [gorilla-openfunctions-v2](https://huggingface.co/gorilla-llm/gorilla-openfunctions-v2)) for the `executable` test group, use: +```bash +bfcl llm_generation \ + --model gorilla-openfunctions-v2 \ + --model_type proprietary \ + --test-group executable +``` + +To specify multiple test categories: +```bash +bfcl llm_generation \ + --model gorilla-openfunctions-v2 \ + --model_type proprietary \ + --test-categories relevance,multiple_function,parallel_function +``` + +### Evaluation + +To view the full list of arguments for the evaluation sub-command, call: +```bash +bfcl evaluation -h +``` + +To perform evaluation of a proprietary/hosted model (e.g. [gorilla-openfunctions-v2](https://huggingface.co/gorilla-llm/gorilla-openfunctions-v2)) on all the test categories, use: +```bash +bfcl evaluation \ + --model gorilla-openfunctions-v2 \ + --model_type proprietary \ + --test-group all +``` + +#### Executable Test Category Evaluation + +To run the executable test categories, you need to provide the following API keys in the `.env` file: +```ini +RAPID_API_KEY= +EXCHANGERATE_API_KEY= +OMDB_API_KEY= +GEOCODE_API_KEY= +``` +You can use the following links to obtain the API keys: +1. Rapid API: https://rapidapi.com/hub + * Yahoo Finance: https://rapidapi.com/sparior/api/yahoo-finance15 + * Real Time Amazon Data : https://rapidapi.com/letscrape-6bRBa3QguO5/api/real-time-amazon-data + * Urban Dictionary: https://rapidapi.com/community/api/urban-dictionary + * Covid 19: https://rapidapi.com/api-sports/api/covid-193 + * Time zone by Location: https://rapidapi.com/BertoldVdb/api/timezone-by-location + + All the Rapid APIs we use have free tier usage. As a result, you need to subscribe to those API providers in order to have the executable test environment setup but it will be free of charge! +2. ExchangeRate API:https://www.exchangerate-api.com +3. OMDB API: http://www.omdbapi.com/apikey.aspx +4. Geocode API: https://geocode.maps.co/ + + +### Evaluation + +To view the full list of arguments for the evaluation sub-command, call: +```bash +$ bfcl evaluation -h +``` + +To perform evaluation of a proprietary/hosted model (e.g. [gorilla-openfunctions-v2](https://huggingface.co/gorilla-llm/gorilla-openfunctions-v2)) on all test categories, use: +```bash +$ bfcl evaluation \ + --model gorilla-openfunctions-v2 \ + --model_type proprietary \ + --test-group all +``` + +#### Executable Test Category Evaluation + +To run the executable test categories, you need to provide the following API keys in the `.env` file: +```ini +RAPID_API_KEY= +EXCHANGERATE_API_KEY= +OMDB_API_KEY= +GEOCODE_API_KEY= +``` + +You can use the following links to obtain the API keys: + +1. 
**Rapid API**: [Rapid API Hub](https://rapidapi.com/hub) + - Yahoo Finance: https://rapidapi.com/sparior/api/yahoo-finance15 + - Real Time Amazon Data : https://rapidapi.com/letscrape-6bRBa3QguO5/api/real-time-amazon-data + - Urban Dictionary: https://rapidapi.com/community/api/urban-dictionary + - Covid 19: https://rapidapi.com/api-sports/api/covid-193 + - Time zone by Location: https://rapidapi.com/BertoldVdb/api/timezone-by-location + + All the Rapid APIs we use have free tier usage. You need to subscribe to these API providers to set up the executable test environment, but it will be free of charge! + +2. **ExchangeRate API**: https://www.exchangerate-api.com +3. **OMDB API**: http://www.omdbapi.com/apikey.aspx +4. **Geocode API**: https://geocode.maps.co/ \ No newline at end of file From 15b9c6aeba1a523591d87317e57dac55fa773700 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Sat, 13 Jul 2024 17:15:20 -0400 Subject: [PATCH 34/35] Fix evaluation section --- berkeley-function-call-leaderboard/_README.md | 45 +------------------ 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/berkeley-function-call-leaderboard/_README.md b/berkeley-function-call-leaderboard/_README.md index 2cfd340ce..f9209f480 100644 --- a/berkeley-function-call-leaderboard/_README.md +++ b/berkeley-function-call-leaderboard/_README.md @@ -172,47 +172,6 @@ You can use the following links to obtain the API keys: * Time zone by Location: https://rapidapi.com/BertoldVdb/api/timezone-by-location All the Rapid APIs we use have free tier usage. As a result, you need to subscribe to those API providers in order to have the executable test environment setup but it will be free of charge! -2. ExchangeRate API:https://www.exchangerate-api.com +2. ExchangeRate API: https://www.exchangerate-api.com 3. OMDB API: http://www.omdbapi.com/apikey.aspx -4. Geocode API: https://geocode.maps.co/ - - -### Evaluation - -To view the full list of arguments for the evaluation sub-command, call: -```bash -$ bfcl evaluation -h -``` - -To perform evaluation of a proprietary/hosted model (e.g. [gorilla-openfunctions-v2](https://huggingface.co/gorilla-llm/gorilla-openfunctions-v2)) on all test categories, use: -```bash -$ bfcl evaluation \ - --model gorilla-openfunctions-v2 \ - --model_type proprietary \ - --test-group all -``` - -#### Executable Test Category Evaluation - -To run the executable test categories, you need to provide the following API keys in the `.env` file: -```ini -RAPID_API_KEY= -EXCHANGERATE_API_KEY= -OMDB_API_KEY= -GEOCODE_API_KEY= -``` - -You can use the following links to obtain the API keys: - -1. **Rapid API**: [Rapid API Hub](https://rapidapi.com/hub) - - Yahoo Finance: https://rapidapi.com/sparior/api/yahoo-finance15 - - Real Time Amazon Data : https://rapidapi.com/letscrape-6bRBa3QguO5/api/real-time-amazon-data - - Urban Dictionary: https://rapidapi.com/community/api/urban-dictionary - - Covid 19: https://rapidapi.com/api-sports/api/covid-193 - - Time zone by Location: https://rapidapi.com/BertoldVdb/api/timezone-by-location - - All the Rapid APIs we use have free tier usage. You need to subscribe to these API providers to set up the executable test environment, but it will be free of charge! - -2. **ExchangeRate API**: https://www.exchangerate-api.com -3. **OMDB API**: http://www.omdbapi.com/apikey.aspx -4. **Geocode API**: https://geocode.maps.co/ \ No newline at end of file +4. 
Geocode API: https://geocode.maps.co/ \ No newline at end of file From e0645b1fb929e3a9820981fa6195838490f17e16 Mon Sep 17 00:00:00 2001 From: Huanzhi Mao Date: Wed, 24 Jul 2024 16:25:04 -0700 Subject: [PATCH 35/35] update package dependency version --- .../pyproject.toml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index 78b5b7cf1..2354df783 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -13,17 +13,17 @@ readme = "README.md" requires-python = ">=3.9" license = { "text" = "Apache 2.0" } dependencies = [ - "requests", - "tqdm", - "numpy", + "requests==2.32.3", + "tqdm==4.66.4", + "numpy==1.26.4", "pandas", "huggingface_hub", "pydantic>=2.8.2", "python-dotenv>=1.0.1", - "tree-sitter~=0.21.0", + "tree_sitter==0.21.3", "tree-sitter-java==0.21.0", "tree-sitter-javascript==0.21.4", - "openai>=1.35.10", + "openai==1.35.13", ] [tool.setuptools.packages.find] @@ -36,11 +36,11 @@ bfcl = "bfcl.cli:main" Repository = "https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard" [project.optional-dependencies] -oss_eval = ["vllm==0.5.1"] +oss_eval = ["vllm==0.5.0"] proprietary_eval = [ "mistralai==0.4.2", - "anthropic==0.29.0", - "cohere==5.2.5", + "anthropic==0.31.1", + "cohere==5.5.8", ] all = [ "bfcl[oss_eval]",