diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md index 64befed41..b83c4ac99 100644 --- a/berkeley-function-call-leaderboard/CHANGELOG.md +++ b/berkeley-function-call-leaderboard/CHANGELOG.md @@ -7,6 +7,7 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documen - `MadeAgents/Hammer2.0-3b` - `MadeAgents/Hammer2.0-1.5b` - `MadeAgents/Hammer2.0-0.5b` +- [Oct 10, 2024] [#621](https://github.com/ShishirPatil/gorilla/pull/621), [#675](https://github.com/ShishirPatil/gorilla/pull/675): Add a basic command-line interface for ease of use. - [Oct 5, 2024] [#633](https://github.com/ShishirPatil/gorilla/pull/633): Add new model `openbmb/MiniCPM3-4B` to the leaderboard. - [Oct 5, 2024] [#642](https://github.com/ShishirPatil/gorilla/pull/642): Add the following new models to the leaderboard: - `Qwen/Qwen2.5-7B-Instruct` diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md index bb7ef2ca4..a1bc6f1f5 100644 --- a/berkeley-function-call-leaderboard/README.md +++ b/berkeley-function-call-leaderboard/README.md @@ -101,7 +101,7 @@ If decided to run locally-hosted model, the generation script uses vLLM and ther Use the following command for LLM inference of the evaluation dataset with specific models. ```bash -python openfunctions_evaluation.py --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1 +bfcl generate --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1 ``` You can optionally specify the number of threads to use for _parallel inference_ by setting the `--num-threads` flag to speed up inference for **hosted models**, not applicable for OSS models. @@ -112,7 +112,7 @@ If no `MODEL_NAME` is provided, the model `gorilla-openfunctions-v2` will be use ### Models Available -Below is _a table of models we support_ to run our leaderboard evaluation against. If the models support function calling (FC), we will follow its function calling format provided by official documentation. Otherwise, we use a consistent system message to prompt the model to generate function calls in the right format. +Below is _a table of models we support_ to run our leaderboard evaluation against. If the models support function calling (FC), we will follow its function calling format provided by official documentation. Otherwise, we use a consistent system message to prompt the model to generate function calls in the right format. You can also use the `bfcl models` command to list all available models. |Model | Type | |---|---| @@ -197,7 +197,7 @@ For `Databrick-DBRX-instruct`, you need to create a Databrick Azure workspace an In the following two sections, the optional `--test-category` parameter can be used to specify the category of tests to run. You can specify multiple categories separated by spaces. Available options include: -- Available test groups: +- Available test groups (you can also run the `bfcl test-categories` command to list them): - `all`: All test categories. - This is the default option if no test category is provided. - `multi_turn`: All multi-turn test categories. @@ -248,7 +248,7 @@ In the following two sections, the optional `--test-category` parameter can be u Navigate to the `gorilla/berkeley-function-call-leaderboard/bfcl/eval_checker` directory and run the `eval_runner.py` script with the desired parameters.
The basic syntax is as follows: ```bash -python eval_runner.py --model MODEL_NAME --test-category TEST_CATEGORY +bfcl evaluate --model MODEL_NAME --test-category TEST_CATEGORY ``` For available options for `MODEL_NAME` and `TEST_CATEGORY`, please refer to the [Models Available](#models-available) and [Available Test Category](#available-test-category) section. @@ -260,25 +260,25 @@ If no `MODEL_NAME` is provided, all available model results will be evaluated by If you want to run all tests for the `gorilla-openfunctions-v2` model, you can use the following command: ```bash -python eval_runner.py --model gorilla-openfunctions-v2 +bfcl evaluate --model gorilla-openfunctions-v2 ``` If you want to evaluate all offline tests (do not require RapidAPI keys) for OpenAI GPT-3.5, you can use the following command: ```bash -python eval_runner.py --model gpt-3.5-turbo-0125 --test-category ast +bfcl evaluate --model gpt-3.5-turbo-0125 --test-category ast ``` If you want to run the `rest` tests for a few Claude models, you can use the following command: ```bash -python eval_runner.py --model claude-3-5-sonnet-20240620 claude-3-opus-20240229 claude-3-sonnet-20240229 --test-category rest +bfcl evaluate --model claude-3-5-sonnet-20240620 claude-3-opus-20240229 claude-3-sonnet-20240229 --test-category rest ``` If you want to run `live_simple` and `javascript` tests for a few models and `gorilla-openfunctions-v2`, you can use the following command: ```bash -python eval_runner.py --model gorilla-openfunctions-v2 claude-3-5-sonnet-20240620 gpt-4-0125-preview gemini-1.5-pro-preview-0514 --test-category live_simple javascript +bfcl evaluate --model gorilla-openfunctions-v2 claude-3-5-sonnet-20240620 gpt-4-0125-preview gemini-1.5-pro-preview-0514 --test-category live_simple javascript ``` ### Model-Specific Optimization diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py new file mode 100644 index 000000000..6a75b8586 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -0,0 +1,231 @@ +import csv +from collections import namedtuple +from datetime import datetime +from typing import List + +import typer +from bfcl._llm_response_generation import main as generation_main +from bfcl.constant import DOTENV_PATH, RESULT_PATH, SCORE_PATH, TEST_COLLECTION_MAPPING +from bfcl.eval_checker import eval_runner +from bfcl.model_handler.handler_map import HANDLER_MAP +from dotenv import load_dotenv +from tabulate import tabulate + + +class ExecutionOrderGroup(typer.core.TyperGroup): + def list_commands(self, ctx): + return [ + "models", + "test-categories", + "generate", + "results", + "evaluate", + "scores", + ] + + +cli = typer.Typer( + context_settings=dict(help_option_names=["-h", "--help"]), + no_args_is_help=True, + cls=ExecutionOrderGroup, +) + + +@cli.command() +def test_categories(): + """ + List available test categories. + """ + table = tabulate( + [ + (category, "\n".join(test for test in tests)) + for category, tests in TEST_COLLECTION_MAPPING.items() + ], + headers=["Test category", "Test names"], + tablefmt="grid", + ) + print(table) + + +@cli.command() +def models(): + """ + List available models. + """ + table = tabulate( + [[model] for model in HANDLER_MAP.keys()], + tablefmt="plain", + colalign=("left",), + ) + print(table) + + +@cli.command() +def generate( + model: List[str] = typer.Option( + ["gorilla-openfunctions-v2"], help="A list of model names to evaluate." 
+ ), + test_category: List[str] = typer.Option( + ["all"], help="A list of test categories to run the evaluation on." + ), + api_sanity_check: bool = typer.Option( + False, + "--api-sanity-check", + "-c", + help="Perform the REST API status sanity check before running the evaluation.", + ), + temperature: float = typer.Option( + 0.001, help="The temperature parameter for the model." + ), + include_debugging_log: bool = typer.Option( + False, + help="Include debugging log in the response file to see model's interaction with the state machine.", + ), + num_gpus: int = typer.Option(1, help="The number of GPUs to use."), + num_threads: int = typer.Option(1, help="The number of threads to use."), + gpu_memory_utilization: float = typer.Option( + 0.9, help="The GPU memory utilization." + ), +): + """ + Generate the LLM response for one or more models on a test-category (same as openfunctions_evaluation.py). + """ + generationArgs = namedtuple( + "generationArgs", + [ + "model", + "test_category", + "api_sanity_check", + "temperature", + "include_debugging_log", + "num_gpus", + "num_threads", + "gpu_memory_utilization", + ], + ) + + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file + generation_main( + generationArgs( + model=model, + test_category=test_category, + api_sanity_check=api_sanity_check, + temperature=temperature, + include_debugging_log=include_debugging_log, + num_gpus=num_gpus, + num_threads=num_threads, + gpu_memory_utilization=gpu_memory_utilization, + ) + ) + + +@cli.command() +def results(): + """ + List the results available for evaluation. + """ + + def display_name(name: str): + """ + Undo the / -> _ transformation if it happened. + + Args: + name (str): The name of the model in the result directory. + + Returns: + str: The original name of the model. + """ + if name not in HANDLER_MAP: + candidate = name.replace("_", "/") + if candidate in HANDLER_MAP: + return candidate + print(f"Unknown model name: {name}") + return name + + result_dir = RESULT_PATH + + results_data = [] + for dir in result_dir.iterdir(): + # Check if it is a directory and not a file + if not dir.is_dir(): + continue + + results_data.append( + ( + display_name(dir.name), + datetime.fromtimestamp(dir.stat().st_ctime).strftime( + "%Y-%m-%d %H:%M:%S" + ), + ) + ) + + print( + tabulate( + results_data, + headers=["Model name", "Creation time"], + tablefmt="pretty", + ) + ) + + +@cli.command() +def evaluate( + model: List[str] = typer.Option(None, help="A list of model names to evaluate."), + test_category: List[str] = typer.Option( + None, help="A list of test categories to run the evaluation on." + ), + api_sanity_check: bool = typer.Option( + False, + "--api-sanity-check", + "-c", + help="Perform the REST API status sanity check before running the evaluation.", + ), +): + """ + Evaluate results from run of one or more models on a test-category (same as eval_runner.py). + """ + + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file + eval_runner.main(model, test_category, api_sanity_check) + + +@cli.command() +def scores(): + """ + Display the leaderboard. 
+ """ + + def truncate(text, length=22): + return (text[:length] + "...") if len(text) > length else text + + # files = ["./score/data_non_live.csv", "./score/data_live.csv", "./score/data_overall.csv"] + file = SCORE_PATH / "data_overall.csv" + + selected_columns = [ + "Rank", + "Model", + "Overall Acc", + "Non-Live AST Acc", + "Non-Live Exec Acc", + "Live Acc", + "Multi Turn Acc", + "Relevance Detection", + "Irrelevance Detection", + ] + + if file.exists(): + with open(file, newline="") as csvfile: + reader = csv.reader(csvfile) + headers = next(reader) # Read the header row + column_indices = [headers.index(col) for col in selected_columns] + data = [ + [row[i] for i in column_indices] for row in reader + ] # Read the rest of the data + selected_columns = selected_columns[:-2] + ["Relevance", "Irrelevance"] # Shorten the column names + print(tabulate(data, headers=selected_columns, tablefmt="grid")) + else: + print(f"\nFile {file} not found.\n") + + +if __name__ == "__main__": + cli() diff --git a/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py b/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py new file mode 100644 index 000000000..03af01fe0 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py @@ -0,0 +1,226 @@ +import argparse +import copy +import json +import os +import time +from concurrent.futures import ThreadPoolExecutor + +from bfcl._apply_function_credential_config import apply_function_credential_config +from bfcl.constant import ( + DOTENV_PATH, + PROMPT_PATH, + RESULT_PATH, + TEST_COLLECTION_MAPPING, + TEST_FILE_MAPPING, +) +from bfcl.eval_checker.eval_runner_helper import is_executable +from bfcl.model_handler.handler_map import HANDLER_MAP +from bfcl.model_handler.model_style import ModelStyle +from dotenv import load_dotenv +from tqdm import tqdm + +RETRY_LIMIT = 3 +# 60s for the timer to complete. But often we find that even with 60 there is a conflict. So 65 is a safe no. +RETRY_DELAY = 65 # Delay in seconds + + +def get_args(): + parser = argparse.ArgumentParser() + # Refer to model_choice for supported models. + parser.add_argument( + "--model", type=str, default="gorilla-openfunctions-v2", nargs="+" + ) + # Refer to test_categories for supported categories. + parser.add_argument( + "--test-category", type=str, default="all", nargs="+" + ) + + # Parameters for the model that you want to test. + parser.add_argument("--temperature", type=float, default=0.001) + parser.add_argument("--include-debugging-log", action="store_true", default=False) + parser.add_argument("--num-threads", default=1, type=int) + parser.add_argument("--num-gpus", default=1, type=int) + parser.add_argument("--gpu-memory-utilization", default=0.9, type=float) + args = parser.parse_args() + return args + + +def build_handler(model_name, temperature): + handler = HANDLER_MAP[model_name](model_name, temperature) + return handler + + +def sort_key(entry): + """ + Index comes in two forms: TestCategory_Index or TestCategory_Index-FuncDocSubIndex-PromptSubIndex; both 0-indexed. + + TestCategory_Index: For example, `simple_20` means the 21st entry in the `simple` test category. + + TestCategory_Index-FuncDocSubIndex-PromptSubIndex is used when there are multiple prompts for a single function doc; this only happens in the live dataset. + FuncDocSubIndex increments for each unique function doc. + PromptSubIndex is per function doc. It resets to 0 for each function doc. 
+ For example, `live_simple_19-3-15` means the 20th entry in the `live_simple` test category. + This entry has the 4th unique function doc and the 16th prompt for that function doc (there are at least 15 other prompts for this same function doc in this category). + + In either case, the universal index is enough to sort the entries. + """ + parts = entry["id"].rsplit("_", 1) + test_category, index = parts[0], parts[1] + # This handles the case where the index is in the form TestCategory_Index-FuncDocSubIndex-PromptSubIndex + if "-" in index: + index = index.split("-")[0] + return (test_category, int(index)) + + +def parse_test_category_argument(test_category_args): + test_name_total = set() + test_filename_total = set() + + for test_category in test_category_args: + if test_category in TEST_COLLECTION_MAPPING: + for test_name in TEST_COLLECTION_MAPPING[test_category]: + test_name_total.add(test_name) + test_filename_total.add(TEST_FILE_MAPPING[test_name]) + else: + test_name_total.add(test_category) + test_filename_total.add(TEST_FILE_MAPPING[test_category]) + + return sorted(list(test_name_total)), sorted(list(test_filename_total)) + + +def collect_test_cases(test_filename_total, model_name): + model_name_dir = model_name.replace("/", "_") + model_result_dir = RESULT_PATH / model_name_dir + + test_cases_total = [] + for file_to_open in test_filename_total: + test_cases = [] + with open(PROMPT_PATH / file_to_open) as f: + for line in f: + test_cases.append(json.loads(line)) + + existing_result = [] + result_file_path = model_result_dir / file_to_open.replace(".json", "_result.json") + if result_file_path.exists(): + with open(result_file_path) as f: + for line in f: + existing_result.append(json.loads(line)) + + existing_ids = [entry["id"] for entry in existing_result] + test_cases_total.extend( + [test_case for test_case in test_cases if test_case["id"] not in existing_ids] + ) + + return sorted(test_cases_total, key=sort_key) + + +def multi_threaded_inference(handler, test_case, include_debugging_log): + + assert type(test_case["function"]) is list + + retry_count = 0 + + while True: + try: + result, metadata = handler.inference(copy.deepcopy(test_case), include_debugging_log) + break # Success, exit the loop + except Exception as e: + # TODO: It might be better to handle the exception in the handler itself rather than a universal catch block here, as each handler uses different ways to call the endpoint. + # OpenAI has openai.RateLimitError while Anthropic has anthropic.RateLimitError. It would be more robust in the long run. + if retry_count < RETRY_LIMIT and ( + "rate limit reached" in str(e).lower() + or (hasattr(e, "status_code") and (e.status_code in {429, 503, 500})) + ): + print( + f"Rate limit reached. Sleeping for 65 seconds. Retry {retry_count + 1}/{RETRY_LIMIT}" + ) + time.sleep(RETRY_DELAY) + retry_count += 1 + else: + # This is usually the case when the model gets stuck on one particular test case. + # For example, timeout error or FC model returning invalid JSON response. + # Since temperature is already set to 0.001, retrying the same test case will not help. + # So we continue the generation process and record the error message as the model response + print("-" * 100) + print( + "❗️❗️ Error occurred during inference. Maximum retries reached for rate limit or other error. Continuing to next test case."
+ ) + print(f"❗️❗️ Test case ID: {test_case['id']}, Error: {str(e)}") + print("-" * 100) + + return { + "id": test_case["id"], + "result": f"Error during inference: {str(e)}", + } + + result_to_write = { + "id": test_case["id"], + "result": result, + } + + result_to_write.update(metadata) + + return result_to_write + + +def generate_results(args, model_name, test_cases_total): + + handler = build_handler(model_name, args.temperature) + + if handler.model_style == ModelStyle.OSSMODEL: + # batch_inference will handle the writing of results + handler.batch_inference( + test_entries=test_cases_total, + num_gpus=args.num_gpus, + gpu_memory_utilization=args.gpu_memory_utilization, + include_debugging_log=args.include_debugging_log, + ) + + else: + futures = [] + with ThreadPoolExecutor(max_workers=args.num_threads) as executor: + with tqdm( + total=len(test_cases_total), desc=f"Generating results for {model_name}" + ) as pbar: + + for test_case in test_cases_total: + future = executor.submit(multi_threaded_inference, handler, test_case, args.include_debugging_log) + futures.append(future) + + for future in futures: + # This will wait for the task to complete, so that we are always writing in order + result = future.result() + handler.write(result) + pbar.update() + + +def main(args): + + if type(args.model) is not list: + args.model = [args.model] + if type(args.test_category) is not list: + args.test_category = [args.test_category] + + test_name_total, test_filename_total = parse_test_category_argument(args.test_category) + + print(f"Generating results for {args.model} on test category: {test_name_total}.") + + # Apply function credential config if any of the test categories are executable + if any([is_executable(category) for category in test_name_total]): + apply_function_credential_config(input_path=PROMPT_PATH) + + for model_name in args.model: + if ( + os.getenv("USE_COHERE_OPTIMIZATION") == "True" + and "command-r-plus" in model_name + ): + model_name = model_name + "-optimized" + + test_cases_total = collect_test_cases(test_filename_total, model_name) + + if len(test_cases_total) == 0: + print( + f"All selected test cases have been previously generated for {model_name}. No new test cases to generate." + ) + else: + generate_results(args, model_name, test_cases_total) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py index 42904a22d..6c87c74db 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -602,6 +602,28 @@ def runner(model_names, test_categories, api_sanity_check): ) + +def main(model, test_category, api_sanity_check): + test_categories = None + if test_category is not None: + test_categories = [] + for category in test_category: + if category in TEST_COLLECTION_MAPPING: + test_categories.extend(TEST_COLLECTION_MAPPING[category]) + else: + test_categories.append(category) + + model_names = None + if model is not None: + model_names = [] + for model_name in model: + # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. + # This is different than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). + # We patch it here to avoid confusing the user.
+ model_names.append(model_name.replace("/", "_")) + + runner(model_names, test_categories, api_sanity_check) + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Process two lists of strings.") @@ -625,24 +647,6 @@ def runner(model_names, test_categories, api_sanity_check): args = parser.parse_args() - api_sanity_check = args.api_sanity_check - test_categories = None - if args.test_category is not None: - test_categories = [] - for test_category in args.test_category: - if test_category in TEST_COLLECTION_MAPPING: - test_categories.extend(TEST_COLLECTION_MAPPING[test_category]) - else: - test_categories.append(test_category) - - model_names = args.model - if args.model is not None: - model_names = [] - for model_name in args.model: - # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. - # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). - # We patch it here to avoid confusing the user.
- model_names.append(model_name.replace("/", "_")) - load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file - runner(model_names, test_categories, api_sanity_check) + + main(args.model, args.test_category, args.api_sanity_check) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index 3ec38d16d..29525d5dd 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -11,7 +11,7 @@ from bfcl.eval_checker.constant import * from bfcl.eval_checker.executable_eval.custom_exception import BadAPIStatusError from bfcl.eval_checker.model_metadata import * -from bfcl.model_handler.handler_map import handler_map +from bfcl.model_handler.handler_map import HANDLER_MAP from tqdm import tqdm @@ -77,7 +77,7 @@ def load_file(file_path): def get_handler(model_name): - return handler_map[model_name](model_name, temperature=0) #Temperature doesn't matter for evaluation + return HANDLER_MAP[model_name](model_name, temperature=0) #Temperature doesn't matter for evaluation def write_list_of_dicts_to_file(filename, data, subdir=None): diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py index c178dcefa..36e2dd968 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/executable_eval/executable_checker.py @@ -1,5 +1,6 @@ import json import time +from functools import lru_cache import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. 
from bfcl.eval_checker.constant import ( @@ -9,12 +10,15 @@ from bfcl.eval_checker.executable_eval.custom_exception import NoAPIKeyError # Load the ground truth data for the `rest` test category -with open(REST_EVAL_GROUND_TRUTH_PATH, "r") as f: - EVAL_GROUND_TRUTH = f.readlines() - +@lru_cache(maxsize=1) # cache the result, effectively loading data once +def load_eval_ground_truth(): + with open(REST_EVAL_GROUND_TRUTH_PATH, "r") as f: + return f.readlines() #### Main function #### def executable_checker_rest(func_call, idx): + EVAL_GROUND_TRUTH = load_eval_ground_truth() + if "https://geocode.maps.co" in func_call: time.sleep(2) if "requests_get" in func_call: diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py index aa7e09a91..9ded1e7d5 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py @@ -147,4 +147,4 @@ # "deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler, } -handler_map = {**api_inference_handler_map, **local_inference_handler_map} +HANDLER_MAP = {**api_inference_handler_map, **local_inference_handler_map} diff --git a/berkeley-function-call-leaderboard/openfunctions_evaluation.py b/berkeley-function-call-leaderboard/openfunctions_evaluation.py index f830ce995..d8bc79904 100644 --- a/berkeley-function-call-leaderboard/openfunctions_evaluation.py +++ b/berkeley-function-call-leaderboard/openfunctions_evaluation.py @@ -1,229 +1,11 @@ -import argparse -import copy -import json -import os -import time -from concurrent.futures import ThreadPoolExecutor - -from bfcl._apply_function_credential_config import apply_function_credential_config -from bfcl.constant import ( - DOTENV_PATH, - PROMPT_PATH, - RESULT_PATH, - TEST_COLLECTION_MAPPING, - TEST_FILE_MAPPING, -) -from bfcl.eval_checker.eval_runner_helper import is_executable -from bfcl.model_handler.handler_map import handler_map -from bfcl.model_handler.model_style import ModelStyle +from bfcl._llm_response_generation import get_args, main +from bfcl.constant import DOTENV_PATH from dotenv import load_dotenv -from tqdm import tqdm - -RETRY_LIMIT = 3 -# 60s for the timer to complete. But often we find that even with 60 there is a conflict. So 65 is a safe no. -RETRY_DELAY = 65 # Delay in seconds - - -def get_args(): - parser = argparse.ArgumentParser() - # Refer to model_choice for supported models. - parser.add_argument( - "--model", type=str, default="gorilla-openfunctions-v2", nargs="+" - ) - # Refer to test_categories for supported categories. - parser.add_argument( - "--test-category", type=str, default="all", nargs="+" - ) - - # Parameters for the model that you want to test. - parser.add_argument("--temperature", type=float, default=0.001) - parser.add_argument("--include-debugging-log", action="store_true", default=False) - parser.add_argument("--num-threads", default=1, type=int) - parser.add_argument("--num-gpus", default=1, type=int) - parser.add_argument("--gpu-memory-utilization", default=0.9, type=float) - args = parser.parse_args() - return args - - -def build_handler(model_name, temperature): - handler = handler_map[model_name](model_name, temperature) - return handler - - -def sort_key(entry): - """ - Index comes in two forms: TestCategory_Index or TestCategory_Index-FuncDocSubIndex-PromptSubIndex; both 0-indexed. 
- - TestCategory_Index: For example, `simple_20` means the 21st entry in the `simple` test category. - - TestCategory_Index-FuncDocSubIndex-PromptSubIndex is used when there are multiple prompts for a single function doc; this only happens in the live dataset. - FuncDocSubIndex increments for each unique function doc. - PromptSubIndex is per function doc. It resets to 0 for each function doc. - For example, `live_simple_19-3-15` means the 20th entry in the `live_simple` test category. - This entry has the 4th unique function doc and the 16th prompt for that function doc (there are at least 15 other prompts for this same function doc in this category). - - In either case, the universal index is enough to sort the entries. - """ - parts = entry["id"].rsplit("_", 1) - test_category, index = parts[0], parts[1] - # This handles the case where the index is in the form TestCategory_Index-FuncDocSubIndex-PromptSubIndex - if "-" in index: - index = index.split("-")[0] - return (test_category, int(index)) - - -def parse_test_category_argument(test_category_args): - test_name_total = set() - test_filename_total = set() - - for test_category in test_category_args: - if test_category in TEST_COLLECTION_MAPPING: - for test_name in TEST_COLLECTION_MAPPING[test_category]: - test_name_total.add(test_name) - test_filename_total.add(TEST_FILE_MAPPING[test_name]) - else: - test_name_total.add(test_category) - test_filename_total.add(TEST_FILE_MAPPING[test_category]) - - return sorted(list(test_name_total)), sorted(list(test_filename_total)) - - -def collect_test_cases(test_filename_total, model_name): - model_name_dir = model_name.replace("/", "_") - model_result_dir = RESULT_PATH / model_name_dir - - test_cases_total = [] - for file_to_open in test_filename_total: - test_cases = [] - with open(PROMPT_PATH / file_to_open) as f: - for line in f: - test_cases.append(json.loads(line)) - - existing_result = [] - result_file_path = model_result_dir / file_to_open.replace(".json", "_result.json") - if result_file_path.exists(): - with open(result_file_path) as f: - for line in f: - existing_result.append(json.loads(line)) - - existing_ids = [entry["id"] for entry in existing_result] - test_cases_total.extend( - [test_case for test_case in test_cases if test_case["id"] not in existing_ids] - ) - - return sorted(test_cases_total, key=sort_key) - - -def multi_threaded_inference(handler, test_case, include_debugging_log): - - assert type(test_case["function"]) is list - - retry_count = 0 - - while True: - try: - result, metadata = handler.inference(copy.deepcopy(test_case), include_debugging_log) - break # Success, exit the loop - except Exception as e: - # TODO: It might be better to handle the exception in the handler itself rather than a universal catch block here, as each handler use different ways to call the endpoint. - # OpenAI has openai.RateLimitError while Anthropic has anthropic.RateLimitError. It would be more robust in the long run. - if retry_count < RETRY_LIMIT and ( - "rate limit reached" in str(e).lower() - or (hasattr(e, "status_code") and (e.status_code in {429, 503, 500})) - ): - print( - f"Rate limit reached. Sleeping for 65 seconds. Retry {retry_count + 1}/{RETRY_LIMIT}" - ) - time.sleep(RETRY_DELAY) - retry_count += 1 - else: - # This is usually the case when the model getting stuck on one particular test case. - # For example, timeout error or FC model returning invalid JSON response. - # Since temperature is already set to 0.001, retrying the same test case will not help. 
- # So we continue the generation process and record the error message as the model response - print("-" * 100) - print( - "❗️❗️ Error occurred during inference. Maximum reties reached for rate limit or other error. Continuing to next test case." - ) - print(f"❗️❗️ Test case ID: {test_case['id']}, Error: {str(e)}") - print("-" * 100) - - return { - "id": test_case["id"], - "result": f"Error during inference: {str(e)}", - } - - result_to_write = { - "id": test_case["id"], - "result": result, - } - - result_to_write.update(metadata) - - return result_to_write - - -def generate_results(args, model_name, test_cases_total): - - handler = build_handler(model_name, args.temperature) - - if handler.model_style == ModelStyle.OSSMODEL: - # batch_inference will handle the writing of results - handler.batch_inference( - test_entries=test_cases_total, - num_gpus=args.num_gpus, - gpu_memory_utilization=args.gpu_memory_utilization, - include_debugging_log=args.include_debugging_log, - ) - - else: - futures = [] - with ThreadPoolExecutor(max_workers=args.num_threads) as executor: - with tqdm( - total=len(test_cases_total), desc=f"Generating results for {model_name}" - ) as pbar: - - for test_case in test_cases_total: - future = executor.submit(multi_threaded_inference, handler, test_case, args.include_debugging_log) - futures.append(future) - - for future in futures: - # This will wait for the task to complete, so that we are always writing in order - result = future.result() - handler.write(result) - pbar.update() - +# Note: This file is still kept for compatibility with the old structure of the codebase. +# It is recommended to use the new `bfcl xxx` cli commands instead. +# We will remove this in the next major release. if __name__ == "__main__": load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file - args = get_args() - - if type(args.model) is not list: - args.model = [args.model] - if type(args.test_category) is not list: - args.test_category = [args.test_category] - - test_name_total, test_filename_total = parse_test_category_argument(args.test_category) - - print(f"Generating results for {args.model} on test category: {test_name_total}.") - - # Apply function credential config if any of the test categories are executable - if any([is_executable(category) for category in test_name_total]): - apply_function_credential_config(input_path=PROMPT_PATH) - - for model_name in args.model: - if ( - os.getenv("USE_COHERE_OPTIMIZATION") == "True" - and "command-r-plus" in model_name - ): - model_name = model_name + "-optimized" - - test_cases_total = collect_test_cases(test_filename_total, model_name) - - if len(test_cases_total) == 0: - print( - f"All selected test cases have been previously generated for {model_name}. No new test cases to generate." 
- ) - else: - generate_results(args, model_name, test_cases_total) + main(get_args()) diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index 9187a2b34..0d05f8f33 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "tqdm", "numpy==1.26.4", "pandas", + "pathlib", "huggingface_hub", "pydantic>=2.8.2", "python-dotenv>=1.0.1", @@ -27,10 +28,14 @@ dependencies = [ "mistralai==1.1.0", "anthropic==0.31.1", "cohere==5.5.8", + "typer>=0.12.5", + "tabulate>=0.9.0", "google-cloud-aiplatform>=1.70.0", - "pathlib", ] +[project.scripts] +bfcl = "bfcl.__main__:cli" + [tool.setuptools.packages.find] include = ["bfcl*"]
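With the `[project.scripts]` entry above, installing the package exposes the CLI defined in `bfcl/__main__.py` as a `bfcl` console command. A minimal usage sketch, assuming an editable install from the `berkeley-function-call-leaderboard` directory; the model name and the `simple` test category are illustrative placeholders:

```bash
# Install the package so the `bfcl` entry point is available on PATH
pip install -e .

# Inspect what the harness supports
bfcl models
bfcl test-categories

# Generate model responses, then score them (mirrors the README's generate/evaluate workflow)
bfcl generate --model gorilla-openfunctions-v2 --test-category simple --num-threads 4
bfcl evaluate --model gorilla-openfunctions-v2 --test-category simple

# List generated result folders and display the leaderboard table
bfcl results
bfcl scores
```

The `ExecutionOrderGroup` class in `__main__.py` orders the `--help` listing to follow this same workflow.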