From fc41e2e3ccffaee90ec5cb20448fdab655a312dc Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 4 Sep 2024 12:34:44 -0400 Subject: [PATCH 01/16] avoid initializing EVAL_GROUND_TRUTH on module load --- .../bfcl/eval_checker/checker.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py index 48fd9618d..451de9a70 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py @@ -9,6 +9,7 @@ import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. import time import json +from functools import lru_cache # We switch to conditional import for the following two imports to avoid unnecessary installations. # User doesn't need to setup the tree-sitter packages if they are not running the test for that language. @@ -36,8 +37,12 @@ EVAL_GROUND_TRUTH_PATH = ( "./rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution ) -with open(EVAL_GROUND_TRUTH_PATH, "r") as f: - EVAL_GROUND_TRUTH = f.readlines() + + +@lru_cache(maxsize=1) # cache the result, effectively loading data once +def load_eval_ground_truth(): + with open(EVAL_GROUND_TRUTH_PATH, "r") as f: + return f.readlines() #### Helper functions for AST #### @@ -831,6 +836,8 @@ def executable_checker_parallel_no_order( #### Main function #### def executable_checker_rest(func_call, idx): + EVAL_GROUND_TRUTH = load_eval_ground_truth() + if "https://geocode.maps.co" in func_call: time.sleep(2) if "requests_get" in func_call: From 05dbf6f35c2f2a3a390cb5f7bd8f55515e66748c Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 4 Sep 2024 12:35:53 -0400 Subject: [PATCH 02/16] allow loading from eval_checker --- .../bfcl/eval_checker/checker.py | 8 ++++++-- .../bfcl/eval_checker/eval_runner.py | 14 ++++++++++---- .../bfcl/eval_checker/eval_runner_helper.py | 5 ++++- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py index 451de9a70..0021d5abe 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py @@ -3,8 +3,12 @@ JAVA_TYPE_CONVERSION, JS_TYPE_CONVERSION, ) -from eval_checker_constant import REAL_TIME_MATCH_ALLOWED_DIFFERENCE -from custom_exception import NoAPIKeyError +try: + from .eval_checker_constant import REAL_TIME_MATCH_ALLOWED_DIFFERENCE + from .custom_exception import NoAPIKeyError +except ImportError: + from eval_checker_constant import REAL_TIME_MATCH_ALLOWED_DIFFERENCE + from custom_exception import NoAPIKeyError import re import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. 
import time diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py index 1638f1a89..56941fbbe 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -1,7 +1,13 @@ -from checker import ast_checker, exec_checker, executable_checker_rest -from custom_exception import BadAPIStatusError -from eval_runner_helper import * -from eval_checker_constant import TEST_COLLECTION_MAPPING +try: + from .checker import ast_checker, exec_checker, executable_checker_rest + from .custom_exception import BadAPIStatusError + from .eval_runner_helper import * + from .eval_checker_constant import TEST_COLLECTION_MAPPING +except ImportError: + from checker import ast_checker, exec_checker, executable_checker_rest + from custom_exception import BadAPIStatusError + from eval_runner_helper import * + from eval_checker_constant import TEST_COLLECTION_MAPPING from tqdm import tqdm import argparse diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index 62a67203c..ada62db7b 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -5,7 +5,10 @@ import subprocess import re import numpy as np -from custom_exception import BadAPIStatusError +try: + from .custom_exception import BadAPIStatusError +except ImportError: + from custom_exception import BadAPIStatusError from bfcl.model_handler.handler_map import handler_map from tqdm import tqdm From 66357dd6d698eeb94f96ad126850b110a862d688 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 4 Sep 2024 12:37:33 -0400 Subject: [PATCH 03/16] move openfunctions_evaluation.py into the bfcl package --- .../bfcl/_openfunctions_evaluation.py | 223 +++++++++++++++++ .../openfunctions_evaluation.py | 225 +----------------- 2 files changed, 225 insertions(+), 223 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py diff --git a/berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py b/berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py new file mode 100644 index 000000000..a2911f096 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py @@ -0,0 +1,223 @@ +import argparse, json, os, time +from tqdm import tqdm +from bfcl.model_handler.handler_map import handler_map +from bfcl.model_handler.model_style import ModelStyle +from bfcl.model_handler.constant import USE_COHERE_OPTIMIZATION +from bfcl.eval_checker.eval_checker_constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING +from concurrent.futures import ThreadPoolExecutor + +RETRY_LIMIT = 3 +# 60s for the timer to complete. But often we find that even with 60 there is a conflict. So 65 is a safe no. +RETRY_DELAY = 65 # Delay in seconds + +def get_args(): + parser = argparse.ArgumentParser() + # Refer to model_choice for supported models. + parser.add_argument("--model", type=str, default="gorilla-openfunctions-v2", nargs="+") + # Refer to test_categories for supported categories. + parser.add_argument("--test-category", type=str, default="all", nargs="+") + + # Parameters for the model that you want to test. 
+ parser.add_argument("--temperature", type=float, default=0.001) + parser.add_argument("--top-p", type=float, default=1) + parser.add_argument("--max-tokens", type=int, default=1200) + parser.add_argument("--num-gpus", default=1, type=int) + parser.add_argument("--timeout", default=60, type=int) + parser.add_argument("--num-threads", default=1, type=int) + parser.add_argument("--gpu-memory-utilization", default=0.9, type=float) + args = parser.parse_args() + return args + + +def build_handler(model_name, temperature, top_p, max_tokens): + handler = handler_map[model_name](model_name, temperature, top_p, max_tokens) + return handler + + +def sort_key(entry): + """ + Index comes in two forms: TestCategory_Index or TestCategory_Index-FuncDocSubIndex-PromptSubIndex; both 0-indexed. + + TestCategory_Index: For example, `simple_20` means the 21st entry in the `simple` test category. + + TestCategory_Index-FuncDocSubIndex-PromptSubIndex is used when there are multiple prompts for a single function doc; this only happens in the live dataset. + FuncDocSubIndex increments for each unique function doc. + PromptSubIndex is per function doc. It resets to 0 for each function doc. + For example, `live_simple_19-3-15` means the 20th entry in the `live_simple` test category. + This entry has the 4th unique function doc and the 16th prompt for that function doc (there are at least 15 other prompts for this same function doc in this category). + + In either case, the universal index is enough to sort the entries. + """ + parts = entry["id"].rsplit("_", 1) + test_category, index = parts[0], parts[1] + # This handles the case where the index is in the form TestCategory_Index-FuncDocSubIndex-PromptSubIndex + if "-" in index: + index = index.split("-")[0] + return (test_category, int(index)) + + +def parse_test_category_argument(test_category_args): + test_name_total = set() + test_filename_total = set() + + for test_category in test_category_args: + if test_category in TEST_COLLECTION_MAPPING: + for test_name in TEST_COLLECTION_MAPPING[test_category]: + test_name_total.add(test_name) + test_filename_total.add(TEST_FILE_MAPPING[test_name]) + else: + test_name_total.add(test_category) + test_filename_total.add(TEST_FILE_MAPPING[test_category]) + + return sorted(list(test_name_total)), sorted(list(test_filename_total)) + + +def collect_test_cases(test_filename_total, model_name): + model_name_dir = model_name.replace("/", "_") + test_cases_total = [] + for file_to_open in test_filename_total: + test_cases = [] + with open("./data/" + file_to_open) as f: + for line in f: + test_cases.append(json.loads(line)) + + existing_result = [] + if os.path.exists( + "./result/" + + model_name_dir + + "/" + + file_to_open.replace(".json", "_result.json") + ): + with open( + "./result/" + + model_name_dir + + "/" + + file_to_open.replace(".json", "_result.json") + ) as f: + for line in f: + existing_result.append(json.loads(line)) + + existing_ids = [entry["id"] for entry in existing_result] + test_cases_total.extend( + [ + test_case + for test_case in test_cases + if test_case["id"] not in existing_ids + ] + ) + + return sorted(test_cases_total, key=sort_key) + + +def multi_threaded_inference(handler, test_case): + user_question, functions, test_category = ( + test_case["question"], + test_case["function"], + test_case["id"].rsplit("_", 1)[0], + ) + if type(functions) is dict or type(functions) is str: + functions = [functions] + + retry_count = 0 + + while True: + try: + result, metadata = handler.inference( + user_question, 
functions, test_category + ) + break # Success, exit the loop + except Exception as e: + # TODO: It might be better to handle the exception in the handler itself rather than a universal catch block here, as each handler use different ways to call the endpoint. + # OpenAI has openai.RateLimitError while Anthropic has anthropic.RateLimitError. It would be more robust in the long run. + if retry_count < RETRY_LIMIT and ( + "rate limit reached" in str(e).lower() + or (hasattr(e, "status_code") and (e.status_code in {429, 503, 500})) + ): + print( + f"Rate limit reached. Sleeping for 65 seconds. Retry {retry_count + 1}/{RETRY_LIMIT}" + ) + time.sleep(RETRY_DELAY) + retry_count += 1 + else: + # This is usually the case when the model getting stuck on one particular test case. + # For example, timeout error or FC model returning invalid JSON response. + # Since temperature is already set to 0.001, retrying the same test case will not help. + # So we continue the generation process and record the error message as the model response + print("-" * 100) + print( + "❗️❗️ Error occurred during inference. Maximum reties reached for rate limit or other error. Continuing to next test case." + ) + print(f"❗️❗️ Test case ID: {test_case['id']}, Error: {str(e)}") + print("-" * 100) + + return { + "id": test_case["id"], + "result": f"Error during inference: {str(e)}", + } + + result_to_write = { + "id": test_case["id"], + "result": result, + } + + result_to_write.update(metadata) + + return result_to_write + + +def generate_results(args, model_name, test_cases_total): + + handler = build_handler(model_name, args.temperature, args.top_p, args.max_tokens) + + if handler.model_style == ModelStyle.OSSMODEL: + results, processed_messages = handler.inference( + test_question=test_cases_total, + num_gpus=args.num_gpus, + gpu_memory_utilization=args.gpu_memory_utilization, + ) + for test_case, result, processed_message in zip(test_cases_total, results, processed_messages): + result_to_write = {"id": test_case["id"], "result": result, "processed_message": processed_message} + handler.write(result_to_write) + + else: + futures = [] + with ThreadPoolExecutor(max_workers=args.num_threads) as executor: + with tqdm( + total=len(test_cases_total), desc=f"Generating results for {model_name}" + ) as pbar: + + for test_case in test_cases_total: + future = executor.submit( + multi_threaded_inference, handler, test_case + ) + futures.append(future) + + for future in futures: + # This will wait for the task to complete, so that we are always writing in order + result = future.result() + handler.write(result) + pbar.update() + + +def main(args): + if type(args.model) is not list: + args.model = [args.model] + if type(args.test_category) is not list: + args.test_category = [args.test_category] + + test_name_total, test_filename_total = parse_test_category_argument(args.test_category) + + print(f"Generating results for {args.model} on test category: {test_name_total}.") + + for model_name in args.model: + if USE_COHERE_OPTIMIZATION and "command-r-plus" in model_name: + model_name = model_name + "-optimized" + + test_cases_total = collect_test_cases(test_filename_total, model_name) + + if len(test_cases_total) == 0: + print( + f"All selected test cases have been previously generated for {model_name}. No new test cases to generate." 
+ ) + else: + generate_results(args, model_name, test_cases_total) diff --git a/berkeley-function-call-leaderboard/openfunctions_evaluation.py b/berkeley-function-call-leaderboard/openfunctions_evaluation.py index b7658959f..985fc1b4b 100644 --- a/berkeley-function-call-leaderboard/openfunctions_evaluation.py +++ b/berkeley-function-call-leaderboard/openfunctions_evaluation.py @@ -1,225 +1,4 @@ -import argparse, json, os, time -from tqdm import tqdm -from bfcl.model_handler.handler_map import handler_map -from bfcl.model_handler.model_style import ModelStyle -from bfcl.model_handler.constant import USE_COHERE_OPTIMIZATION -from bfcl.eval_checker.eval_checker_constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING -from concurrent.futures import ThreadPoolExecutor - -RETRY_LIMIT = 3 -# 60s for the timer to complete. But often we find that even with 60 there is a conflict. So 65 is a safe no. -RETRY_DELAY = 65 # Delay in seconds - -def get_args(): - parser = argparse.ArgumentParser() - # Refer to model_choice for supported models. - parser.add_argument("--model", type=str, default="gorilla-openfunctions-v2", nargs="+") - # Refer to test_categories for supported categories. - parser.add_argument("--test-category", type=str, default="all", nargs="+") - - # Parameters for the model that you want to test. - parser.add_argument("--temperature", type=float, default=0.001) - parser.add_argument("--top-p", type=float, default=1) - parser.add_argument("--max-tokens", type=int, default=1200) - parser.add_argument("--num-gpus", default=1, type=int) - parser.add_argument("--timeout", default=60, type=int) - parser.add_argument("--num-threads", default=1, type=int) - parser.add_argument("--gpu-memory-utilization", default=0.9, type=float) - args = parser.parse_args() - return args - - -def build_handler(model_name, temperature, top_p, max_tokens): - handler = handler_map[model_name](model_name, temperature, top_p, max_tokens) - return handler - - -def sort_key(entry): - """ - Index comes in two forms: TestCategory_Index or TestCategory_Index-FuncDocSubIndex-PromptSubIndex; both 0-indexed. - - TestCategory_Index: For example, `simple_20` means the 21st entry in the `simple` test category. - - TestCategory_Index-FuncDocSubIndex-PromptSubIndex is used when there are multiple prompts for a single function doc; this only happens in the live dataset. - FuncDocSubIndex increments for each unique function doc. - PromptSubIndex is per function doc. It resets to 0 for each function doc. - For example, `live_simple_19-3-15` means the 20th entry in the `live_simple` test category. - This entry has the 4th unique function doc and the 16th prompt for that function doc (there are at least 15 other prompts for this same function doc in this category). - - In either case, the universal index is enough to sort the entries. 
- """ - parts = entry["id"].rsplit("_", 1) - test_category, index = parts[0], parts[1] - # This handles the case where the index is in the form TestCategory_Index-FuncDocSubIndex-PromptSubIndex - if "-" in index: - index = index.split("-")[0] - return (test_category, int(index)) - - -def parse_test_category_argument(test_category_args): - test_name_total = set() - test_filename_total = set() - - for test_category in test_category_args: - if test_category in TEST_COLLECTION_MAPPING: - for test_name in TEST_COLLECTION_MAPPING[test_category]: - test_name_total.add(test_name) - test_filename_total.add(TEST_FILE_MAPPING[test_name]) - else: - test_name_total.add(test_category) - test_filename_total.add(TEST_FILE_MAPPING[test_category]) - - return sorted(list(test_name_total)), sorted(list(test_filename_total)) - - -def collect_test_cases(test_filename_total, model_name): - model_name_dir = model_name.replace("/", "_") - test_cases_total = [] - for file_to_open in test_filename_total: - test_cases = [] - with open("./data/" + file_to_open) as f: - for line in f: - test_cases.append(json.loads(line)) - - existing_result = [] - if os.path.exists( - "./result/" - + model_name_dir - + "/" - + file_to_open.replace(".json", "_result.json") - ): - with open( - "./result/" - + model_name_dir - + "/" - + file_to_open.replace(".json", "_result.json") - ) as f: - for line in f: - existing_result.append(json.loads(line)) - - existing_ids = [entry["id"] for entry in existing_result] - test_cases_total.extend( - [ - test_case - for test_case in test_cases - if test_case["id"] not in existing_ids - ] - ) - - return sorted(test_cases_total, key=sort_key) - - -def multi_threaded_inference(handler, test_case): - user_question, functions, test_category = ( - test_case["question"], - test_case["function"], - test_case["id"].rsplit("_", 1)[0], - ) - if type(functions) is dict or type(functions) is str: - functions = [functions] - - retry_count = 0 - - while True: - try: - result, metadata = handler.inference( - user_question, functions, test_category - ) - break # Success, exit the loop - except Exception as e: - # TODO: It might be better to handle the exception in the handler itself rather than a universal catch block here, as each handler use different ways to call the endpoint. - # OpenAI has openai.RateLimitError while Anthropic has anthropic.RateLimitError. It would be more robust in the long run. - if retry_count < RETRY_LIMIT and ( - "rate limit reached" in str(e).lower() - or (hasattr(e, "status_code") and (e.status_code in {429, 503, 500})) - ): - print( - f"Rate limit reached. Sleeping for 65 seconds. Retry {retry_count + 1}/{RETRY_LIMIT}" - ) - time.sleep(RETRY_DELAY) - retry_count += 1 - else: - # This is usually the case when the model getting stuck on one particular test case. - # For example, timeout error or FC model returning invalid JSON response. - # Since temperature is already set to 0.001, retrying the same test case will not help. - # So we continue the generation process and record the error message as the model response - print("-" * 100) - print( - "❗️❗️ Error occurred during inference. Maximum reties reached for rate limit or other error. Continuing to next test case." 
- ) - print(f"❗️❗️ Test case ID: {test_case['id']}, Error: {str(e)}") - print("-" * 100) - - return { - "id": test_case["id"], - "result": f"Error during inference: {str(e)}", - } - - result_to_write = { - "id": test_case["id"], - "result": result, - } - - result_to_write.update(metadata) - - return result_to_write - - -def generate_results(args, model_name, test_cases_total): - - handler = build_handler(model_name, args.temperature, args.top_p, args.max_tokens) - - if handler.model_style == ModelStyle.OSSMODEL: - results, processed_messages = handler.inference( - test_question=test_cases_total, - num_gpus=args.num_gpus, - gpu_memory_utilization=args.gpu_memory_utilization, - ) - for test_case, result, processed_message in zip(test_cases_total, results, processed_messages): - result_to_write = {"id": test_case["id"], "result": result, "processed_message": processed_message} - handler.write(result_to_write) - - else: - futures = [] - with ThreadPoolExecutor(max_workers=args.num_threads) as executor: - with tqdm( - total=len(test_cases_total), desc=f"Generating results for {model_name}" - ) as pbar: - - for test_case in test_cases_total: - future = executor.submit( - multi_threaded_inference, handler, test_case - ) - futures.append(future) - - for future in futures: - # This will wait for the task to complete, so that we are always writing in order - result = future.result() - handler.write(result) - pbar.update() - +from bfcl._openfunctions_evaluation import main, get_args if __name__ == "__main__": - args = get_args() - - if type(args.model) is not list: - args.model = [args.model] - if type(args.test_category) is not list: - args.test_category = [args.test_category] - - test_name_total, test_filename_total = parse_test_category_argument(args.test_category) - - print(f"Generating results for {args.model} on test category: {test_name_total}.") - - for model_name in args.model: - if USE_COHERE_OPTIMIZATION and "command-r-plus" in model_name: - model_name = model_name + "-optimized" - - test_cases_total = collect_test_cases(test_filename_total, model_name) - - if len(test_cases_total) == 0: - print( - f"All selected test cases have been previously generated for {model_name}. No new test cases to generate." 
- ) - else: - generate_results(args, model_name, test_cases_total) + main(get_args()) From 076f617243fd1ca1f135a4e8acb6fdb47bbde9a3 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 4 Sep 2024 12:39:24 -0400 Subject: [PATCH 04/16] refactor eval_runner entry to separate arg parsing from main() --- .../bfcl/eval_checker/eval_runner.py | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py index 56941fbbe..abbbb2819 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -452,6 +452,28 @@ def runner(model_names, test_categories, api_sanity_check): LEADERBOARD_TABLE = {} +def main(model, test_category, api_sanity_check): + test_categories = None + if test_category is not None: + test_categories = [] + for category in test_category: + if category in TEST_COLLECTION_MAPPING: + test_categories.extend(TEST_COLLECTION_MAPPING[category]) + else: + test_categories.append(category) + + model_names = None + if model is not None: + model_names = [] + for model_name in model: + # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. + # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). + # We patch it here to avoid confusing the user. + model_names.append(model_name.replace("/", "_")) + + runner(model_names, test_categories, api_sanity_check) + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Process two lists of strings.") @@ -475,23 +497,4 @@ def runner(model_names, test_categories, api_sanity_check): args = parser.parse_args() - api_sanity_check = args.api_sanity_check - test_categories = None - if args.test_category is not None: - test_categories = [] - for test_category in args.test_category: - if test_category in TEST_COLLECTION_MAPPING: - test_categories.extend(TEST_COLLECTION_MAPPING[test_category]) - else: - test_categories.append(test_category) - - model_names = args.model - if args.model is not None: - model_names = [] - for model_name in args.model: - # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. - # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). - # We patch it here to avoid confusing the user. 
- model_names.append(model_name.replace("/", "_")) - - runner(model_names, test_categories, api_sanity_check) + main(args.model, args.test_category, args.api_sanity_check) From e3721728bc81f95dcabcb901dd46e2f2928a2d7c Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 4 Sep 2024 12:45:45 -0400 Subject: [PATCH 05/16] add basic bfcl cli --- .../bfcl/__main__.py | 106 ++++++++++++++++++ .../pyproject.toml | 4 + 2 files changed, 110 insertions(+) create mode 100644 berkeley-function-call-leaderboard/bfcl/__main__.py diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py new file mode 100644 index 000000000..b140e6142 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -0,0 +1,106 @@ +from typing import List +from collections import namedtuple +import typer +from bfcl._openfunctions_evaluation import main as openfunctions_main +from bfcl.eval_checker import eval_runner +import os + +cli = typer.Typer( + context_settings=dict(help_option_names=["-h", "--help"]), + no_args_is_help=True, +) + + +@cli.command() +def run( + model: List[str] = typer.Option( + ["gorilla-openfunctions-v2"], help="A list of model names to evaluate." + ), + test_category: List[str] = typer.Option( + ["all"], help="A list of test categories to run the evaluation on." + ), + api_sanity_check: bool = typer.Option( + False, + "--api-sanity-check", + "-c", + help="Perform the REST API status sanity check before running the evaluation.", + ), + temperature: float = typer.Option( + 0.001, help="The temperature parameter for the model." + ), + top_p: float = typer.Option(1.0, help="The top-p parameter for the model."), + max_tokens: int = typer.Option( + 1200, help="The maximum number of tokens for the model." + ), + num_gpus: int = typer.Option(1, help="The number of GPUs to use."), + timeout: int = typer.Option(60, help="The timeout for the model in seconds."), + num_threads: int = typer.Option(1, help="The number of threads to use."), + gpu_memory_utilization: float = typer.Option( + 0.9, help="The GPU memory utilization." + ), +): + """ + Run one or more models on a test-category (same as openfunctions_evaluation). + """ + RunArgs = namedtuple( + "RunArgs", + [ + "model", + "test_category", + "api_sanity_check", + "temperature", + "top_p", + "max_tokens", + "num_gpus", + "timeout", + "num_threads", + "gpu_memory_utilization", + ], + ) + + openfunctions_main( + RunArgs( + model=model, + test_category=test_category, + api_sanity_check=api_sanity_check, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + num_gpus=num_gpus, + timeout=timeout, + num_threads=num_threads, + gpu_memory_utilization=gpu_memory_utilization, + ) + ) + + +@cli.command() +def evaluate( + model: List[str] = typer.Option(..., help="A list of model names to evaluate."), + test_category: List[str] = typer.Option( + ..., help="A list of test categories to run the evaluation on." + ), + api_sanity_check: bool = typer.Option( + False, + "--api-sanity-check", + "-c", + help="Perform the REST API status sanity check before running the evaluation.", + ), +): + """ + Evaluate results from run of one or more models on a test-category (same as eval_runner). 
+ """ + # todo: make these params eval_runner_main + eval_runner.INPUT_PATH = "./result/" + eval_runner.PROMPT_PATH = "./data/" + eval_runner.POSSIBLE_ANSWER_PATH = "./data/possible_answer/" + eval_runner.OUTPUT_PATH = "./score/" + + # todo: change the eval_runner to not depend on OPENAI_API_KEY + os.environ["OPENAI_API_KEY"] = "BOGUS" + + eval_runner.main(model, test_category, api_sanity_check) + + +if __name__ == "__main__": + cli() diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index 922acfd7e..6a5358ad0 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -27,8 +27,12 @@ dependencies = [ "mistralai==0.4.2", "anthropic==0.31.1", "cohere==5.5.8", + "typer>=0.12.5", ] +[project.scripts] +bfcl = "bfcl.__main__:cli" + [tool.setuptools.packages.find] include = ["bfcl*"] From 2c71926b8530e5746b27d3e79fe8bf0adc413ff9 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 4 Sep 2024 13:36:47 -0400 Subject: [PATCH 06/16] order commands in help listing --- berkeley-function-call-leaderboard/bfcl/__main__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index b140e6142..7650e95d9 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -5,9 +5,19 @@ from bfcl.eval_checker import eval_runner import os + +class ExecutionOrderGroup(typer.core.TyperGroup): + def list_commands(self, ctx): + return [ + "run", + "evaluate", + ] + + cli = typer.Typer( context_settings=dict(help_option_names=["-h", "--help"]), no_args_is_help=True, + cls=ExecutionOrderGroup, ) From 70ad2cc70fba05e82a6cce9fa43d3c43b38c1527 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sun, 22 Sep 2024 19:56:57 -0400 Subject: [PATCH 07/16] add commands: test-categories, models, results, scores --- .../bfcl/__main__.py | 103 ++++++++++++++++++ .../pyproject.toml | 1 + 2 files changed, 104 insertions(+) diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index 7650e95d9..07b44d48a 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -3,14 +3,24 @@ import typer from bfcl._openfunctions_evaluation import main as openfunctions_main from bfcl.eval_checker import eval_runner +from bfcl.eval_checker.eval_runner_helper import MODEL_METADATA_MAPPING +from bfcl.eval_checker.eval_checker_constant import TEST_COLLECTION_MAPPING import os +from pathlib import Path +from datetime import datetime +from tabulate import tabulate +import csv class ExecutionOrderGroup(typer.core.TyperGroup): def list_commands(self, ctx): return [ + "models", + "test-categories", "run", + "results", "evaluate", + "scores", ] @@ -21,6 +31,32 @@ def list_commands(self, ctx): ) +@cli.command() +def test_categories(): + """ + List available test categories. + """ + table = tabulate( + [(category, "\n".join(test for test in tests)) for category, tests in TEST_COLLECTION_MAPPING.items()], + headers=["Test category", "Test names"], + tablefmt="grid", + ) + print(table) + + +@cli.command() +def models(): + """ + List available models. 
+ """ + table = tabulate( + [[model] for model in MODEL_METADATA_MAPPING], + tablefmt="plain", + colalign=("left",), + ) + print(table) + + @cli.command() def run( model: List[str] = typer.Option( @@ -84,6 +120,53 @@ def run( ) +@cli.command() +def results(): + """ + List the results available for evaluation. + """ + + def display_name(name: str): + """ + Undo the / -> _ transformation if it happened. + + Args: + name (str): The name of the model in the result directory. + + Returns: + str: The original name of the model. + """ + if name not in MODEL_METADATA_MAPPING: + candidate = name.replace("_", "/") + if candidate in MODEL_METADATA_MAPPING: + return candidate + print(f"Unknown model name: {name}") + return name + + result_dir = Path("./result") # todo: make this configurable + if not result_dir.exists(): + print("No results available.") + return + + results_data = [] + for dir in result_dir.iterdir(): + results_data.append( + ( + display_name(dir.name), + datetime.fromtimestamp(dir.stat().st_ctime).strftime( + "%Y-%m-%d %H:%M:%S" + ), + ) + ) + print( + tabulate( + results_data, + headers=["Model name", "Creation time"], + tablefmt="pretty", + ) + ) + + @cli.command() def evaluate( model: List[str] = typer.Option(..., help="A list of model names to evaluate."), @@ -112,5 +195,25 @@ def evaluate( eval_runner.main(model, test_category, api_sanity_check) +@cli.command() +def scores(): + """ + Display the leaderboard. + """ + def truncate(text, length=22): + return (text[:length] + '...') if len(text) > length else text + + # files = ["./score/data_non_live.csv", "./score/data_live.csv", "./score/data_combined.csv"] + files = ["./score/data_combined.csv"] # todo: make ./score configurable + for file in files: + if os.path.exists(file): + with open(file, newline='') as csvfile: + reader = csv.reader(csvfile) + headers = [truncate(header) for header in next(reader)] # Read the header row + data = [[truncate(cell) for cell in row] for row in reader] # Read the rest of the data + print(tabulate(data, headers=headers, tablefmt='grid')) + else: + print(f"\nFile {file} not found.\n") + if __name__ == "__main__": cli() diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index 6a5358ad0..bff1ed5b8 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "anthropic==0.31.1", "cohere==5.5.8", "typer>=0.12.5", + "tabulate>=0.9.0", ] [project.scripts] From 384c3ab8c39eb3fbafc61a6d6a31fd7ff4bd0dbe Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Sun, 22 Sep 2024 18:01:07 -0700 Subject: [PATCH 08/16] update import path --- berkeley-function-call-leaderboard/bfcl/__main__.py | 7 ++----- .../bfcl/_openfunctions_evaluation.py | 3 --- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index 07b44d48a..8e332c9af 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -4,7 +4,7 @@ from bfcl._openfunctions_evaluation import main as openfunctions_main from bfcl.eval_checker import eval_runner from bfcl.eval_checker.eval_runner_helper import MODEL_METADATA_MAPPING -from bfcl.eval_checker.eval_checker_constant import TEST_COLLECTION_MAPPING +from bfcl.constant import TEST_COLLECTION_MAPPING import os from pathlib import Path from datetime import datetime @@ -189,9 
+189,6 @@ def evaluate( eval_runner.POSSIBLE_ANSWER_PATH = "./data/possible_answer/" eval_runner.OUTPUT_PATH = "./score/" - # todo: change the eval_runner to not depend on OPENAI_API_KEY - os.environ["OPENAI_API_KEY"] = "BOGUS" - eval_runner.main(model, test_category, api_sanity_check) @@ -204,7 +201,7 @@ def truncate(text, length=22): return (text[:length] + '...') if len(text) > length else text # files = ["./score/data_non_live.csv", "./score/data_live.csv", "./score/data_combined.csv"] - files = ["./score/data_combined.csv"] # todo: make ./score configurable + files = ["./score/data_overall.csv"] # todo: make ./score configurable for file in files: if os.path.exists(file): with open(file, newline='') as csvfile: diff --git a/berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py b/berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py index 8ea874e54..614d2f2e5 100644 --- a/berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py +++ b/berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py @@ -8,7 +8,6 @@ from bfcl.constant import TEST_COLLECTION_MAPPING, TEST_FILE_MAPPING from bfcl.model_handler.handler_map import handler_map from bfcl.model_handler.model_style import ModelStyle -from dotenv import load_dotenv from tqdm import tqdm RETRY_LIMIT = 3 @@ -194,8 +193,6 @@ def generate_results(args, model_name, test_cases_total): def main(args): - load_dotenv(dotenv_path="./.env", verbose=True, override=True) # Load the .env file - if type(args.model) is not list: args.model = [args.model] if type(args.test_category) is not list: From edbbb48f94d7af4d6834f36f2db1619b75d1f455 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Tue, 8 Oct 2024 19:05:14 -0700 Subject: [PATCH 09/16] update code to be in sync with latest pipeline --- .../bfcl/__main__.py | 72 ++++++++----------- ...luation.py => _llm_response_generation.py} | 5 +- .../bfcl/eval_checker/eval_runner.py | 1 + .../bfcl/eval_checker/eval_runner_helper.py | 4 +- .../bfcl/model_handler/handler_map.py | 2 +- .../openfunctions_evaluation.py | 13 ++-- 6 files changed, 44 insertions(+), 53 deletions(-) rename berkeley-function-call-leaderboard/bfcl/{_openfunctions_evaluation.py => _llm_response_generation.py} (97%) diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index 8e332c9af..47f6c75be 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -1,15 +1,15 @@ -from typing import List +import csv from collections import namedtuple +from datetime import datetime +from typing import List + import typer -from bfcl._openfunctions_evaluation import main as openfunctions_main +from bfcl._llm_response_generation import main as generation_main +from bfcl.constant import DOTENV_PATH, RESULT_PATH, SCORE_PATH, TEST_COLLECTION_MAPPING from bfcl.eval_checker import eval_runner -from bfcl.eval_checker.eval_runner_helper import MODEL_METADATA_MAPPING -from bfcl.constant import TEST_COLLECTION_MAPPING -import os -from pathlib import Path -from datetime import datetime +from bfcl.model_handler.handler_map import HANDLER_MAP +from dotenv import load_dotenv from tabulate import tabulate -import csv class ExecutionOrderGroup(typer.core.TyperGroup): @@ -17,9 +17,9 @@ def list_commands(self, ctx): return [ "models", "test-categories", - "run", + "generation", "results", - "evaluate", + "evaluation", "scores", ] @@ -50,7 +50,7 @@ def models(): List available models. 
""" table = tabulate( - [[model] for model in MODEL_METADATA_MAPPING], + [[model] for model in HANDLER_MAP.keys()], tablefmt="plain", colalign=("left",), ) @@ -58,7 +58,7 @@ def models(): @cli.command() -def run( +def generation( model: List[str] = typer.Option( ["gorilla-openfunctions-v2"], help="A list of model names to evaluate." ), @@ -74,46 +74,40 @@ def run( temperature: float = typer.Option( 0.001, help="The temperature parameter for the model." ), - top_p: float = typer.Option(1.0, help="The top-p parameter for the model."), - max_tokens: int = typer.Option( - 1200, help="The maximum number of tokens for the model." + include_debugging_log: bool = typer.Option( + False, help="Include debugging log in the response file to see model's interaction with the state machine." ), num_gpus: int = typer.Option(1, help="The number of GPUs to use."), - timeout: int = typer.Option(60, help="The timeout for the model in seconds."), num_threads: int = typer.Option(1, help="The number of threads to use."), gpu_memory_utilization: float = typer.Option( 0.9, help="The GPU memory utilization." ), ): """ - Run one or more models on a test-category (same as openfunctions_evaluation). + Generate the LLM response for one or more models on a test-category (same as openfunctions_evaluation.py). """ - RunArgs = namedtuple( - "RunArgs", + generationArgs = namedtuple( + "generationArgs", [ "model", "test_category", "api_sanity_check", "temperature", - "top_p", - "max_tokens", + "include_debugging_log", "num_gpus", - "timeout", "num_threads", "gpu_memory_utilization", ], ) - openfunctions_main( - RunArgs( + generation_main( + generationArgs( model=model, test_category=test_category, api_sanity_check=api_sanity_check, temperature=temperature, - top_p=top_p, - max_tokens=max_tokens, + include_debugging_log=include_debugging_log, num_gpus=num_gpus, - timeout=timeout, num_threads=num_threads, gpu_memory_utilization=gpu_memory_utilization, ) @@ -136,17 +130,14 @@ def display_name(name: str): Returns: str: The original name of the model. """ - if name not in MODEL_METADATA_MAPPING: + if name not in HANDLER_MAP: candidate = name.replace("_", "/") - if candidate in MODEL_METADATA_MAPPING: + if candidate in HANDLER_MAP: return candidate print(f"Unknown model name: {name}") return name - result_dir = Path("./result") # todo: make this configurable - if not result_dir.exists(): - print("No results available.") - return + result_dir = RESULT_PATH results_data = [] for dir in result_dir.iterdir(): @@ -158,6 +149,7 @@ def display_name(name: str): ), ) ) + print( tabulate( results_data, @@ -168,7 +160,7 @@ def display_name(name: str): @cli.command() -def evaluate( +def evaluation( model: List[str] = typer.Option(..., help="A list of model names to evaluate."), test_category: List[str] = typer.Option( ..., help="A list of test categories to run the evaluation on." @@ -183,11 +175,6 @@ def evaluate( """ Evaluate results from run of one or more models on a test-category (same as eval_runner). 
""" - # todo: make these params eval_runner_main - eval_runner.INPUT_PATH = "./result/" - eval_runner.PROMPT_PATH = "./data/" - eval_runner.POSSIBLE_ANSWER_PATH = "./data/possible_answer/" - eval_runner.OUTPUT_PATH = "./score/" eval_runner.main(model, test_category, api_sanity_check) @@ -200,10 +187,10 @@ def scores(): def truncate(text, length=22): return (text[:length] + '...') if len(text) > length else text - # files = ["./score/data_non_live.csv", "./score/data_live.csv", "./score/data_combined.csv"] - files = ["./score/data_overall.csv"] # todo: make ./score configurable + # files = ["./score/data_non_live.csv", "./score/data_live.csv", "./score/data_overall.csv"] + files = [SCORE_PATH / "data_overall.csv"] for file in files: - if os.path.exists(file): + if file.exists(): with open(file, newline='') as csvfile: reader = csv.reader(csvfile) headers = [truncate(header) for header in next(reader)] # Read the header row @@ -213,4 +200,5 @@ def truncate(text, length=22): print(f"\nFile {file} not found.\n") if __name__ == "__main__": + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file cli() diff --git a/berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py b/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py similarity index 97% rename from berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py rename to berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py index f0e0a4f10..03af01fe0 100644 --- a/berkeley-function-call-leaderboard/bfcl/_openfunctions_evaluation.py +++ b/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py @@ -14,7 +14,7 @@ TEST_FILE_MAPPING, ) from bfcl.eval_checker.eval_runner_helper import is_executable -from bfcl.model_handler.handler_map import handler_map +from bfcl.model_handler.handler_map import HANDLER_MAP from bfcl.model_handler.model_style import ModelStyle from dotenv import load_dotenv from tqdm import tqdm @@ -46,7 +46,7 @@ def get_args(): def build_handler(model_name, temperature): - handler = handler_map[model_name](model_name, temperature) + handler = HANDLER_MAP[model_name](model_name, temperature) return handler @@ -195,7 +195,6 @@ def generate_results(args, model_name, test_cases_total): def main(args): - load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file if type(args.model) is not list: args.model = [args.model] diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py index 0b9ab3089..494db92f6 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -664,6 +664,7 @@ def main(model, test_category, api_sanity_check): ) args = parser.parse_args() + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file main(args.model, args.test_category, args.api_sanity_check) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index bff3d2bc8..5393a5a82 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -11,7 +11,7 @@ from bfcl.eval_checker.constant import * from bfcl.eval_checker.executable_eval.custom_exception import BadAPIStatusError from 
bfcl.eval_checker.model_metadata import * -from bfcl.model_handler.handler_map import handler_map +from bfcl.model_handler.handler_map import HANDLER_MAP from tqdm import tqdm @@ -77,7 +77,7 @@ def load_file(file_path): def get_handler(model_name): - return handler_map[model_name](model_name, temperature=0) #Temperature doesn't matter for evaluation + return HANDLER_MAP[model_name](model_name, temperature=0) #Temperature doesn't matter for evaluation def write_list_of_dicts_to_file(filename, data, subdir=None): diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py index be5ea356e..e8cfa36f9 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py @@ -142,4 +142,4 @@ # "deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler, } -handler_map = {**api_inference_handler_map, **local_inference_handler_map} +HANDLER_MAP = {**api_inference_handler_map, **local_inference_handler_map} diff --git a/berkeley-function-call-leaderboard/openfunctions_evaluation.py b/berkeley-function-call-leaderboard/openfunctions_evaluation.py index 50b04d1ec..d8bc79904 100644 --- a/berkeley-function-call-leaderboard/openfunctions_evaluation.py +++ b/berkeley-function-call-leaderboard/openfunctions_evaluation.py @@ -1,8 +1,11 @@ -from bfcl._openfunctions_evaluation import get_args, main +from bfcl._llm_response_generation import get_args, main +from bfcl.constant import DOTENV_PATH from dotenv import load_dotenv +# Note: This file is still kept for compatibility with the old structure of the codebase. +# It is recommended to use the new `bfcl xxx` cli commands instead. +# We will remove this in the next major release. if __name__ == "__main__": - # TODO: Should we load the .env file here? - load_dotenv(dotenv_path="./.env", verbose=True, override=True) - - main(get_args()) \ No newline at end of file + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file + + main(get_args()) From f5c597a51d8248c12eedf178e1316b6facab93e7 Mon Sep 17 00:00:00 2001 From: Huanzhi Mao Date: Thu, 10 Oct 2024 18:51:08 -0700 Subject: [PATCH 10/16] fix arguments for eval_runner; should be optional --- berkeley-function-call-leaderboard/bfcl/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index 47f6c75be..cb5f6a5e2 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -161,9 +161,9 @@ def display_name(name: str): @cli.command() def evaluation( - model: List[str] = typer.Option(..., help="A list of model names to evaluate."), + model: List[str] = typer.Option(None, help="A list of model names to evaluate."), test_category: List[str] = typer.Option( - ..., help="A list of test categories to run the evaluation on." + None, help="A list of test categories to run the evaluation on." ), api_sanity_check: bool = typer.Option( False, @@ -173,7 +173,7 @@ def evaluation( ), ): """ - Evaluate results from run of one or more models on a test-category (same as eval_runner). + Evaluate results from run of one or more models on a test-category (same as eval_runner.py). 
""" eval_runner.main(model, test_category, api_sanity_check) From d531e956cc9c397f6a84115ed3d560b18ba0f221 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Thu, 10 Oct 2024 20:09:53 -0700 Subject: [PATCH 11/16] transpose the score table for better visual --- .../bfcl/__main__.py | 52 ++++++++++++++----- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index cb5f6a5e2..b7698dc39 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -37,7 +37,10 @@ def test_categories(): List available test categories. """ table = tabulate( - [(category, "\n".join(test for test in tests)) for category, tests in TEST_COLLECTION_MAPPING.items()], + [ + (category, "\n".join(test for test in tests)) + for category, tests in TEST_COLLECTION_MAPPING.items() + ], headers=["Test category", "Test names"], tablefmt="grid", ) @@ -75,7 +78,8 @@ def generation( 0.001, help="The temperature parameter for the model." ), include_debugging_log: bool = typer.Option( - False, help="Include debugging log in the response file to see model's interaction with the state machine." + False, + help="Include debugging log in the response file to see model's interaction with the state machine.", ), num_gpus: int = typer.Option(1, help="The number of GPUs to use."), num_threads: int = typer.Option(1, help="The number of threads to use."), @@ -184,20 +188,42 @@ def scores(): """ Display the leaderboard. """ + def truncate(text, length=22): - return (text[:length] + '...') if len(text) > length else text + return (text[:length] + "...") if len(text) > length else text # files = ["./score/data_non_live.csv", "./score/data_live.csv", "./score/data_overall.csv"] - files = [SCORE_PATH / "data_overall.csv"] - for file in files: - if file.exists(): - with open(file, newline='') as csvfile: - reader = csv.reader(csvfile) - headers = [truncate(header) for header in next(reader)] # Read the header row - data = [[truncate(cell) for cell in row] for row in reader] # Read the rest of the data - print(tabulate(data, headers=headers, tablefmt='grid')) - else: - print(f"\nFile {file} not found.\n") + file = SCORE_PATH / "data_overall.csv" + + hidden_columns = [ + "Model Link", + "Cost ($ Per 1k Function Calls)", + "Latency Mean (s)", + "Latency Standard Deviation (s)", + "Latency 95th Percentile (s)", + "Organization", + "License", + ] + + if file.exists(): + with open(file, newline="") as csvfile: + reader = csv.reader(csvfile) + headers = next(reader) # Read the header row + column_indices = [ + i for i, header in enumerate(headers) if header not in hidden_columns + ] + filtered_headers = [headers[i] for i in column_indices] + data = [ + [row[i] for i in column_indices] for row in reader + ] # Read the rest of the data + model_names = [row[2] for row in data] # The model name will be used as the row header + data = [row[:2] + row[3:] for row in data] # Remove the model name from the data + filtered_headers.remove("Model") + transposed_data = list(zip(filtered_headers, *data)) + print(tabulate(transposed_data, headers=model_names, tablefmt="grid")) + else: + print(f"\nFile {file} not found.\n") + if __name__ == "__main__": load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file From 05cfca7c6a1c525972e4fbbb12df09a2e463f140 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Thu, 10 Oct 2024 20:10:14 -0700 Subject: 
[PATCH 12/16] update README accordingly for the CLI commands --- berkeley-function-call-leaderboard/CHANGELOG.md | 1 + berkeley-function-call-leaderboard/README.md | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md index 31b464a9a..81cda45f4 100644 --- a/berkeley-function-call-leaderboard/CHANGELOG.md +++ b/berkeley-function-call-leaderboard/CHANGELOG.md @@ -2,6 +2,7 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file. +- [Oct 10, 2024] [#621](https://github.com/ShishirPatil/gorilla/pull/621), [#675](https://github.com/ShishirPatil/gorilla/pull/675): Add a basic command-line interface for ease of use. - [Oct 5, 2024] [#642](https://github.com/ShishirPatil/gorilla/pull/642): Add the following new models to the leaderboard: - `Qwen/Qwen2.5-7B-Instruct` - `Qwen/Qwen2.5-1.5B-Instruct` diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md index 79257ca82..b86eed0c0 100644 --- a/berkeley-function-call-leaderboard/README.md +++ b/berkeley-function-call-leaderboard/README.md @@ -101,7 +101,7 @@ If decided to run locally-hosted model, the generation script uses vLLM and ther Use the following command for LLM inference of the evaluation dataset with specific models. ```bash -python openfunctions_evaluation.py --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1 +bfcl generation --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1 ``` You can optionally specify the number of threads to use for _parallel inference_ by setting the `--num-threads` flag to speed up inference for **hosted models**, not applicable for OSS models. @@ -112,7 +112,7 @@ If no `MODEL_NAME` is provided, the model `gorilla-openfunctions-v2` will be use ### Models Available -Below is _a table of models we support_ to run our leaderboard evaluation against. If the models support function calling (FC), we will follow its function calling format provided by official documentation. Otherwise, we use a consistent system message to prompt the model to generate function calls in the right format. +Below is _a table of models we support_ to run our leaderboard evaluation against. If the models support function calling (FC), we will follow its function calling format provided by official documentation. Otherwise, we use a consistent system message to prompt the model to generate function calls in the right format. You can also use `bfcl models` command to list out all available models. |Model | Type | |---|---| @@ -196,7 +196,7 @@ For `Databrick-DBRX-instruct`, you need to create a Databrick Azure workspace an In the following two sections, the optional `--test-category` parameter can be used to specify the category of tests to run. You can specify multiple categories separated by spaces. Available options include: -- Available test groups: +- Available test groups (you can also use `bfcl test-categories` command to see): - `all`: All test categories. - This is the default option if no test category is provided. - `multi_turn`: All multi-turn test categories. @@ -247,7 +247,7 @@ In the following two sections, the optional `--test-category` parameter can be u Navigate to the `gorilla/berkeley-function-call-leaderboard/bfcl/eval_checker` directory and run the `eval_runner.py` script with the desired parameters.
The basic syntax is as follows: ```bash -python eval_runner.py --model MODEL_NAME --test-category TEST_CATEGORY +bfcl evaluation --model MODEL_NAME --test-category TEST_CATEGORY ``` For available options for `MODEL_NAME` and `TEST_CATEGORY`, please refer to the [Models Available](#models-available) and [Available Test Category](#available-test-category) section. @@ -259,25 +259,25 @@ If no `MODEL_NAME` is provided, all available model results will be evaluated by If you want to run all tests for the `gorilla-openfunctions-v2` model, you can use the following command: ```bash -python eval_runner.py --model gorilla-openfunctions-v2 +bfcl evaluation --model gorilla-openfunctions-v2 ``` If you want to evaluate all offline tests (do not require RapidAPI keys) for OpenAI GPT-3.5, you can use the following command: ```bash -python eval_runner.py --model gpt-3.5-turbo-0125 --test-category ast +bfcl evaluation --model gpt-3.5-turbo-0125 --test-category ast ``` If you want to run the `rest` tests for a few Claude models, you can use the following command: ```bash -python eval_runner.py --model claude-3-5-sonnet-20240620 claude-3-opus-20240229 claude-3-sonnet-20240229 --test-category rest +bfcl evaluation --model claude-3-5-sonnet-20240620 claude-3-opus-20240229 claude-3-sonnet-20240229 --test-category rest ``` If you want to run `live_simple` and `javascript` tests for a few models and `gorilla-openfunctions-v2`, you can use the following command: ```bash -python eval_runner.py --model gorilla-openfunctions-v2 claude-3-5-sonnet-20240620 gpt-4-0125-preview gemini-1.5-pro-preview-0514 --test-category live_simple javascript +bfcl evaluation --model gorilla-openfunctions-v2 claude-3-5-sonnet-20240620 gpt-4-0125-preview gemini-1.5-pro-preview-0514 --test-category live_simple javascript ``` ### Model-Specific Optimization From f5a6d6cd2cccd9c7a1d72bf24e40de9b34ee9349 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Thu, 10 Oct 2024 20:15:04 -0700 Subject: [PATCH 13/16] rename 'run' to 'generate' --- berkeley-function-call-leaderboard/README.md | 12 ++++++------ berkeley-function-call-leaderboard/bfcl/__main__.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md index b86eed0c0..0a3c3920c 100644 --- a/berkeley-function-call-leaderboard/README.md +++ b/berkeley-function-call-leaderboard/README.md @@ -101,7 +101,7 @@ If decided to run locally-hosted model, the generation script uses vLLM and ther Use the following command for LLM inference of the evaluation dataset with specific models. ```bash -bfcl generation --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1 +bfcl generate --model MODEL_NAME --test-category TEST_CATEGORY --num-threads 1 ``` You can optionally specify the number of threads to use for _parallel inference_ by setting the `--num-threads` flag to speed up inference for **hosted models**, not applicable for OSS models. @@ -247,7 +247,7 @@ In the following two sections, the optional `--test-category` parameter can be u Navigate to the `gorilla/berkeley-function-call-leaderboard/bfcl/eval_checker` directory and run the `eval_runner.py` script with the desired parameters. 
The basic syntax is as follows: ```bash -bfcl evaluation --model MODEL_NAME --test-category TEST_CATEGORY +bfcl evaluate --model MODEL_NAME --test-category TEST_CATEGORY ``` For available options for `MODEL_NAME` and `TEST_CATEGORY`, please refer to the [Models Available](#models-available) and [Available Test Category](#available-test-category) section. @@ -259,25 +259,25 @@ If no `MODEL_NAME` is provided, all available model results will be evaluated by If you want to run all tests for the `gorilla-openfunctions-v2` model, you can use the following command: ```bash -bfcl evaluation --model gorilla-openfunctions-v2 +bfcl evaluate --model gorilla-openfunctions-v2 ``` If you want to evaluate all offline tests (do not require RapidAPI keys) for OpenAI GPT-3.5, you can use the following command: ```bash -bfcl evaluation --model gpt-3.5-turbo-0125 --test-category ast +bfcl evaluate --model gpt-3.5-turbo-0125 --test-category ast ``` If you want to run the `rest` tests for a few Claude models, you can use the following command: ```bash -bfcl evaluation --model claude-3-5-sonnet-20240620 claude-3-opus-20240229 claude-3-sonnet-20240229 --test-category rest +bfcl evaluate --model claude-3-5-sonnet-20240620 claude-3-opus-20240229 claude-3-sonnet-20240229 --test-category rest ``` If you want to run `live_simple` and `javascript` tests for a few models and `gorilla-openfunctions-v2`, you can use the following command: ```bash -bfcl evaluation --model gorilla-openfunctions-v2 claude-3-5-sonnet-20240620 gpt-4-0125-preview gemini-1.5-pro-preview-0514 --test-category live_simple javascript +bfcl evaluate --model gorilla-openfunctions-v2 claude-3-5-sonnet-20240620 gpt-4-0125-preview gemini-1.5-pro-preview-0514 --test-category live_simple javascript ``` ### Model-Specific Optimization diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index b7698dc39..41dfe4fef 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -17,9 +17,9 @@ def list_commands(self, ctx): return [ "models", "test-categories", - "generation", + "generate", "results", - "evaluation", + "evaluate", "scores", ] @@ -61,7 +61,7 @@ def models(): @cli.command() -def generation( +def generate( model: List[str] = typer.Option( ["gorilla-openfunctions-v2"], help="A list of model names to evaluate." ), @@ -164,7 +164,7 @@ def display_name(name: str): @cli.command() -def evaluation( +def evaluate( model: List[str] = typer.Option(None, help="A list of model names to evaluate."), test_category: List[str] = typer.Option( None, help="A list of test categories to run the evaluation on." 
From 8275fe3f1f4c781eb784373591c00768bb7ae25e Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Thu, 10 Oct 2024 20:17:20 -0700 Subject: [PATCH 14/16] ignore .DS_Store in results command output --- berkeley-function-call-leaderboard/bfcl/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index 41dfe4fef..4d2c72792 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -145,6 +145,10 @@ def display_name(name: str): results_data = [] for dir in result_dir.iterdir(): + # Check if it is a directory and not a file + if not dir.is_dir(): + continue + results_data.append( ( display_name(dir.name), From aee5587ae08ecaa4549e9904ed12f9e08cfb14cf Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Thu, 10 Oct 2024 22:25:17 -0700 Subject: [PATCH 15/16] better display score table --- .../bfcl/__main__.py | 33 +++++++++---------- .../bfcl/_apply_function_credential_config.py | 1 + 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/__main__.py b/berkeley-function-call-leaderboard/bfcl/__main__.py index 4d2c72792..6a75b8586 100644 --- a/berkeley-function-call-leaderboard/bfcl/__main__.py +++ b/berkeley-function-call-leaderboard/bfcl/__main__.py @@ -104,6 +104,7 @@ def generate( ], ) + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file generation_main( generationArgs( model=model, @@ -184,6 +185,7 @@ def evaluate( Evaluate results from run of one or more models on a test-category (same as eval_runner.py). """ + load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file eval_runner.main(model, test_category, api_sanity_check) @@ -199,36 +201,31 @@ def truncate(text, length=22): # files = ["./score/data_non_live.csv", "./score/data_live.csv", "./score/data_overall.csv"] file = SCORE_PATH / "data_overall.csv" - hidden_columns = [ - "Model Link", - "Cost ($ Per 1k Function Calls)", - "Latency Mean (s)", - "Latency Standard Deviation (s)", - "Latency 95th Percentile (s)", - "Organization", - "License", + selected_columns = [ + "Rank", + "Model", + "Overall Acc", + "Non-Live AST Acc", + "Non-Live Exec Acc", + "Live Acc", + "Multi Turn Acc", + "Relevance Detection", + "Irrelevance Detection", ] if file.exists(): with open(file, newline="") as csvfile: reader = csv.reader(csvfile) headers = next(reader) # Read the header row - column_indices = [ - i for i, header in enumerate(headers) if header not in hidden_columns - ] - filtered_headers = [headers[i] for i in column_indices] + column_indices = [headers.index(col) for col in selected_columns] data = [ [row[i] for i in column_indices] for row in reader ] # Read the rest of the data - model_names = [row[2] for row in data] # The model name will be used as the row header - data = [row[:2] + row[3:] for row in data] # Remove the model name from the data - filtered_headers.remove("Model") - transposed_data = list(zip(filtered_headers, *data)) - print(tabulate(transposed_data, headers=model_names, tablefmt="grid")) + selected_columns = selected_columns[:-2] + ["Relevance", "Irrelevance"] # Shorten the column names + print(tabulate(data, headers=selected_columns, tablefmt="grid")) else: print(f"\nFile {file} not found.\n") if __name__ == "__main__": - load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file cli() diff --git 
a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py index 0cc282144..f30539863 100644 --- a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py +++ b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py @@ -72,6 +72,7 @@ def process_dir(input_dir, output_dir): def apply_function_credential_config(input_path=None, output_path=None): # Load the actual API keys, and verify that they are present for var in ENV_VARS: + # print(var in os.environ, os.getenv(var)) if var not in os.environ or not os.getenv(var): raise NoAPIKeyError() PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var) From 285b9034e898edfb6239b6b0a334b8e9420d5968 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Thu, 10 Oct 2024 22:25:58 -0700 Subject: [PATCH 16/16] clean up --- .../bfcl/_apply_function_credential_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py index f30539863..0cc282144 100644 --- a/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py +++ b/berkeley-function-call-leaderboard/bfcl/_apply_function_credential_config.py @@ -72,7 +72,6 @@ def process_dir(input_dir, output_dir): def apply_function_credential_config(input_path=None, output_path=None): # Load the actual API keys, and verify that they are present for var in ENV_VARS: - # print(var in os.environ, os.getenv(var)) if var not in os.environ or not os.getenv(var): raise NoAPIKeyError() PLACEHOLDERS[f"YOUR-{var.replace('_', '-')}"] = os.getenv(var)
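
Taken together, patches 12-16 leave the leaderboard with a small Typer-based CLI: `bfcl generate` for inference, `bfcl evaluate` for scoring, `bfcl results` for listing result folders (now skipping stray files such as `.DS_Store`), and `bfcl scores` for a trimmed score table. The sketch below is a minimal, self-contained illustration of that command shape only, not the package's actual implementation: the command names follow the diffs above, while the paths, option lists, CSV column names, and helper behavior are assumptions made for the example.

```python
# Illustrative sketch only -- command names follow the patches above; the
# paths, options, and CSV columns are assumptions, not the real bfcl internals.
import csv
from pathlib import Path
from typing import List

import typer
from tabulate import tabulate

cli = typer.Typer()

RESULT_PATH = Path("./result")                 # assumed results folder
SCORE_FILE = Path("./score/data_overall.csv")  # assumed score CSV


@cli.command()
def generate(
    model: List[str] = typer.Option(["gorilla-openfunctions-v2"], help="Models to run inference with."),
    test_category: List[str] = typer.Option(["all"], help="Test categories to generate responses for."),
    num_threads: int = typer.Option(1, help="Threads for parallel inference (hosted models only)."),
):
    """Generate model responses for the selected test categories."""
    typer.echo(f"generate: models={model}, categories={test_category}, threads={num_threads}")


@cli.command()
def evaluate(
    model: List[str] = typer.Option(None, help="Models whose results should be evaluated."),
    test_category: List[str] = typer.Option(None, help="Test categories to evaluate."),
):
    """Evaluate previously generated results."""
    typer.echo(f"evaluate: models={model}, categories={test_category}")


@cli.command()
def results():
    """List result directories, skipping plain files such as .DS_Store."""
    if not RESULT_PATH.exists():
        typer.echo(f"No result directory at {RESULT_PATH}.")
        raise typer.Exit(code=1)
    for entry in RESULT_PATH.iterdir():
        if not entry.is_dir():  # ignore .DS_Store and other stray files
            continue
        typer.echo(entry.name)


@cli.command()
def scores():
    """Print a trimmed overall-score table using tabulate."""
    selected = ["Rank", "Model", "Overall Acc"]  # assumed subset of the CSV headers
    if not SCORE_FILE.exists():
        typer.echo(f"File {SCORE_FILE} not found.")
        raise typer.Exit(code=1)
    with SCORE_FILE.open(newline="") as f:
        reader = csv.reader(f)
        headers = next(reader)
        indices = [headers.index(col) for col in selected]
        rows = [[row[i] for i in indices] for row in reader]
    typer.echo(tabulate(rows, headers=selected, tablefmt="grid"))


if __name__ == "__main__":
    cli()
```

With `typer` and `tabulate` installed, saving this as, say, `cli_sketch.py` (a hypothetical file name) and running `python cli_sketch.py results` lists only real result directories, mirroring the `.DS_Store` fix from patch 14.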