From 893c9afeb920a60383815dfe0854d7ae020014f5 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 16:24:30 -0400 Subject: [PATCH] Make `eval_checker` consistent with `main` branch by merging (#496) --- .../bfcl/eval_checker/checker.py | 29 +- .../bfcl/eval_checker/custom_exception.py | 8 +- .../bfcl/eval_checker/eval_runner.py | 535 ++++++++++++++++++ .../bfcl/eval_checker/eval_runner_helper.py | 52 +- .../bfcl/evaluate.py | 518 ----------------- 5 files changed, 607 insertions(+), 535 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py index 7a64bc3bf..ee9562145 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py @@ -1,5 +1,3 @@ -from js_type_converter import js_type_converter -from java_type_converter import java_type_converter from model_handler.constant import ( UNDERSCORE_TO_DOT, JAVA_TYPE_CONVERSION, @@ -12,6 +10,11 @@ import time import json +# We switch to conditional import for the following two imports to avoid unnecessary installations. +# User doesn't need to setup the tree-sitter packages if they are not running the test for that language. +# from js_type_converter import js_type_converter +# from java_type_converter import java_type_converter + PYTHON_TYPE_MAPPING = { "string": str, "integer": int, @@ -362,9 +365,19 @@ def simple_function_checker( nested_type_converted = None if language == "Java": + from java_type_converter import java_type_converter + expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description] if expected_type_description in JAVA_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:java" + return result + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: nested_type = param_details[param]["items"]["type"] nested_type_converted = JAVA_TYPE_CONVERSION[nested_type] @@ -375,9 +388,19 @@ def simple_function_checker( value = java_type_converter(value, expected_type_description) elif language == "JavaScript": + from js_type_converter import js_type_converter + expected_type_converted = JS_TYPE_CONVERSION[expected_type_description] if expected_type_description in JS_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." 
+ ) + result["error_type"] = "type_error:js" + return result + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: nested_type = param_details[param]["items"]["type"] nested_type_converted = JS_TYPE_CONVERSION[nested_type] @@ -945,4 +968,4 @@ def exec_checker(decoded_result: list, func_description: dict, test_category: st func_description["execution_result"][0], func_description["execution_result_type"][0], False, - ) + ) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py index e30fe81c5..3504862d8 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py @@ -1,10 +1,10 @@ class NoAPIKeyError(Exception): def __init__(self): - self.message = "Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." + self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." super().__init__(self.message) class BadAPIStatusError(Exception): - def __init__(self, message): - self.message = message - super().__init__(self.message) \ No newline at end of file + def __init__(self, errors, error_rate): + self.errors = errors + self.error_rate = error_rate \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py new file mode 100644 index 000000000..dd45c5dd4 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -0,0 +1,535 @@ +import sys + +sys.path.append("../") + +from checker import ast_checker, exec_checker, executable_checker_rest +from custom_exception import BadAPIStatusError +from eval_runner_helper import * +from tqdm import tqdm +import argparse + + +# NOTE: This file should be run in the `eval_checker` directory + + +def single_executable_file_runner( + handler, model_result, prompt, model_name, test_category +): + assert len(model_result) == len(prompt) + + result = [] + correct_count = 0 + for i in tqdm(range(len(model_result)), desc="Running tests"): + raw_result = model_result[i]["result"] + try: + decoded_result = handler.decode_execute(raw_result) + except Exception as e: + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [f"Failed to decode executable. {str(e)}"], + "error_type": "executable_decoder:decoder_failed", + "prompt": prompt[i], + "model_result_raw": raw_result, + } + ) + continue + + if "rest" in test_category: + # REST is always single-functioned. Therefore we take the first one and pass it to the REST checker. + if not is_rest_format_output(decoded_result): + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [ + "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." 
+ ], + "error_type": "executable_decoder:rest_wrong_output_format", + "prompt": prompt[i], + "model_result_raw": str(raw_result), + "model_result_decoded": str(decoded_result), + } + ) + continue + + checker_result = executable_checker_rest(decoded_result[0], i) + + else: + if not is_executable_format_output(decoded_result): + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [ + "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." + ], + "error_type": "executable_decoder:wrong_output_format", + "prompt": prompt[i], + "model_result_raw": str(raw_result), + "model_result_decoded": str(decoded_result), + } + ) + continue + + prompt_item = prompt[i] + checker_result = exec_checker(decoded_result, prompt_item, test_category) + + if checker_result["valid"]: + correct_count += 1 + else: + temp = {} + temp["id"] = i + 1 + temp["model_name"] = model_name + temp["test_category"] = test_category + temp["valid"] = checker_result["valid"] + temp["error"] = checker_result["error"] + temp["error_type"] = checker_result["error_type"] + temp["prompt"] = prompt[i] + temp["model_result_raw"] = raw_result + temp["model_result_decoded"] = decoded_result + if "model_executed_output" in checker_result: + temp["model_executed_output"] = checker_result["model_executed_output"] + result.append(temp) + + accuracy = correct_count / len(model_result) + result.insert( + 0, + { + "accuracy": accuracy, + "correct_count": correct_count, + "total_count": len(model_result), + }, + ) + output_file_name = test_category + "_score.json" + output_file_dir = os.path.join(OUTPUT_PATH, model_name) + write_list_of_dicts_to_file(output_file_name, result, output_file_dir) + + return accuracy, len(model_result) + + +def single_relevance_file_runner(handler, model_result, model_name, test_category): + + result = [] + correct_count = 0 + for i in range(len(model_result)): + model_result_item = model_result[i]["result"] + success = False + decoded_result = None + + try: + decoded_result = handler.decode_ast(model_result_item, language="Python") + success = False + if is_empty_output(decoded_result): + success = True + + except Exception as e: + success = True + + if success: + correct_count += 1 + else: + temp = {} + temp["id"] = i + 1 + temp["model_name"] = model_name + temp["test_category"] = test_category + temp["valid"] = success + temp["error"] = [ + f"Valid syntax. Successfully decode AST when it should not." + ] + temp["error_type"] = "relevance_error:decoder_success" + temp["model_result"] = model_result_item + temp["decoded_result"] = decoded_result + + result.append(temp) + + accuracy = correct_count / len(model_result) + result.insert( + 0, + { + "accuracy": accuracy, + "correct_count": correct_count, + "total_count": len(model_result), + }, + ) + output_file_name = test_category + "_score.json" + output_file_dir = os.path.join(OUTPUT_PATH, model_name) + write_list_of_dicts_to_file(output_file_name, result, output_file_dir) + + return accuracy, len(model_result) + + +def single_ast_file_runner( + handler, model_result, prompt, possible_answer, language, test_category, model_name +): + assert ( + len(model_result) == len(prompt) == len(possible_answer) + ), "The length of the model result does not match the length of the prompt or possible answer. Please check the input files for completeness." 
+ + result = [] + correct_count = 0 + for i in range(len(model_result)): + model_result_item = model_result[i]["result"] + prompt_item = prompt[i]["function"] + possible_answer_item = possible_answer[i] + + try: + model_result_item_raw = model_result_item + model_result_item = handler.decode_ast(model_result_item, language) + except Exception as e: + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [f"Invalid syntax. Failed to decode AST. {str(e)}"], + "error_type": "ast_decoder:decoder_failed", + "prompt": prompt[i], + "model_result_raw": model_result_item_raw, + "possible_answer": possible_answer_item, + } + ) + continue + + decoder_output_valid = is_function_calling_format_output(model_result_item) + if not decoder_output_valid: + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [ + "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." + ], + "error_type": "ast_decoder:decoder_wrong_output_format", + "prompt": prompt[i], + "model_result_raw": str(model_result_item_raw), + "model_result_decoded": str(model_result_item), + "possible_answer": possible_answer_item, + } + ) + continue + + checker_result = ast_checker( + prompt_item, + model_result_item, + possible_answer_item, + language, + test_category, + model_name, + ) + + if checker_result["valid"]: + correct_count += 1 + else: + temp = {} + temp["id"] = i + 1 + temp["model_name"] = model_name + temp["test_category"] = test_category + temp["valid"] = checker_result["valid"] + temp["error"] = checker_result["error"] + temp["error_type"] = checker_result["error_type"] + temp["prompt"] = prompt[i] + temp["model_result_raw"] = model_result_item_raw + temp["model_result_decoded"] = model_result_item + temp["possible_answer"] = possible_answer_item + result.append(temp) + + accuracy = correct_count / len(model_result) + result.insert( + 0, + { + "accuracy": accuracy, + "correct_count": correct_count, + "total_count": len(model_result), + }, + ) + output_file_name = test_category + "_score.json" + output_file_dir = os.path.join(OUTPUT_PATH, model_name) + write_list_of_dicts_to_file(output_file_name, result, output_file_dir) + + return accuracy, len(model_result) + + +#### Main runner function #### +def runner(model_names, test_categories, api_sanity_check): + + # A flag to indicate if the API has been tested. + # We should always test the API with ground truth first before running the executable tests. + # Sometimes the API may not be working as expected and we want to catch that before running the evaluation to ensure the results are accurate. + API_TESTED = False + API_STATUS_ERROR_REST = None + API_STATUS_ERROR_EXECUTABLE = None + + # Before running the executable evaluation, we need to get the expected output from the ground truth. + # So we need a list of all the test categories that we have ran the ground truth evaluation on. + # We only get the expected output once for each test category. 
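Concretely, get_executable_expected_output (defined in eval_runner_helper.py) obtains the real-time execution results of the ground-truth calls and writes them back into the prompt file, which exec_checker later reads through the execution_result and execution_result_type fields referenced in checker.py. An illustrative shape of one updated prompt entry; the field values and the result-type label shown here are hypothetical:

    example_prompt_entry = {
        "question": "...",                          # original test question (placeholder)
        "function": [...],                          # original function definitions (placeholder)
        "execution_result": [120.0],                # added from the ground-truth run
        "execution_result_type": ["exact_match"],   # tells exec_checker how to compare
    }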
+ EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] + + # Get a list of all entries in the folder + entries = os.scandir(INPUT_PATH) + + # Filter out the subdirectories + subdirs = [entry.path for entry in entries if entry.is_dir()] + + # Traverse each subdirectory + for subdir in subdirs: + + model_name = subdir.split(INPUT_PATH)[1] + if model_names is not None and model_name not in model_names: + continue + + model_name_escaped = model_name.replace("_", "/") + + files = [ + f + for f in os.listdir(subdir) + if os.path.isfile(os.path.join(subdir, f)) and not f.startswith(".") + ] + # Check if there is only one file and that file is 'result.json' + # If so, this is an OSS model result file and we need to special process it first + if len(files) == 1 and files[0] == "result.json": + result_json_file_path = os.path.join(subdir, "result.json") + oss_file_formatter(result_json_file_path, subdir) + print( + f"Detected OSS model: {model_name}. result.json has been split into individual test category files." + ) + + # Pattern to match JSON files in this subdirectory + json_files_pattern = os.path.join(subdir, "*.json") + + print(f"🦍 Model: {model_name}") + + # Find and process all JSON files in the subdirectory + for model_result_json in glob.glob(json_files_pattern): + + if os.path.basename(model_result_json) == "result.json": + continue + + test_category = extract_after_test(model_result_json) + if test_categories is not None and test_category not in test_categories: + continue + + handler = get_handler(model_name_escaped) + + # We don't evaluate chatable and SQL models in our current leaderboard + if is_chatable(test_category) or is_sql(test_category): + continue + + language = "Python" + if is_java(test_category): + language = "Java" + if is_js(test_category): + language = "JavaScript" + + print(f"πŸ” Running test: {test_category}") + + model_result = load_file(model_result_json) + record_cost_latency(LEADERBOARD_TABLE, model_name, model_result) + + if is_relevance(test_category): + accuracy, total_count = single_relevance_file_runner( + handler, model_result, model_name, test_category + ) + record_result( + LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count + ) + print(f"βœ… Test completed: {test_category}. 
🎯 Accuracy: {accuracy}") + continue + + # Find the corresponding test file + prompt_file = find_file_with_suffix(PROMPT_PATH, test_category) + prompt = load_file(prompt_file) + + if is_executable(test_category): + # We only test the API with ground truth once + if not API_TESTED and api_sanity_check: + print("---- Sanity checking API status ----") + try: + api_status_sanity_check_rest() + except BadAPIStatusError as e: + API_STATUS_ERROR_REST = e + + try: + api_status_sanity_check_executable() + except BadAPIStatusError as e: + API_STATUS_ERROR_EXECUTABLE = e + + display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=True) + print("Continuing evaluation...") + + API_TESTED = True + + if ( + test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN + and not is_rest(test_category) + ): + print( + f"---- Getting real-time execution result from ground truth for {test_category} ----" + ) + get_executable_expected_output(prompt_file) + print( + f"---- Ground truth real-time execution result obtained for {test_category} 🌟 ----" + ) + EXECUTABLE_TEST_CATEGORIES_HAVE_RUN.append(test_category) + # Need to re-load the prompt file after getting the expected output, as the prompt file has been updated + prompt = load_file(prompt_file) + + accuracy, total_count = single_executable_file_runner( + handler, model_result, prompt, model_name, test_category + ) + record_result( + LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count + ) + print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") + + continue + + # Find the corresponding possible answer file + possible_answer_file = find_file_with_suffix( + POSSIBLE_ANSWER_PATH, test_category + ) + possible_answer = load_file(possible_answer_file) + accuracy, total_count = single_ast_file_runner( + handler, + model_result, + prompt, + possible_answer, + language, + test_category, + model_name, + ) + record_result( + LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count + ) + print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") + + # This function reads all the score files from local folder and updates the leaderboard table. + # This is helpful when you only want to run the evaluation for a subset of models and test categories. + update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) + # Write the leaderboard table to a file + generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH) + + # Clean up the executable expected output files + # They should be re-generated the next time the evaluation is run + clean_up_executable_expected_output( + PROMPT_PATH, EXECUTABLE_TEST_CATEGORIES_HAVE_RUN + ) + + display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=False) + + print(f"🏁 Evaluation completed. 
See {os.path.abspath(OUTPUT_PATH + 'data.csv')} for evaluation results.") + + +ARG_PARSE_MAPPING = { + "ast": [ + "simple", + "multiple_function", + "parallel_function", + "parallel_multiple_function", + "java", + "javascript", + "relevance", + ], + "executable": [ + "executable_simple", + "executable_multiple_function", + "executable_parallel_function", + "executable_parallel_multiple_function", + "rest", + ], + "all": [ + "simple", + "multiple_function", + "parallel_function", + "parallel_multiple_function", + "java", + "javascript", + "relevance", + "executable_simple", + "executable_multiple_function", + "executable_parallel_function", + "executable_parallel_multiple_function", + "rest", + ], + "non-python": [ + "java", + "javascript", + ], + "python": [ + "simple", + "multiple_function", + "parallel_function", + "parallel_multiple_function", + "relevance", + "executable_simple", + "executable_multiple_function", + "executable_parallel_function", + "executable_parallel_multiple_function", + "rest", + ], +} + + +INPUT_PATH = "../result/" +PROMPT_PATH = "../data/" +POSSIBLE_ANSWER_PATH = "../data/possible_answer/" +OUTPUT_PATH = "../score/" + +# A dictionary to store the results +# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count +LEADERBOARD_TABLE = {} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process two lists of strings.") + + # Add arguments for two lists of strings + parser.add_argument( + "--model", nargs="+", type=str, help="A list of model names to evaluate" + ) + parser.add_argument( + "--test-category", + nargs="+", + type=str, + help="A list of test categories to run the evaluation on", + ) + parser.add_argument( + "-c", + "--api-sanity-check", + action="store_true", + default=False, # Default value is False, meaning the sanity check is skipped unless the flag is specified + help="Perform the REST API status sanity check before running the evaluation. By default, the sanity check is skipped.", + ) + + args = parser.parse_args() + + api_sanity_check = args.api_sanity_check + test_categories = None + if args.test_category is not None: + test_categories = [] + for test_category in args.test_category: + if test_category in ARG_PARSE_MAPPING: + test_categories.extend(ARG_PARSE_MAPPING[test_category]) + else: + test_categories.append(test_category) + + model_names = args.model + if args.model is not None: + model_names = [] + for model_name in args.model: + # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. + # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). + # We patch it here to avoid confusing the user. 
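A hedged usage example (run from the eval_checker directory, as noted at the top of this file; the category alias comes from ARG_PARSE_MAPPING above, and the model name is one added to eval_runner_helper.py later in this patch):

    python eval_runner.py --model THUDM/glm-4-9b-chat --test-category ast

This expands "ast" through ARG_PARSE_MAPPING into the individual AST categories, and the slash in the model name is rewritten to an underscore just below, so results are read from ../result/THUDM_glm-4-9b-chat/ and scores are written to ../score/THUDM_glm-4-9b-chat/. Adding -c (--api-sanity-check) would additionally verify the live APIs before any executable test category is scored.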
+ model_names.append(model_name.replace("/", "_")) + + runner(model_names, test_categories, api_sanity_check) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index be44faf66..83e1e8917 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -372,6 +372,12 @@ "https://huggingface.co/nvidia/nemotron-4-340b-instruct", "NVIDIA", "nvidia-open-model-license" + ], + "THUDM/glm-4-9b-chat": [ + "GLM-4-9b-Chat (FC)", + "https://huggingface.co/THUDM/glm-4-9b-chat", + "THUDM", + "glm-4" ] } @@ -467,6 +473,7 @@ "meta-llama/Meta-Llama-3-8B-Instruct": 73, "meta-llama/Meta-Llama-3-70B-Instruct": 307, "gorilla-openfunctions-v2": 83, + "THUDM/glm-4-9b-chat": 223 } @@ -479,9 +486,10 @@ "meetkai/functionary-small-v2.4-FC", "snowflake/arctic", "nvidia/nemotron-4-340b-instruct", + "THUDM/glm-4-9b-chat", ] -# Price got from Azure, 22.032 per hour for 8 V100, Pay As You Go Total Price +# Price got from AZure, 22.032 per hour for 8 V100, Pay As You Go Total Price # Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/ V100_x8_PRICE_PER_HOUR = 22.032 @@ -630,9 +638,7 @@ def api_status_sanity_check_rest(): errors.append((data, status)) if correct_count != len(ground_truth_replaced): - [print("Data:", data, "\nError:", status["error"]) for data, status in errors] - error_msg = f"API Status Test Failed for REST Section. {len(ground_truth_replaced) - correct_count} out of {len(ground_truth_replaced)} API behaviors are not as expected. Be careful with executable test category results; they may be inaccurate." - raise BadAPIStatusError(error_msg) + raise BadAPIStatusError(errors, f"{len(ground_truth_replaced) - correct_count} / {len(ground_truth_replaced)}") def api_status_sanity_check_executable(): @@ -656,11 +662,37 @@ def api_status_sanity_check_executable(): errors.append((data, status)) if correct_count != len(ground_truth): - [print("Data:", data, "\nError:", status["error"]) for data, status in errors] - error_msg = f"API Status Test Failed for Executable Section. {len(ground_truth) - correct_count} out of {len(ground_truth)} API behaviors are not as expected. Be careful with executable test category results; they may be inaccurate." - raise BadAPIStatusError(error_msg) - - + raise BadAPIStatusError(errors, f"{len(ground_truth) - correct_count} / {len(ground_truth)}") + + +def display_api_status_error(rest_error, executable_error, display_success=False): + if not rest_error and not executable_error: + if display_success: + print("🟒 All API Status Test Passed!") + return None + + RED_FONT = "\033[91m" + RESET = "\033[0m" + + print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n") + + if rest_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test category (REST). Please contact API provider.\n") + print(f"{rest_error.error_rate} APIs affected:\n") + for data, status in rest_error.errors: + print(f" - Test Case: {data['ground_truth']}") + print(f" Error Type: {status['error_type']}\n") + + if executable_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test categories (Non-REST). 
Please contact API provider.\n") + print(f"{executable_error.error_rate} APIs affected:\n") + for data, status in executable_error.errors: + print(f" - Test Case: {data['ground_truth'][0]}") + print(f" Error Type: {status['error_type']}\n") + + print(f"{RED_FONT}{'-' * 100}\n{RESET}") + + def get_executable_expected_output(prompt_file_path): # Before we run the evaluation, we need to add the "execution_result" field to the prompt file, using the ground truth data. prompt_content = load_file(prompt_file_path) @@ -995,4 +1027,4 @@ def collapse_json_objects(file_path): for obj in objects: json_obj = json.loads(obj) compact_json = json.dumps(json_obj, separators=(",", ":")) - out_file.write(compact_json + "\n") + out_file.write(compact_json + "\n") \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluate.py b/berkeley-function-call-leaderboard/bfcl/evaluate.py index ec0b557c1..e69de29bb 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluate.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluate.py @@ -1,518 +0,0 @@ -import sys - -sys.path.append("../") - -from checker import ast_checker, exec_checker, executable_checker_rest -from eval_runner_helper import * -from tqdm import tqdm -import argparse - - -# NOTE: This file should be run in the `eval_checker` directory - - -def single_executable_file_runner( - handler, model_result, prompt, model_name, test_category -): - assert len(model_result) == len(prompt) - - result = [] - correct_count = 0 - for i in tqdm(range(len(model_result)), desc="Running tests"): - raw_result = model_result[i]["result"] - try: - decoded_result = handler.decode_execute(raw_result) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Failed to decode executable. {str(e)}"], - "error_type": "executable_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": raw_result, - } - ) - continue - - if "rest" in test_category: - # REST is always single-functioned. Therefore we take the first one and pass it to the REST checker. - if not is_rest_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "executable_decoder:rest_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - checker_result = executable_checker_rest(decoded_result[0], i) - - else: - if not is_executable_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." 
- ], - "error_type": "executable_decoder:wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - prompt_item = prompt[i] - checker_result = exec_checker(decoded_result, prompt_item, test_category) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = raw_result - temp["model_result_decoded"] = decoded_result - if "model_executed_output" in checker_result: - temp["model_executed_output"] = checker_result["model_executed_output"] - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_relevance_file_runner(handler, model_result, model_name, test_category): - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - success = False - decoded_result = None - - try: - decoded_result = handler.decode_ast(model_result_item, language="Python") - success = False - if is_empty_output(decoded_result): - success = True - - except Exception as e: - success = True - - if success: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = success - temp["error"] = [ - f"Valid syntax. Successfully decode AST when it should not." - ] - temp["error_type"] = "relevance_error:decoder_success" - temp["model_result"] = model_result_item - temp["decoded_result"] = decoded_result - - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_ast_file_runner( - handler, model_result, prompt, possible_answer, language, test_category, model_name -): - assert ( - len(model_result) == len(prompt) == len(possible_answer) - ), "The length of the model result does not match the length of the prompt or possible answer. Please check the input files for completeness." - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - prompt_item = prompt[i]["function"] - possible_answer_item = possible_answer[i] - - try: - model_result_item_raw = model_result_item - model_result_item = handler.decode_ast(model_result_item, language) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Invalid syntax. Failed to decode AST. 
{str(e)}"], - "error_type": "ast_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": model_result_item_raw, - "possible_answer": possible_answer_item, - } - ) - continue - - decoder_output_valid = is_function_calling_format_output(model_result_item) - if not decoder_output_valid: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "ast_decoder:decoder_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(model_result_item_raw), - "model_result_decoded": str(model_result_item), - "possible_answer": possible_answer_item, - } - ) - continue - - checker_result = ast_checker( - prompt_item, - model_result_item, - possible_answer_item, - language, - test_category, - model_name, - ) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = model_result_item_raw - temp["model_result_decoded"] = model_result_item - temp["possible_answer"] = possible_answer_item - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -#### Main runner function #### -def runner(model_names, test_categories, api_sanity_check): - - # A flag to indicate if the API has been tested. - # We should always test the API with ground truth first before running the executable tests. - # Sometimes the API may not be working as expected and we want to catch that before running the evaluation to ensure the results are accurate. - API_TESTED = False - - # Before running the executable evaluation, we need to get the expected output from the ground truth. - # So we need a list of all the test categories that we have ran the ground truth evaluation on. - # We only get the expected output once for each test category. - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] - - # Get a list of all entries in the folder - entries = os.scandir(INPUT_PATH) - - # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] - - # Traverse each subdirectory - for subdir in subdirs: - - model_name = subdir.split(INPUT_PATH)[1] - if model_names is not None and model_name not in model_names: - continue - - model_name_escaped = model_name.replace("_", "/") - - files = [ - f - for f in os.listdir(subdir) - if os.path.isfile(os.path.join(subdir, f)) and not f.startswith(".") - ] - # Check if there is only one file and that file is 'result.json' - # If so, this is an OSS model result file and we need to special process it first - if len(files) == 1 and files[0] == "result.json": - result_json_file_path = os.path.join(subdir, "result.json") - oss_file_formatter(result_json_file_path, subdir) - print( - f"Detected OSS model: {model_name}. result.json has been split into individual test category files." 
- ) - - # Pattern to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - - print(f"🦍 Model: {model_name}") - - # Find and process all JSON files in the subdirectory - for model_result_json in glob.glob(json_files_pattern): - - if os.path.basename(model_result_json) == "result.json": - continue - - test_category = extract_after_test(model_result_json) - if test_categories is not None and test_category not in test_categories: - continue - - handler = get_handler(model_name_escaped) - - # We don't evaluate chatable and SQL models in our current leaderboard - if is_chatable(test_category) or is_sql(test_category): - continue - - language = "Python" - if is_java(test_category): - language = "Java" - if is_js(test_category): - language = "JavaScript" - - print(f"πŸ” Running test: {test_category}") - - model_result = load_file(model_result_json) - record_cost_latency(LEADERBOARD_TABLE, model_name, model_result) - - if is_relevance(test_category): - accuracy, total_count = single_relevance_file_runner( - handler, model_result, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - continue - - # Find the corresponding test file - prompt_file = find_file_with_suffix(PROMPT_PATH, test_category) - prompt = load_file(prompt_file) - - if is_executable(test_category): - # We only test the API with ground truth once - if not API_TESTED and api_sanity_check: - print("---- Sanity checking API status ----") - api_status_sanity_check_rest() - api_status_sanity_check_executable() - print("---- Sanity check Passed πŸ’― ----") - API_TESTED = True - - if ( - test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - and not is_rest(test_category) - ): - print( - f"---- Getting real-time execution result from ground truth for {test_category} ----" - ) - get_executable_expected_output(prompt_file) - print( - f"---- Ground truth real-time execution result obtained for {test_category} 🌟 ----" - ) - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN.append(test_category) - # Need to re-load the prompt file after getting the expected output, as the prompt file has been updated - prompt = load_file(prompt_file) - - accuracy, total_count = single_executable_file_runner( - handler, model_result, prompt, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - continue - - # Find the corresponding possible answer file - possible_answer_file = find_file_with_suffix( - POSSIBLE_ANSWER_PATH, test_category - ) - possible_answer = load_file(possible_answer_file) - accuracy, total_count = single_ast_file_runner( - handler, - model_result, - prompt, - possible_answer, - language, - test_category, - model_name, - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - # This function reads all the score files from local folder and updates the leaderboard table. - # This is helpful when you only want to run the evaluation for a subset of models and test categories. 
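(The aggregation step below is carried over unchanged into the new eval_runner.py above.) The table being updated is keyed by model name, with per-category accuracy and total count, as described next to the LEADERBOARD_TABLE definition; a sketch of the kind of mapping it holds, with illustrative field names and hypothetical numbers, omitting whatever cost and latency fields record_cost_latency adds:

    LEADERBOARD_TABLE = {
        "gorilla-openfunctions-v2": {
            "simple": {"accuracy": 0.85, "total_count": 400},
            "rest": {"accuracy": 0.80, "total_count": 70},
        },
    }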
- update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) - # Write the leaderboard table to a file - generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH) - - # Clean up the executable expected output files - # They should be re-generated the next time the evaluation is run - clean_up_executable_expected_output( - PROMPT_PATH, EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - ) - - -ARG_PARSE_MAPPING = { - "ast": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - ], - "executable": [ - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "all": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "non-python": [ - "java", - "javascript", - ], - "python": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], -} - - -INPUT_PATH = "../result/" -PROMPT_PATH = "../data/" -POSSIBLE_ANSWER_PATH = "../data/possible_answer/" -OUTPUT_PATH = "../score/" - -# A dictionary to store the results -# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count -LEADERBOARD_TABLE = {} - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Process two lists of strings.") - - # Add arguments for two lists of strings - parser.add_argument( - "--model", nargs="+", type=str, help="A list of model names to evaluate" - ) - parser.add_argument( - "--test-category", - nargs="+", - type=str, - help="A list of test categories to run the evaluation on", - ) - parser.add_argument( - "-s", - "--skip-api-sanity-check", - action="store_false", - default=True, # Default value is True, meaning the sanity check is performed unless the flag is specified - help="Skip the REST API status sanity check before running the evaluation. By default, the sanity check is performed.", - ) - - args = parser.parse_args() - - api_sanity_check = args.skip_api_sanity_check - test_categories = None - if args.test_category is not None: - test_categories = [] - for test_category in args.test_category: - if test_category in ARG_PARSE_MAPPING: - test_categories.extend(ARG_PARSE_MAPPING[test_category]) - else: - test_categories.append(test_category) - - model_names = args.model - if args.model is not None: - model_names = [] - for model_name in args.model: - # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. - # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). - # We patch it here to avoid confusing the user. - model_names.append(model_name.replace("/", "_")) - - runner(model_names, test_categories, api_sanity_check)
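Two caller-facing notes on this patch: the evaluation logic moves from bfcl/evaluate.py (emptied here) into the new bfcl/eval_checker/eval_runner.py, and the API sanity check is now opt-in via -c/--api-sanity-check, whereas the old script performed it by default unless -s/--skip-api-sanity-check was passed. When the check runs and fails, the new runner no longer aborts: BadAPIStatusError now carries the structured failure list and an error-rate string, which display_api_status_error prints at the end of the run. A minimal sketch of catching the new exception directly; illustrative only, with field access mirroring display_api_status_error above:

    from custom_exception import BadAPIStatusError
    from eval_runner_helper import api_status_sanity_check_rest

    try:
        api_status_sanity_check_rest()
    except BadAPIStatusError as e:
        # e.errors is a list of (data, status) pairs; e.error_rate is a "failed / total" string.
        first_error_type = e.errors[0][1]["error_type"] if e.errors else "n/a"
        print(f"{e.error_rate} REST API checks failed; first error type: {first_error_type}")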