From 893c9afeb920a60383815dfe0854d7ae020014f5 Mon Sep 17 00:00:00 2001 From: Devansh Amin Date: Mon, 8 Jul 2024 16:24:30 -0400 Subject: [PATCH] Make `eval_checker` consistent with `main` branch by merging (#496) --- .../bfcl/eval_checker/checker.py | 29 +- .../bfcl/eval_checker/custom_exception.py | 8 +- .../bfcl/eval_checker/eval_runner.py | 535 ++++++++++++++++++ .../bfcl/eval_checker/eval_runner_helper.py | 52 +- .../bfcl/evaluate.py | 518 ----------------- 5 files changed, 607 insertions(+), 535 deletions(-) create mode 100644 berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py index 7a64bc3bf..ee9562145 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/checker.py @@ -1,5 +1,3 @@ -from js_type_converter import js_type_converter -from java_type_converter import java_type_converter from model_handler.constant import ( UNDERSCORE_TO_DOT, JAVA_TYPE_CONVERSION, @@ -12,6 +10,11 @@ import time import json +# We switch to conditional import for the following two imports to avoid unnecessary installations. +# User doesn't need to setup the tree-sitter packages if they are not running the test for that language. +# from js_type_converter import js_type_converter +# from java_type_converter import java_type_converter + PYTHON_TYPE_MAPPING = { "string": str, "integer": int, @@ -362,9 +365,19 @@ def simple_function_checker( nested_type_converted = None if language == "Java": + from java_type_converter import java_type_converter + expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description] if expected_type_description in JAVA_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:java" + return result + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: nested_type = param_details[param]["items"]["type"] nested_type_converted = JAVA_TYPE_CONVERSION[nested_type] @@ -375,9 +388,19 @@ def simple_function_checker( value = java_type_converter(value, expected_type_description) elif language == "JavaScript": + from js_type_converter import js_type_converter + expected_type_converted = JS_TYPE_CONVERSION[expected_type_description] if expected_type_description in JS_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." 
+ ) + result["error_type"] = "type_error:js" + return result + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: nested_type = param_details[param]["items"]["type"] nested_type_converted = JS_TYPE_CONVERSION[nested_type] @@ -945,4 +968,4 @@ def exec_checker(decoded_result: list, func_description: dict, test_category: st func_description["execution_result"][0], func_description["execution_result_type"][0], False, - ) + ) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py index e30fe81c5..3504862d8 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/custom_exception.py @@ -1,10 +1,10 @@ class NoAPIKeyError(Exception): def __init__(self): - self.message = "Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." + self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." super().__init__(self.message) class BadAPIStatusError(Exception): - def __init__(self, message): - self.message = message - super().__init__(self.message) \ No newline at end of file + def __init__(self, errors, error_rate): + self.errors = errors + self.error_rate = error_rate \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py new file mode 100644 index 000000000..dd45c5dd4 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py @@ -0,0 +1,535 @@ +import sys + +sys.path.append("../") + +from checker import ast_checker, exec_checker, executable_checker_rest +from custom_exception import BadAPIStatusError +from eval_runner_helper import * +from tqdm import tqdm +import argparse + + +# NOTE: This file should be run in the `eval_checker` directory + + +def single_executable_file_runner( + handler, model_result, prompt, model_name, test_category +): + assert len(model_result) == len(prompt) + + result = [] + correct_count = 0 + for i in tqdm(range(len(model_result)), desc="Running tests"): + raw_result = model_result[i]["result"] + try: + decoded_result = handler.decode_execute(raw_result) + except Exception as e: + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [f"Failed to decode executable. {str(e)}"], + "error_type": "executable_decoder:decoder_failed", + "prompt": prompt[i], + "model_result_raw": raw_result, + } + ) + continue + + if "rest" in test_category: + # REST is always single-functioned. Therefore we take the first one and pass it to the REST checker. + if not is_rest_format_output(decoded_result): + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [ + "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." 
+ ], + "error_type": "executable_decoder:rest_wrong_output_format", + "prompt": prompt[i], + "model_result_raw": str(raw_result), + "model_result_decoded": str(decoded_result), + } + ) + continue + + checker_result = executable_checker_rest(decoded_result[0], i) + + else: + if not is_executable_format_output(decoded_result): + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [ + "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." + ], + "error_type": "executable_decoder:wrong_output_format", + "prompt": prompt[i], + "model_result_raw": str(raw_result), + "model_result_decoded": str(decoded_result), + } + ) + continue + + prompt_item = prompt[i] + checker_result = exec_checker(decoded_result, prompt_item, test_category) + + if checker_result["valid"]: + correct_count += 1 + else: + temp = {} + temp["id"] = i + 1 + temp["model_name"] = model_name + temp["test_category"] = test_category + temp["valid"] = checker_result["valid"] + temp["error"] = checker_result["error"] + temp["error_type"] = checker_result["error_type"] + temp["prompt"] = prompt[i] + temp["model_result_raw"] = raw_result + temp["model_result_decoded"] = decoded_result + if "model_executed_output" in checker_result: + temp["model_executed_output"] = checker_result["model_executed_output"] + result.append(temp) + + accuracy = correct_count / len(model_result) + result.insert( + 0, + { + "accuracy": accuracy, + "correct_count": correct_count, + "total_count": len(model_result), + }, + ) + output_file_name = test_category + "_score.json" + output_file_dir = os.path.join(OUTPUT_PATH, model_name) + write_list_of_dicts_to_file(output_file_name, result, output_file_dir) + + return accuracy, len(model_result) + + +def single_relevance_file_runner(handler, model_result, model_name, test_category): + + result = [] + correct_count = 0 + for i in range(len(model_result)): + model_result_item = model_result[i]["result"] + success = False + decoded_result = None + + try: + decoded_result = handler.decode_ast(model_result_item, language="Python") + success = False + if is_empty_output(decoded_result): + success = True + + except Exception as e: + success = True + + if success: + correct_count += 1 + else: + temp = {} + temp["id"] = i + 1 + temp["model_name"] = model_name + temp["test_category"] = test_category + temp["valid"] = success + temp["error"] = [ + f"Valid syntax. Successfully decode AST when it should not." + ] + temp["error_type"] = "relevance_error:decoder_success" + temp["model_result"] = model_result_item + temp["decoded_result"] = decoded_result + + result.append(temp) + + accuracy = correct_count / len(model_result) + result.insert( + 0, + { + "accuracy": accuracy, + "correct_count": correct_count, + "total_count": len(model_result), + }, + ) + output_file_name = test_category + "_score.json" + output_file_dir = os.path.join(OUTPUT_PATH, model_name) + write_list_of_dicts_to_file(output_file_name, result, output_file_dir) + + return accuracy, len(model_result) + + +def single_ast_file_runner( + handler, model_result, prompt, possible_answer, language, test_category, model_name +): + assert ( + len(model_result) == len(prompt) == len(possible_answer) + ), "The length of the model result does not match the length of the prompt or possible answer. Please check the input files for completeness." 
+ + result = [] + correct_count = 0 + for i in range(len(model_result)): + model_result_item = model_result[i]["result"] + prompt_item = prompt[i]["function"] + possible_answer_item = possible_answer[i] + + try: + model_result_item_raw = model_result_item + model_result_item = handler.decode_ast(model_result_item, language) + except Exception as e: + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [f"Invalid syntax. Failed to decode AST. {str(e)}"], + "error_type": "ast_decoder:decoder_failed", + "prompt": prompt[i], + "model_result_raw": model_result_item_raw, + "possible_answer": possible_answer_item, + } + ) + continue + + decoder_output_valid = is_function_calling_format_output(model_result_item) + if not decoder_output_valid: + result.append( + { + "id": i + 1, + "model_name": model_name, + "test_category": test_category, + "valid": False, + "error": [ + "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." + ], + "error_type": "ast_decoder:decoder_wrong_output_format", + "prompt": prompt[i], + "model_result_raw": str(model_result_item_raw), + "model_result_decoded": str(model_result_item), + "possible_answer": possible_answer_item, + } + ) + continue + + checker_result = ast_checker( + prompt_item, + model_result_item, + possible_answer_item, + language, + test_category, + model_name, + ) + + if checker_result["valid"]: + correct_count += 1 + else: + temp = {} + temp["id"] = i + 1 + temp["model_name"] = model_name + temp["test_category"] = test_category + temp["valid"] = checker_result["valid"] + temp["error"] = checker_result["error"] + temp["error_type"] = checker_result["error_type"] + temp["prompt"] = prompt[i] + temp["model_result_raw"] = model_result_item_raw + temp["model_result_decoded"] = model_result_item + temp["possible_answer"] = possible_answer_item + result.append(temp) + + accuracy = correct_count / len(model_result) + result.insert( + 0, + { + "accuracy": accuracy, + "correct_count": correct_count, + "total_count": len(model_result), + }, + ) + output_file_name = test_category + "_score.json" + output_file_dir = os.path.join(OUTPUT_PATH, model_name) + write_list_of_dicts_to_file(output_file_name, result, output_file_dir) + + return accuracy, len(model_result) + + +#### Main runner function #### +def runner(model_names, test_categories, api_sanity_check): + + # A flag to indicate if the API has been tested. + # We should always test the API with ground truth first before running the executable tests. + # Sometimes the API may not be working as expected and we want to catch that before running the evaluation to ensure the results are accurate. + API_TESTED = False + API_STATUS_ERROR_REST = None + API_STATUS_ERROR_EXECUTABLE = None + + # Before running the executable evaluation, we need to get the expected output from the ground truth. + # So we need a list of all the test categories that we have ran the ground truth evaluation on. + # We only get the expected output once for each test category. 
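Concretely, get_executable_expected_output (defined in eval_runner_helper.py) obtains the real-time execution results of the ground-truth calls and writes them back into the prompt file, which exec_checker later reads through the execution_result and execution_result_type fields referenced in checker.py. An illustrative shape of one updated prompt entry; the field values and the result-type label shown here are hypothetical:

    example_prompt_entry = {
        "question": "...",                          # original test question (placeholder)
        "function": [...],                          # original function definitions (placeholder)
        "execution_result": [120.0],                # added from the ground-truth run
        "execution_result_type": ["exact_match"],   # tells exec_checker how to compare
    }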
+ EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] + + # Get a list of all entries in the folder + entries = os.scandir(INPUT_PATH) + + # Filter out the subdirectories + subdirs = [entry.path for entry in entries if entry.is_dir()] + + # Traverse each subdirectory + for subdir in subdirs: + + model_name = subdir.split(INPUT_PATH)[1] + if model_names is not None and model_name not in model_names: + continue + + model_name_escaped = model_name.replace("_", "/") + + files = [ + f + for f in os.listdir(subdir) + if os.path.isfile(os.path.join(subdir, f)) and not f.startswith(".") + ] + # Check if there is only one file and that file is 'result.json' + # If so, this is an OSS model result file and we need to special process it first + if len(files) == 1 and files[0] == "result.json": + result_json_file_path = os.path.join(subdir, "result.json") + oss_file_formatter(result_json_file_path, subdir) + print( + f"Detected OSS model: {model_name}. result.json has been split into individual test category files." + ) + + # Pattern to match JSON files in this subdirectory + json_files_pattern = os.path.join(subdir, "*.json") + + print(f"🦍 Model: {model_name}") + + # Find and process all JSON files in the subdirectory + for model_result_json in glob.glob(json_files_pattern): + + if os.path.basename(model_result_json) == "result.json": + continue + + test_category = extract_after_test(model_result_json) + if test_categories is not None and test_category not in test_categories: + continue + + handler = get_handler(model_name_escaped) + + # We don't evaluate chatable and SQL models in our current leaderboard + if is_chatable(test_category) or is_sql(test_category): + continue + + language = "Python" + if is_java(test_category): + language = "Java" + if is_js(test_category): + language = "JavaScript" + + print(f"πŸ” Running test: {test_category}") + + model_result = load_file(model_result_json) + record_cost_latency(LEADERBOARD_TABLE, model_name, model_result) + + if is_relevance(test_category): + accuracy, total_count = single_relevance_file_runner( + handler, model_result, model_name, test_category + ) + record_result( + LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count + ) + print(f"βœ… Test completed: {test_category}. 
🎯 Accuracy: {accuracy}") + continue + + # Find the corresponding test file + prompt_file = find_file_with_suffix(PROMPT_PATH, test_category) + prompt = load_file(prompt_file) + + if is_executable(test_category): + # We only test the API with ground truth once + if not API_TESTED and api_sanity_check: + print("---- Sanity checking API status ----") + try: + api_status_sanity_check_rest() + except BadAPIStatusError as e: + API_STATUS_ERROR_REST = e + + try: + api_status_sanity_check_executable() + except BadAPIStatusError as e: + API_STATUS_ERROR_EXECUTABLE = e + + display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=True) + print("Continuing evaluation...") + + API_TESTED = True + + if ( + test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN + and not is_rest(test_category) + ): + print( + f"---- Getting real-time execution result from ground truth for {test_category} ----" + ) + get_executable_expected_output(prompt_file) + print( + f"---- Ground truth real-time execution result obtained for {test_category} 🌟 ----" + ) + EXECUTABLE_TEST_CATEGORIES_HAVE_RUN.append(test_category) + # Need to re-load the prompt file after getting the expected output, as the prompt file has been updated + prompt = load_file(prompt_file) + + accuracy, total_count = single_executable_file_runner( + handler, model_result, prompt, model_name, test_category + ) + record_result( + LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count + ) + print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") + + continue + + # Find the corresponding possible answer file + possible_answer_file = find_file_with_suffix( + POSSIBLE_ANSWER_PATH, test_category + ) + possible_answer = load_file(possible_answer_file) + accuracy, total_count = single_ast_file_runner( + handler, + model_result, + prompt, + possible_answer, + language, + test_category, + model_name, + ) + record_result( + LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count + ) + print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") + + # This function reads all the score files from local folder and updates the leaderboard table. + # This is helpful when you only want to run the evaluation for a subset of models and test categories. + update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) + # Write the leaderboard table to a file + generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH) + + # Clean up the executable expected output files + # They should be re-generated the next time the evaluation is run + clean_up_executable_expected_output( + PROMPT_PATH, EXECUTABLE_TEST_CATEGORIES_HAVE_RUN + ) + + display_api_status_error(API_STATUS_ERROR_REST, API_STATUS_ERROR_EXECUTABLE, display_success=False) + + print(f"🏁 Evaluation completed. 
See {os.path.abspath(OUTPUT_PATH + 'data.csv')} for evaluation results.") + + +ARG_PARSE_MAPPING = { + "ast": [ + "simple", + "multiple_function", + "parallel_function", + "parallel_multiple_function", + "java", + "javascript", + "relevance", + ], + "executable": [ + "executable_simple", + "executable_multiple_function", + "executable_parallel_function", + "executable_parallel_multiple_function", + "rest", + ], + "all": [ + "simple", + "multiple_function", + "parallel_function", + "parallel_multiple_function", + "java", + "javascript", + "relevance", + "executable_simple", + "executable_multiple_function", + "executable_parallel_function", + "executable_parallel_multiple_function", + "rest", + ], + "non-python": [ + "java", + "javascript", + ], + "python": [ + "simple", + "multiple_function", + "parallel_function", + "parallel_multiple_function", + "relevance", + "executable_simple", + "executable_multiple_function", + "executable_parallel_function", + "executable_parallel_multiple_function", + "rest", + ], +} + + +INPUT_PATH = "../result/" +PROMPT_PATH = "../data/" +POSSIBLE_ANSWER_PATH = "../data/possible_answer/" +OUTPUT_PATH = "../score/" + +# A dictionary to store the results +# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count +LEADERBOARD_TABLE = {} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process two lists of strings.") + + # Add arguments for two lists of strings + parser.add_argument( + "--model", nargs="+", type=str, help="A list of model names to evaluate" + ) + parser.add_argument( + "--test-category", + nargs="+", + type=str, + help="A list of test categories to run the evaluation on", + ) + parser.add_argument( + "-c", + "--api-sanity-check", + action="store_true", + default=False, # Default value is False, meaning the sanity check is skipped unless the flag is specified + help="Perform the REST API status sanity check before running the evaluation. By default, the sanity check is skipped.", + ) + + args = parser.parse_args() + + api_sanity_check = args.api_sanity_check + test_categories = None + if args.test_category is not None: + test_categories = [] + for test_category in args.test_category: + if test_category in ARG_PARSE_MAPPING: + test_categories.extend(ARG_PARSE_MAPPING[test_category]) + else: + test_categories.append(test_category) + + model_names = args.model + if args.model is not None: + model_names = [] + for model_name in args.model: + # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. + # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). + # We patch it here to avoid confusing the user. 
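A hedged usage example (run from the eval_checker directory, as noted at the top of this file; the category alias comes from ARG_PARSE_MAPPING above, and the model name is one added to eval_runner_helper.py later in this patch):

    python eval_runner.py --model THUDM/glm-4-9b-chat --test-category ast

This expands "ast" through ARG_PARSE_MAPPING into the individual AST categories, and the slash in the model name is rewritten to an underscore just below, so results are read from ../result/THUDM_glm-4-9b-chat/ and scores are written to ../score/THUDM_glm-4-9b-chat/. Adding -c (--api-sanity-check) would additionally verify the live APIs before any executable test category is scored.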
+ model_names.append(model_name.replace("/", "_")) + + runner(model_names, test_categories, api_sanity_check) \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py index be44faf66..83e1e8917 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py @@ -372,6 +372,12 @@ "https://huggingface.co/nvidia/nemotron-4-340b-instruct", "NVIDIA", "nvidia-open-model-license" + ], + "THUDM/glm-4-9b-chat": [ + "GLM-4-9b-Chat (FC)", + "https://huggingface.co/THUDM/glm-4-9b-chat", + "THUDM", + "glm-4" ] } @@ -467,6 +473,7 @@ "meta-llama/Meta-Llama-3-8B-Instruct": 73, "meta-llama/Meta-Llama-3-70B-Instruct": 307, "gorilla-openfunctions-v2": 83, + "THUDM/glm-4-9b-chat": 223 } @@ -479,9 +486,10 @@ "meetkai/functionary-small-v2.4-FC", "snowflake/arctic", "nvidia/nemotron-4-340b-instruct", + "THUDM/glm-4-9b-chat", ] -# Price got from Azure, 22.032 per hour for 8 V100, Pay As You Go Total Price +# Price got from AZure, 22.032 per hour for 8 V100, Pay As You Go Total Price # Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/ V100_x8_PRICE_PER_HOUR = 22.032 @@ -630,9 +638,7 @@ def api_status_sanity_check_rest(): errors.append((data, status)) if correct_count != len(ground_truth_replaced): - [print("Data:", data, "\nError:", status["error"]) for data, status in errors] - error_msg = f"API Status Test Failed for REST Section. {len(ground_truth_replaced) - correct_count} out of {len(ground_truth_replaced)} API behaviors are not as expected. Be careful with executable test category results; they may be inaccurate." - raise BadAPIStatusError(error_msg) + raise BadAPIStatusError(errors, f"{len(ground_truth_replaced) - correct_count} / {len(ground_truth_replaced)}") def api_status_sanity_check_executable(): @@ -656,11 +662,37 @@ def api_status_sanity_check_executable(): errors.append((data, status)) if correct_count != len(ground_truth): - [print("Data:", data, "\nError:", status["error"]) for data, status in errors] - error_msg = f"API Status Test Failed for Executable Section. {len(ground_truth) - correct_count} out of {len(ground_truth)} API behaviors are not as expected. Be careful with executable test category results; they may be inaccurate." - raise BadAPIStatusError(error_msg) - - + raise BadAPIStatusError(errors, f"{len(ground_truth) - correct_count} / {len(ground_truth)}") + + +def display_api_status_error(rest_error, executable_error, display_success=False): + if not rest_error and not executable_error: + if display_success: + print("🟒 All API Status Test Passed!") + return None + + RED_FONT = "\033[91m" + RESET = "\033[0m" + + print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n") + + if rest_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test category (REST). Please contact API provider.\n") + print(f"{rest_error.error_rate} APIs affected:\n") + for data, status in rest_error.errors: + print(f" - Test Case: {data['ground_truth']}") + print(f" Error Type: {status['error_type']}\n") + + if executable_error: + print(f"❗️ Warning: Unable to verify health of executable APIs used in executable test categories (Non-REST). 
Please contact API provider.\n") + print(f"{executable_error.error_rate} APIs affected:\n") + for data, status in executable_error.errors: + print(f" - Test Case: {data['ground_truth'][0]}") + print(f" Error Type: {status['error_type']}\n") + + print(f"{RED_FONT}{'-' * 100}\n{RESET}") + + def get_executable_expected_output(prompt_file_path): # Before we run the evaluation, we need to add the "execution_result" field to the prompt file, using the ground truth data. prompt_content = load_file(prompt_file_path) @@ -995,4 +1027,4 @@ def collapse_json_objects(file_path): for obj in objects: json_obj = json.loads(obj) compact_json = json.dumps(json_obj, separators=(",", ":")) - out_file.write(compact_json + "\n") + out_file.write(compact_json + "\n") \ No newline at end of file diff --git a/berkeley-function-call-leaderboard/bfcl/evaluate.py b/berkeley-function-call-leaderboard/bfcl/evaluate.py index ec0b557c1..e69de29bb 100644 --- a/berkeley-function-call-leaderboard/bfcl/evaluate.py +++ b/berkeley-function-call-leaderboard/bfcl/evaluate.py @@ -1,518 +0,0 @@ -import sys - -sys.path.append("../") - -from checker import ast_checker, exec_checker, executable_checker_rest -from eval_runner_helper import * -from tqdm import tqdm -import argparse - - -# NOTE: This file should be run in the `eval_checker` directory - - -def single_executable_file_runner( - handler, model_result, prompt, model_name, test_category -): - assert len(model_result) == len(prompt) - - result = [] - correct_count = 0 - for i in tqdm(range(len(model_result)), desc="Running tests"): - raw_result = model_result[i]["result"] - try: - decoded_result = handler.decode_execute(raw_result) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Failed to decode executable. {str(e)}"], - "error_type": "executable_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": raw_result, - } - ) - continue - - if "rest" in test_category: - # REST is always single-functioned. Therefore we take the first one and pass it to the REST checker. - if not is_rest_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "executable_decoder:rest_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - checker_result = executable_checker_rest(decoded_result[0], i) - - else: - if not is_executable_format_output(decoded_result): - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." 
- ], - "error_type": "executable_decoder:wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(raw_result), - "model_result_decoded": str(decoded_result), - } - ) - continue - - prompt_item = prompt[i] - checker_result = exec_checker(decoded_result, prompt_item, test_category) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = raw_result - temp["model_result_decoded"] = decoded_result - if "model_executed_output" in checker_result: - temp["model_executed_output"] = checker_result["model_executed_output"] - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_relevance_file_runner(handler, model_result, model_name, test_category): - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - success = False - decoded_result = None - - try: - decoded_result = handler.decode_ast(model_result_item, language="Python") - success = False - if is_empty_output(decoded_result): - success = True - - except Exception as e: - success = True - - if success: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = success - temp["error"] = [ - f"Valid syntax. Successfully decode AST when it should not." - ] - temp["error_type"] = "relevance_error:decoder_success" - temp["model_result"] = model_result_item - temp["decoded_result"] = decoded_result - - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -def single_ast_file_runner( - handler, model_result, prompt, possible_answer, language, test_category, model_name -): - assert ( - len(model_result) == len(prompt) == len(possible_answer) - ), "The length of the model result does not match the length of the prompt or possible answer. Please check the input files for completeness." - - result = [] - correct_count = 0 - for i in range(len(model_result)): - model_result_item = model_result[i]["result"] - prompt_item = prompt[i]["function"] - possible_answer_item = possible_answer[i] - - try: - model_result_item_raw = model_result_item - model_result_item = handler.decode_ast(model_result_item, language) - except Exception as e: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [f"Invalid syntax. Failed to decode AST. 
{str(e)}"], - "error_type": "ast_decoder:decoder_failed", - "prompt": prompt[i], - "model_result_raw": model_result_item_raw, - "possible_answer": possible_answer_item, - } - ) - continue - - decoder_output_valid = is_function_calling_format_output(model_result_item) - if not decoder_output_valid: - result.append( - { - "id": i + 1, - "model_name": model_name, - "test_category": test_category, - "valid": False, - "error": [ - "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." - ], - "error_type": "ast_decoder:decoder_wrong_output_format", - "prompt": prompt[i], - "model_result_raw": str(model_result_item_raw), - "model_result_decoded": str(model_result_item), - "possible_answer": possible_answer_item, - } - ) - continue - - checker_result = ast_checker( - prompt_item, - model_result_item, - possible_answer_item, - language, - test_category, - model_name, - ) - - if checker_result["valid"]: - correct_count += 1 - else: - temp = {} - temp["id"] = i + 1 - temp["model_name"] = model_name - temp["test_category"] = test_category - temp["valid"] = checker_result["valid"] - temp["error"] = checker_result["error"] - temp["error_type"] = checker_result["error_type"] - temp["prompt"] = prompt[i] - temp["model_result_raw"] = model_result_item_raw - temp["model_result_decoded"] = model_result_item - temp["possible_answer"] = possible_answer_item - result.append(temp) - - accuracy = correct_count / len(model_result) - result.insert( - 0, - { - "accuracy": accuracy, - "correct_count": correct_count, - "total_count": len(model_result), - }, - ) - output_file_name = test_category + "_score.json" - output_file_dir = os.path.join(OUTPUT_PATH, model_name) - write_list_of_dicts_to_file(output_file_name, result, output_file_dir) - - return accuracy, len(model_result) - - -#### Main runner function #### -def runner(model_names, test_categories, api_sanity_check): - - # A flag to indicate if the API has been tested. - # We should always test the API with ground truth first before running the executable tests. - # Sometimes the API may not be working as expected and we want to catch that before running the evaluation to ensure the results are accurate. - API_TESTED = False - - # Before running the executable evaluation, we need to get the expected output from the ground truth. - # So we need a list of all the test categories that we have ran the ground truth evaluation on. - # We only get the expected output once for each test category. - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN = [] - - # Get a list of all entries in the folder - entries = os.scandir(INPUT_PATH) - - # Filter out the subdirectories - subdirs = [entry.path for entry in entries if entry.is_dir()] - - # Traverse each subdirectory - for subdir in subdirs: - - model_name = subdir.split(INPUT_PATH)[1] - if model_names is not None and model_name not in model_names: - continue - - model_name_escaped = model_name.replace("_", "/") - - files = [ - f - for f in os.listdir(subdir) - if os.path.isfile(os.path.join(subdir, f)) and not f.startswith(".") - ] - # Check if there is only one file and that file is 'result.json' - # If so, this is an OSS model result file and we need to special process it first - if len(files) == 1 and files[0] == "result.json": - result_json_file_path = os.path.join(subdir, "result.json") - oss_file_formatter(result_json_file_path, subdir) - print( - f"Detected OSS model: {model_name}. result.json has been split into individual test category files." 
- ) - - # Pattern to match JSON files in this subdirectory - json_files_pattern = os.path.join(subdir, "*.json") - - print(f"🦍 Model: {model_name}") - - # Find and process all JSON files in the subdirectory - for model_result_json in glob.glob(json_files_pattern): - - if os.path.basename(model_result_json) == "result.json": - continue - - test_category = extract_after_test(model_result_json) - if test_categories is not None and test_category not in test_categories: - continue - - handler = get_handler(model_name_escaped) - - # We don't evaluate chatable and SQL models in our current leaderboard - if is_chatable(test_category) or is_sql(test_category): - continue - - language = "Python" - if is_java(test_category): - language = "Java" - if is_js(test_category): - language = "JavaScript" - - print(f"πŸ” Running test: {test_category}") - - model_result = load_file(model_result_json) - record_cost_latency(LEADERBOARD_TABLE, model_name, model_result) - - if is_relevance(test_category): - accuracy, total_count = single_relevance_file_runner( - handler, model_result, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - continue - - # Find the corresponding test file - prompt_file = find_file_with_suffix(PROMPT_PATH, test_category) - prompt = load_file(prompt_file) - - if is_executable(test_category): - # We only test the API with ground truth once - if not API_TESTED and api_sanity_check: - print("---- Sanity checking API status ----") - api_status_sanity_check_rest() - api_status_sanity_check_executable() - print("---- Sanity check Passed πŸ’― ----") - API_TESTED = True - - if ( - test_category not in EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - and not is_rest(test_category) - ): - print( - f"---- Getting real-time execution result from ground truth for {test_category} ----" - ) - get_executable_expected_output(prompt_file) - print( - f"---- Ground truth real-time execution result obtained for {test_category} 🌟 ----" - ) - EXECUTABLE_TEST_CATEGORIES_HAVE_RUN.append(test_category) - # Need to re-load the prompt file after getting the expected output, as the prompt file has been updated - prompt = load_file(prompt_file) - - accuracy, total_count = single_executable_file_runner( - handler, model_result, prompt, model_name, test_category - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - continue - - # Find the corresponding possible answer file - possible_answer_file = find_file_with_suffix( - POSSIBLE_ANSWER_PATH, test_category - ) - possible_answer = load_file(possible_answer_file) - accuracy, total_count = single_ast_file_runner( - handler, - model_result, - prompt, - possible_answer, - language, - test_category, - model_name, - ) - record_result( - LEADERBOARD_TABLE, model_name, test_category, accuracy, total_count - ) - print(f"βœ… Test completed: {test_category}. 🎯 Accuracy: {accuracy}") - - # This function reads all the score files from local folder and updates the leaderboard table. - # This is helpful when you only want to run the evaluation for a subset of models and test categories. 
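(The aggregation step below is carried over unchanged into the new eval_runner.py above.) The table being updated is keyed by model name, with per-category accuracy and total count, as described next to the LEADERBOARD_TABLE definition; a sketch of the kind of mapping it holds, with illustrative field names and hypothetical numbers, omitting whatever cost and latency fields record_cost_latency adds:

    LEADERBOARD_TABLE = {
        "gorilla-openfunctions-v2": {
            "simple": {"accuracy": 0.85, "total_count": 400},
            "rest": {"accuracy": 0.80, "total_count": 70},
        },
    }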
- update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH) - # Write the leaderboard table to a file - generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH) - - # Clean up the executable expected output files - # They should be re-generated the next time the evaluation is run - clean_up_executable_expected_output( - PROMPT_PATH, EXECUTABLE_TEST_CATEGORIES_HAVE_RUN - ) - - -ARG_PARSE_MAPPING = { - "ast": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - ], - "executable": [ - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "all": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "java", - "javascript", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], - "non-python": [ - "java", - "javascript", - ], - "python": [ - "simple", - "multiple_function", - "parallel_function", - "parallel_multiple_function", - "relevance", - "executable_simple", - "executable_multiple_function", - "executable_parallel_function", - "executable_parallel_multiple_function", - "rest", - ], -} - - -INPUT_PATH = "../result/" -PROMPT_PATH = "../data/" -POSSIBLE_ANSWER_PATH = "../data/possible_answer/" -OUTPUT_PATH = "../score/" - -# A dictionary to store the results -# Key is model name, value is a dictionary with keys as test category and values as a dictionary with accuracy and total count -LEADERBOARD_TABLE = {} - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Process two lists of strings.") - - # Add arguments for two lists of strings - parser.add_argument( - "--model", nargs="+", type=str, help="A list of model names to evaluate" - ) - parser.add_argument( - "--test-category", - nargs="+", - type=str, - help="A list of test categories to run the evaluation on", - ) - parser.add_argument( - "-s", - "--skip-api-sanity-check", - action="store_false", - default=True, # Default value is True, meaning the sanity check is performed unless the flag is specified - help="Skip the REST API status sanity check before running the evaluation. By default, the sanity check is performed.", - ) - - args = parser.parse_args() - - api_sanity_check = args.skip_api_sanity_check - test_categories = None - if args.test_category is not None: - test_categories = [] - for test_category in args.test_category: - if test_category in ARG_PARSE_MAPPING: - test_categories.extend(ARG_PARSE_MAPPING[test_category]) - else: - test_categories.append(test_category) - - model_names = args.model - if args.model is not None: - model_names = [] - for model_name in args.model: - # Runner takes in the model name that contains "_", instead of "/", for the sake of file path issues. - # This is differnet than the model name format that the generation script "openfunctions_evaluation.py" takes in (where the name contains "/"). - # We patch it here to avoid confusing the user. - model_names.append(model_name.replace("/", "_")) - - runner(model_names, test_categories, api_sanity_check)
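Two caller-facing notes on this patch: the evaluation logic moves from bfcl/evaluate.py (emptied here) into the new bfcl/eval_checker/eval_runner.py, and the API sanity check is now opt-in via -c/--api-sanity-check, whereas the old script performed it by default unless -s/--skip-api-sanity-check was passed. When the check runs and fails, the new runner no longer aborts: BadAPIStatusError now carries the structured failure list and an error-rate string, which display_api_status_error prints at the end of the run. A minimal sketch of catching the new exception directly; illustrative only, with field access mirroring display_api_status_error above:

    from custom_exception import BadAPIStatusError
    from eval_runner_helper import api_status_sanity_check_rest

    try:
        api_status_sanity_check_rest()
    except BadAPIStatusError as e:
        # e.errors is a list of (data, status) pairs; e.error_rate is a "failed / total" string.
        first_error_type = e.errors[0][1]["error_type"] if e.errors else "n/a"
        print(f"{e.error_rate} REST API checks failed; first error type: {first_error_type}")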