-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
bff6702
commit 9c7c4f6
Showing
3 changed files
with
332 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Jupyter notebook | ||
|
||
# For text count | ||
*.ipynb text | ||
|
||
# To ignore it use below | ||
*.ipynb linguist-documentation |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,289 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Analysis Notebook" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from pathlib import Path" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"DATA_DIR = Path(\"../plots\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 50, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"leaderboard = pd.read_csv(DATA_DIR / \"Aya_RM_Benchmarks - full.csv\")\n", | ||
"top_ten = leaderboard.sort_values(by=\"Avg_Multilingual\", ascending=False).reset_index(drop=True).head(10)[\"Model\"].to_list()\n", | ||
"model_type = leaderboard.sort_values(by=\"Avg_Multilingual\", ascending=False).reset_index(drop=True).head(10)[\"Model_Type\"].to_list()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 41, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def sort_dataframe_by_order(df, column_name, order_list):\n", | ||
" \"\"\"\n", | ||
" Filter and sort a DataFrame based on a predefined order of values in a specified column.\n", | ||
" \n", | ||
" Parameters:\n", | ||
" df (pd.DataFrame): Input DataFrame\n", | ||
" column_name (str): Name of the column to sort by\n", | ||
" order_list (list): List of values defining the desired order\n", | ||
" \n", | ||
" Returns:\n", | ||
" pd.DataFrame: Filtered and sorted DataFrame\n", | ||
" \"\"\"\n", | ||
" # Filter and sort\n", | ||
" df_result = df[df[column_name].isin(order_list)].copy()\n", | ||
" order_dict = {val: index for index, val in enumerate(order_list)}\n", | ||
" df_result['sort_order'] = df_result[column_name].map(order_dict)\n", | ||
" return df_result.sort_values('sort_order').drop('sort_order', axis=1).reset_index(drop=True).rename(columns={column_name: \"Model\"})" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 46, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"results_category = {\n", | ||
" \"chat\": sort_dataframe_by_order(pd.read_csv(DATA_DIR / \"chat_leaderboard.csv\"), column_name=\"Unnamed: 0\", order_list=top_ten),\n", | ||
" \"chat_hard\":sort_dataframe_by_order(pd.read_csv(DATA_DIR / \"chat-hard_leaderboard.csv\"), column_name=\"Unnamed: 0\", order_list=top_ten),\n", | ||
" \"safety\":sort_dataframe_by_order(pd.read_csv(DATA_DIR / \"safety_leaderboard.csv\"), column_name=\"Unnamed: 0\", order_list=top_ten),\n", | ||
" \"reasoning\":sort_dataframe_by_order(pd.read_csv(DATA_DIR / \"reasoning_leaderboard.csv\"), column_name=\"Unnamed: 0\", order_list=top_ten),\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 58, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"delta_df = {\n", | ||
" \"Model\": top_ten,\n", | ||
" \"Model_Type\": model_type,\n", | ||
"}\n", | ||
"for category, results_df in results_category.items():\n", | ||
" results_df[\"Avg\"] = results_df[\"Avg\"] * 100\n", | ||
" results_df[\"eng_Latn\"] = results_df[\"eng_Latn\"] * 100\n", | ||
" results_df[\"delta\"] = results_df[\"Avg\"] - results_df[\"eng_Latn\"]\n", | ||
" delta_df[category] = results_df[\"delta\"].to_list()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 59, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>Model</th>\n", | ||
" <th>Model_Type</th>\n", | ||
" <th>chat</th>\n", | ||
" <th>chat_hard</th>\n", | ||
" <th>safety</th>\n", | ||
" <th>reasoning</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>openai/gpt-4-turbo-2024-04-09</td>\n", | ||
" <td>Generative RM</td>\n", | ||
" <td>-1.546242</td>\n", | ||
" <td>-3.545359</td>\n", | ||
" <td>-3.218002</td>\n", | ||
" <td>0.844814</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>openai/gpt-4o-2024-05-13</td>\n", | ||
" <td>Generative RM</td>\n", | ||
" <td>-2.760678</td>\n", | ||
" <td>-5.991787</td>\n", | ||
" <td>-4.148092</td>\n", | ||
" <td>-2.834079</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>google/gemma-2-9b-it</td>\n", | ||
" <td>Generative RM</td>\n", | ||
" <td>-0.581029</td>\n", | ||
" <td>-6.465797</td>\n", | ||
" <td>-4.774244</td>\n", | ||
" <td>-0.624124</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>LxzGordon/URM-LLaMa-3.1-8B</td>\n", | ||
" <td>Sequence Classifier</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>meta-llama/Meta-Llama-3.1-70B-Instruct</td>\n", | ||
" <td>Generative RM</td>\n", | ||
" <td>-1.815071</td>\n", | ||
" <td>-11.615303</td>\n", | ||
" <td>-8.513508</td>\n", | ||
" <td>-2.865601</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>5</th>\n", | ||
" <td>meta-llama/Meta-Llama-3-70B-Instruct</td>\n", | ||
" <td>Generative RM</td>\n", | ||
" <td>-2.388924</td>\n", | ||
" <td>-9.046439</td>\n", | ||
" <td>2.899366</td>\n", | ||
" <td>-2.099771</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>6</th>\n", | ||
" <td>CIR-AMS/BTRM_Qwen2_7b_0613</td>\n", | ||
" <td>Sequence Classifier</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>7</th>\n", | ||
" <td>cohere/command-r-plus-08-2024</td>\n", | ||
" <td>Generative RM</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>8</th>\n", | ||
" <td>allenai/tulu-2-dpo-13b</td>\n", | ||
" <td>DPO</td>\n", | ||
" <td>-20.385063</td>\n", | ||
" <td>-2.337985</td>\n", | ||
" <td>-11.460820</td>\n", | ||
" <td>1.036116</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>9</th>\n", | ||
" <td>cohere/c4ai-aya-23-35b</td>\n", | ||
" <td>Generative RM</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" <td>NaN</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" Model Model_Type chat \\\n", | ||
"0 openai/gpt-4-turbo-2024-04-09 Generative RM -1.546242 \n", | ||
"1 openai/gpt-4o-2024-05-13 Generative RM -2.760678 \n", | ||
"2 google/gemma-2-9b-it Generative RM -0.581029 \n", | ||
"3 LxzGordon/URM-LLaMa-3.1-8B Sequence Classifier NaN \n", | ||
"4 meta-llama/Meta-Llama-3.1-70B-Instruct Generative RM -1.815071 \n", | ||
"5 meta-llama/Meta-Llama-3-70B-Instruct Generative RM -2.388924 \n", | ||
"6 CIR-AMS/BTRM_Qwen2_7b_0613 Sequence Classifier NaN \n", | ||
"7 cohere/command-r-plus-08-2024 Generative RM NaN \n", | ||
"8 allenai/tulu-2-dpo-13b DPO -20.385063 \n", | ||
"9 cohere/c4ai-aya-23-35b Generative RM NaN \n", | ||
"\n", | ||
" chat_hard safety reasoning \n", | ||
"0 -3.545359 -3.218002 0.844814 \n", | ||
"1 -5.991787 -4.148092 -2.834079 \n", | ||
"2 -6.465797 -4.774244 -0.624124 \n", | ||
"3 NaN NaN NaN \n", | ||
"4 -11.615303 -8.513508 -2.865601 \n", | ||
"5 -9.046439 2.899366 -2.099771 \n", | ||
"6 NaN NaN NaN \n", | ||
"7 NaN NaN NaN \n", | ||
"8 -2.337985 -11.460820 1.036116 \n", | ||
"9 NaN NaN NaN " | ||
] | ||
}, | ||
"execution_count": 59, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"pd.DataFrame(delta_df)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |