Skip to content

Commit

Permalink
Add new plots (#40)
Browse files Browse the repository at this point in the history
  • Loading branch information
ljvmiranda921 authored Oct 7, 2024
1 parent bff6702 commit 9c7c4f6
Show file tree
Hide file tree
Showing 3 changed files with 332 additions and 21 deletions.
7 changes: 7 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Jupyter notebook

# Mark notebooks as text so git applies end-of-line normalization
*.ipynb text

# Mark notebooks as documentation so GitHub Linguist leaves them out of the repository language statistics
*.ipynb linguist-documentation
57 changes: 36 additions & 21 deletions analysis/plot_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,10 @@ def plot_main_heatmap(
df.pop("eng_Latn")

df = df.sort_values(by="Avg_Multilingual", ascending=False).head(10).reset_index(drop=True)
data = df[[col for col in df.columns if col not in ("Model_Type", "Avg_Multilingual")]]
data = df[[col for col in df.columns if col not in ["Model_Type"]]].rename(columns={"Avg_Multilingual": "Avg"})
data = data.set_index("Model")
data = data * 100
data = data[sorted(data.columns)]

fig, ax = plt.subplots(1, 1, figsize=figsize)
sns.heatmap(data, ax=ax, cmap="YlGn", annot=True, annot_kws={"size": 14}, fmt=".2f", cbar=False)
Expand All @@ -100,41 +101,45 @@ def plot_eng_drop_line(
output_path: Path,
figsize: Optional[tuple[int, int]] = (18, 5),
top_n: Optional[int] = None,
):
from scipy.stats import pearsonr

from scipy.stats import pearsonr, spearmanr

df = pd.read_csv(input_path)
df = df[["Model", "eng_Latn", "Avg_Multilingual"]]
df = df[["Model", "Model_Type", "eng_Latn", "Avg_Multilingual"]]
df = df.sort_values(by="Avg_Multilingual", ascending=False).reset_index(drop=True)
data = df.set_index("Model").dropna() * 100
data = df.set_index("Model").dropna()
data = data[["eng_Latn", "Avg_Multilingual"]] * 100
model_types = df.dropna().pop("Model_Type")
if top_n:
logging.info(f"Showing top {top_n}")
data = data.head(top_n)

model_types = model_types[:top_n]

fig, ax = plt.subplots(figsize=figsize)
mrewardbench_scores = data["Avg_Multilingual"]
rewardbench_scores = data["eng_Latn"]
r, _ = pearsonr(mrewardbench_scores, rewardbench_scores)
ax.scatter(mrewardbench_scores, rewardbench_scores, marker="o", s=30, color="black")

min_val = min(mrewardbench_scores.min(), rewardbench_scores.min())
max_val = max(mrewardbench_scores.max(), rewardbench_scores.max())
ax.plot(
[min_val, max_val],
[min_val, max_val],
linestyle="--",
color="black",
)
ax.set_xlabel(f"M-RewardBench (Pearson r: {r:.2f})")
ax.set_ylabel("RewardBench (Lambert et al., 2024)")
res = spearmanr(mrewardbench_scores, rewardbench_scores)

colormap = {"Generative RM": "green", "Sequence Classifier": "blue", "DPO": "red"}
colors = [colormap[model_type] for model_type in model_types]

ax.scatter(rewardbench_scores, mrewardbench_scores, marker="o", s=30, color=colors)

min_val = min(rewardbench_scores.min(), mrewardbench_scores.min())
max_val = max(rewardbench_scores.max(), mrewardbench_scores.max())
ax.plot([min_val, max_val], [min_val, max_val], linestyle="--", color="black", alpha=0.25)
ax.set_xlabel("RewardBench (Lambert et al., 2024)")
ax.set_ylabel("M-RewardBench")
ax.set_aspect("equal")

model_names = [model.split("/")[1] for model in data.index]
texts = [
ax.text(
mrewardbench_scores[idx],
rewardbench_scores[idx],
data.index[idx],
fontsize=11,
mrewardbench_scores[idx],
model_names[idx],
fontsize=12,
)
for idx in range(len(data))
]
Expand All @@ -145,6 +150,16 @@ def plot_eng_drop_line(
arrowprops=dict(arrowstyle="->", color="gray"),
)

ax.text(
0.1,
0.9,
s=f"Pearson-r: {r:.2f}\nSpearman-r: {res.statistic:.2f}",
fontsize=14,
transform=ax.transAxes,
verticalalignment="top",
bbox=dict(facecolor="white", edgecolor="black", boxstyle="round,pad=0.5"),
)

ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
plt.tight_layout()
Expand Down
289 changes: 289 additions & 0 deletions notebooks/analysis.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analysis Notebook"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR = Path(\"../plots\")"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"leaderboard = pd.read_csv(DATA_DIR / \"Aya_RM_Benchmarks - full.csv\")\n",
"top_ten = leaderboard.sort_values(by=\"Avg_Multilingual\", ascending=False).reset_index(drop=True).head(10)[\"Model\"].to_list()\n",
"model_type = leaderboard.sort_values(by=\"Avg_Multilingual\", ascending=False).reset_index(drop=True).head(10)[\"Model_Type\"].to_list()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"def sort_dataframe_by_order(df, column_name, order_list):\n",
" \"\"\"\n",
" Filter and sort a DataFrame based on a predefined order of values in a specified column.\n",
" \n",
" Parameters:\n",
" df (pd.DataFrame): Input DataFrame\n",
" column_name (str): Name of the column to sort by\n",
" order_list (list): List of values defining the desired order\n",
" \n",
" Returns:\n",
" pd.DataFrame: Filtered and sorted DataFrame, with `column_name` renamed to \"Model\"\n",
" \"\"\"\n",
" # Filter and sort\n",
" df_result = df[df[column_name].isin(order_list)].copy()\n",
" order_dict = {val: index for index, val in enumerate(order_list)}\n",
" df_result['sort_order'] = df_result[column_name].map(order_dict)\n",
" return df_result.sort_values('sort_order').drop('sort_order', axis=1).reset_index(drop=True).rename(columns={column_name: \"Model\"})"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"results_category = {\n",
" \"chat\": sort_dataframe_by_order(pd.read_csv(DATA_DIR / \"chat_leaderboard.csv\"), column_name=\"Unnamed: 0\", order_list=top_ten),\n",
" \"chat_hard\":sort_dataframe_by_order(pd.read_csv(DATA_DIR / \"chat-hard_leaderboard.csv\"), column_name=\"Unnamed: 0\", order_list=top_ten),\n",
" \"safety\":sort_dataframe_by_order(pd.read_csv(DATA_DIR / \"safety_leaderboard.csv\"), column_name=\"Unnamed: 0\", order_list=top_ten),\n",
" \"reasoning\":sort_dataframe_by_order(pd.read_csv(DATA_DIR / \"reasoning_leaderboard.csv\"), column_name=\"Unnamed: 0\", order_list=top_ten),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"delta_df = {\n",
" \"Model\": top_ten,\n",
" \"Model_Type\": model_type,\n",
"}\n",
"for category, results_df in results_category.items():\n",
" results_df[\"Avg\"] = results_df[\"Avg\"] * 100\n",
" results_df[\"eng_Latn\"] = results_df[\"eng_Latn\"] * 100\n",
" results_df[\"delta\"] = results_df[\"Avg\"] - results_df[\"eng_Latn\"]\n",
" delta_df[category] = results_df[\"delta\"].to_list()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Model</th>\n",
" <th>Model_Type</th>\n",
" <th>chat</th>\n",
" <th>chat_hard</th>\n",
" <th>safety</th>\n",
" <th>reasoning</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>openai/gpt-4-turbo-2024-04-09</td>\n",
" <td>Generative RM</td>\n",
" <td>-1.546242</td>\n",
" <td>-3.545359</td>\n",
" <td>-3.218002</td>\n",
" <td>0.844814</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>openai/gpt-4o-2024-05-13</td>\n",
" <td>Generative RM</td>\n",
" <td>-2.760678</td>\n",
" <td>-5.991787</td>\n",
" <td>-4.148092</td>\n",
" <td>-2.834079</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>google/gemma-2-9b-it</td>\n",
" <td>Generative RM</td>\n",
" <td>-0.581029</td>\n",
" <td>-6.465797</td>\n",
" <td>-4.774244</td>\n",
" <td>-0.624124</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>LxzGordon/URM-LLaMa-3.1-8B</td>\n",
" <td>Sequence Classifier</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>meta-llama/Meta-Llama-3.1-70B-Instruct</td>\n",
" <td>Generative RM</td>\n",
" <td>-1.815071</td>\n",
" <td>-11.615303</td>\n",
" <td>-8.513508</td>\n",
" <td>-2.865601</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>meta-llama/Meta-Llama-3-70B-Instruct</td>\n",
" <td>Generative RM</td>\n",
" <td>-2.388924</td>\n",
" <td>-9.046439</td>\n",
" <td>2.899366</td>\n",
" <td>-2.099771</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>CIR-AMS/BTRM_Qwen2_7b_0613</td>\n",
" <td>Sequence Classifier</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>cohere/command-r-plus-08-2024</td>\n",
" <td>Generative RM</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>allenai/tulu-2-dpo-13b</td>\n",
" <td>DPO</td>\n",
" <td>-20.385063</td>\n",
" <td>-2.337985</td>\n",
" <td>-11.460820</td>\n",
" <td>1.036116</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>cohere/c4ai-aya-23-35b</td>\n",
" <td>Generative RM</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Model Model_Type chat \\\n",
"0 openai/gpt-4-turbo-2024-04-09 Generative RM -1.546242 \n",
"1 openai/gpt-4o-2024-05-13 Generative RM -2.760678 \n",
"2 google/gemma-2-9b-it Generative RM -0.581029 \n",
"3 LxzGordon/URM-LLaMa-3.1-8B Sequence Classifier NaN \n",
"4 meta-llama/Meta-Llama-3.1-70B-Instruct Generative RM -1.815071 \n",
"5 meta-llama/Meta-Llama-3-70B-Instruct Generative RM -2.388924 \n",
"6 CIR-AMS/BTRM_Qwen2_7b_0613 Sequence Classifier NaN \n",
"7 cohere/command-r-plus-08-2024 Generative RM NaN \n",
"8 allenai/tulu-2-dpo-13b DPO -20.385063 \n",
"9 cohere/c4ai-aya-23-35b Generative RM NaN \n",
"\n",
" chat_hard safety reasoning \n",
"0 -3.545359 -3.218002 0.844814 \n",
"1 -5.991787 -4.148092 -2.834079 \n",
"2 -6.465797 -4.774244 -0.624124 \n",
"3 NaN NaN NaN \n",
"4 -11.615303 -8.513508 -2.865601 \n",
"5 -9.046439 2.899366 -2.099771 \n",
"6 NaN NaN NaN \n",
"7 NaN NaN NaN \n",
"8 -2.337985 -11.460820 1.036116 \n",
"9 NaN NaN NaN "
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(delta_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 9c7c4f6

Please sign in to comment.