Skip to content

Commit

Permalink
Add check for when English results do not exist
Browse files Browse the repository at this point in the history
  • Loading branch information
ljvmiranda921 committed Oct 2, 2024
1 parent f77eba8 commit a40c03f
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 22 deletions.
52 changes: 31 additions & 21 deletions analysis/plot_leaderboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def get_args():
parser.add_argument("--output_dir", type=Path, help="Directory to save the output plots."),
parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results-gtranslate-v2", help="HuggingFace dataset that stores the eval results.")
parser.add_argument("--force_download", action="store_true", help="If set, will redownload the dataset.")
parser.add_argument("--show_english_drop", action="store_true", help="If set, will show English drop.")
# fmt: on
return parser.parse_args()

Expand Down Expand Up @@ -59,7 +60,12 @@ def main():
model_types = leaderboard_df["Type"].unique().tolist()
for model_type in model_types:
model_type_df = leaderboard_df[leaderboard_df["Type"] == model_type]
data = model_type_df.drop(["eng_Latn", "Type", "Std"], axis=1)
columns = ["Type", "Std"]
if "eng_Latn" not in model_type_df.columns:
logging.warning(f"Language 'eng_Latn' not found for {model_type}!")
else:
columns += ["eng_Latn"]
data = model_type_df.drop(columns, axis=1)
avg_col = "Avg"
data = data[[avg_col] + [c for c in data.columns if c != avg_col]]
data = data.dropna()
Expand All @@ -76,28 +82,31 @@ def main():
ax.set_yticklabels(ax.get_yticklabels(), fontsize=16)
fig.tight_layout()
output_file = output_dir / f"leaderboard-{model_type.replace(' ', '_')}.png"
csv_output_file = output_dir / f"leaderboard-{model_type.replace(' ', '_')}.csv"
data.to_csv(csv_output_file)
fig.savefig(output_file, dpi=120)
logging.info(f"Saved to {output_file}")

# *** English drop ***
eng_drop_df = pd.DataFrame(
{
"Overall": get_eng_drop(leaderboard_df)["Percentage_Change"],
"Chat": get_eng_drop(chat_leaderboard_df)["Percentage_Change"],
"Chat Hard": get_eng_drop(chat_hard_leaderboard_df)["Percentage_Change"],
"Safety": get_eng_drop(safety_leaderboard_df)["Percentage_Change"],
"Reasoning": get_eng_drop(reasoning_leaderboard_df)["Percentage_Change"],
}
)
# Only get top-3 and bottom-3. Put bottom 3 at the top rows
top_bottom_n = pd.concat([eng_drop_df.nsmallest(3, "Overall"), eng_drop_df.nlargest(3, "Overall")])
fig, ax = plt.subplots(figsize=(9, 4))
sns.heatmap(top_bottom_n, annot=True, cmap="Reds_r", fmt=".1f", annot_kws={"size": 18}, cbar=False)
ax.xaxis.tick_top()
fig.tight_layout()
output_file = output_dir / "eng-drop-overall.png"
fig.savefig(output_file, dpi=120)
logging.info(f"Saved to {output_file}")
if args.show_english_drop:
eng_drop_df = pd.DataFrame(
{
"Overall": get_eng_drop(leaderboard_df)["Percentage_Change"],
"Chat": get_eng_drop(chat_leaderboard_df)["Percentage_Change"],
"Chat Hard": get_eng_drop(chat_hard_leaderboard_df)["Percentage_Change"],
"Safety": get_eng_drop(safety_leaderboard_df)["Percentage_Change"],
"Reasoning": get_eng_drop(reasoning_leaderboard_df)["Percentage_Change"],
}
)
# Only get top-3 and bottom-3. Put bottom 3 at the top rows
top_bottom_n = pd.concat([eng_drop_df.nsmallest(3, "Overall"), eng_drop_df.nlargest(3, "Overall")])
fig, ax = plt.subplots(figsize=(9, 4))
sns.heatmap(top_bottom_n, annot=True, cmap="Reds_r", fmt=".1f", annot_kws={"size": 18}, cbar=False)
ax.xaxis.tick_top()
fig.tight_layout()
output_file = output_dir / "eng-drop-overall.png"
fig.savefig(output_file, dpi=120)
logging.info(f"Saved to {output_file}")


def get_eng_drop(df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -134,8 +143,9 @@ def get_leaderboard(dataset: str, force_download: bool, category: Optional[str]
)

# Get the average and standard deviation, but don't include eng_Latn
lang_scores_df["Avg"] = lang_scores_df.drop(["eng_Latn", "Type"], axis=1).mean(axis=1, skipna=False)
lang_scores_df["Std"] = lang_scores_df.drop(["eng_Latn", "Type"], axis=1).std(axis=1, skipna=False)
columns = ["Type"] if "eng_Latn" not in lang_scores_df else ["eng_Latn", "Type"]
lang_scores_df["Avg"] = lang_scores_df.drop(columns, axis=1).mean(axis=1, skipna=False)
lang_scores_df["Std"] = lang_scores_df.drop(columns, axis=1).std(axis=1, skipna=False)
lang_scores_df = lang_scores_df.sort_values(by=["Type", "Avg"], ascending=False)
return lang_scores_df

Expand Down
15 changes: 14 additions & 1 deletion analysis/plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


PLOT_PARAMS = {
"text.usetex": True,
"text.usetex": False,
"xtick.labelsize": 18,
"ytick.labelsize": 18,
"legend.fontsize": 18,
Expand Down Expand Up @@ -50,6 +50,19 @@ def _compute_category_scores(results: Dict[str, float]) -> Dict[str, float]:
"subset_scores": result["subset"],
}
)
elif result.get("ref_model"):
# Most likely DPO:
category_scores = _compute_category_scores(result["extra_results"])
model_scores.append(
{
"model": result["model"],
"model_type": "DPO",
"chat_template": result["chat_template"],
"score": sum(category_scores.values()) / len(category_scores),
"category_scores": category_scores,
"subset_scores": result["extra_results"],
}
)
else:
category_scores = _compute_category_scores(result["extra_results"])
model_scores.append(
Expand Down

0 comments on commit a40c03f

Please sign in to comment.