From a40c03f1f7cb92c0c835028664d6ebcb8e3e9edf Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Wed, 2 Oct 2024 13:49:26 -0700 Subject: [PATCH] Add check when english results does not exist --- analysis/plot_leaderboard.py | 52 +++++++++++++++++++++--------------- analysis/plot_utils.py | 15 ++++++++++- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/analysis/plot_leaderboard.py b/analysis/plot_leaderboard.py index 60fd152..edb5bfa 100644 --- a/analysis/plot_leaderboard.py +++ b/analysis/plot_leaderboard.py @@ -21,6 +21,7 @@ def get_args(): parser.add_argument("--output_dir", type=Path, help="Directory to save the output plots."), parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results-gtranslate-v2", help="HuggingFace dataset that stores the eval results.") parser.add_argument("--force_download", action="store_true", help="If set, will redownload the dataset.") + parser.add_argument("--show_english_drop", action="store_true", help="If set, will show English drop.") # fmt: on return parser.parse_args() @@ -59,7 +60,12 @@ def main(): model_types = leaderboard_df["Type"].unique().tolist() for model_type in model_types: model_type_df = leaderboard_df[leaderboard_df["Type"] == model_type] - data = model_type_df.drop(["eng_Latn", "Type", "Std"], axis=1) + columns = ["Type", "Std"] + if "eng_Latn" not in model_type_df.columns: + logging.warning(f"Language 'eng_Latn' not found for {model_type}!") + else: + columns += ["eng_Latn"] + data = model_type_df.drop(columns, axis=1) avg_col = "Avg" data = data[[avg_col] + [c for c in data.columns if c != avg_col]] data = data.dropna() @@ -76,28 +82,31 @@ def main(): ax.set_yticklabels(ax.get_yticklabels(), fontsize=16) fig.tight_layout() output_file = output_dir / f"leaderboard-{model_type.replace(' ', '_')}.png" + csv_output_file = output_dir / f"leaderboard-{model_type.replace(' ', '_')}.csv" + data.to_csv(csv_output_file) fig.savefig(output_file, dpi=120) logging.info(f"Saved to {output_file}") # *** English drop *** - eng_drop_df = pd.DataFrame( - { - "Overall": get_eng_drop(leaderboard_df)["Percentage_Change"], - "Chat": get_eng_drop(chat_leaderboard_df)["Percentage_Change"], - "Chat Hard": get_eng_drop(chat_hard_leaderboard_df)["Percentage_Change"], - "Safety": get_eng_drop(safety_leaderboard_df)["Percentage_Change"], - "Reasoning": get_eng_drop(reasoning_leaderboard_df)["Percentage_Change"], - } - ) - # Only get top-3 and bottom-3. Put bottom 3 at the top rows - top_bottom_n = pd.concat([eng_drop_df.nsmallest(3, "Overall"), eng_drop_df.nlargest(3, "Overall")]) - fig, ax = plt.subplots(figsize=(9, 4)) - sns.heatmap(top_bottom_n, annot=True, cmap="Reds_r", fmt=".1f", annot_kws={"size": 18}, cbar=False) - ax.xaxis.tick_top() - fig.tight_layout() - output_file = output_dir / "eng-drop-overall.png" - fig.savefig(output_file, dpi=120) - logging.info(f"Saved to {output_file}") + if args.show_english_drop: + eng_drop_df = pd.DataFrame( + { + "Overall": get_eng_drop(leaderboard_df)["Percentage_Change"], + "Chat": get_eng_drop(chat_leaderboard_df)["Percentage_Change"], + "Chat Hard": get_eng_drop(chat_hard_leaderboard_df)["Percentage_Change"], + "Safety": get_eng_drop(safety_leaderboard_df)["Percentage_Change"], + "Reasoning": get_eng_drop(reasoning_leaderboard_df)["Percentage_Change"], + } + ) + # Only get top-3 and bottom-3. Put bottom 3 at the top rows + top_bottom_n = pd.concat([eng_drop_df.nsmallest(3, "Overall"), eng_drop_df.nlargest(3, "Overall")]) + fig, ax = plt.subplots(figsize=(9, 4)) + sns.heatmap(top_bottom_n, annot=True, cmap="Reds_r", fmt=".1f", annot_kws={"size": 18}, cbar=False) + ax.xaxis.tick_top() + fig.tight_layout() + output_file = output_dir / "eng-drop-overall.png" + fig.savefig(output_file, dpi=120) + logging.info(f"Saved to {output_file}") def get_eng_drop(df: pd.DataFrame) -> pd.DataFrame: @@ -134,8 +143,9 @@ def get_leaderboard(dataset: str, force_download: bool, category: Optional[str] ) # Get average but dont include eng_Latn - lang_scores_df["Avg"] = lang_scores_df.drop(["eng_Latn", "Type"], axis=1).mean(axis=1, skipna=False) - lang_scores_df["Std"] = lang_scores_df.drop(["eng_Latn", "Type"], axis=1).std(axis=1, skipna=False) + columns = ["Type"] if "eng_Latn" not in lang_scores_df else ["eng_Latn", "Type"] + lang_scores_df["Avg"] = lang_scores_df.drop(columns, axis=1).mean(axis=1, skipna=False) + lang_scores_df["Std"] = lang_scores_df.drop(columns, axis=1).std(axis=1, skipna=False) lang_scores_df = lang_scores_df.sort_values(by=["Type", "Avg"], ascending=False) return lang_scores_df diff --git a/analysis/plot_utils.py b/analysis/plot_utils.py index 6d1e631..1080f00 100644 --- a/analysis/plot_utils.py +++ b/analysis/plot_utils.py @@ -7,7 +7,7 @@ PLOT_PARAMS = { - "text.usetex": True, + "text.usetex": False, "xtick.labelsize": 18, "ytick.labelsize": 18, "legend.fontsize": 18, @@ -50,6 +50,19 @@ def _compute_category_scores(results: Dict[str, float]) -> Dict[str, float]: "subset_scores": result["subset"], } ) + elif result.get("ref_model"): + # Most likely DPO: + category_scores = _compute_category_scores(result["extra_results"]) + model_scores.append( + { + "model": result["model"], + "model_type": "DPO", + "chat_template": result["chat_template"], + "score": sum(category_scores.values()) / len(category_scores), + "category_scores": category_scores, + "subset_scores": result["extra_results"], + } + ) else: category_scores = _compute_category_scores(result["extra_results"]) model_scores.append(