Add check for when English results do not exist

for-ai · Oct 2, 2024 · a40c03f · a40c03f
1 parent f77eba8
commit a40c03f
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 22 deletions.
diff --git a/analysis/plot_leaderboard.py b/analysis/plot_leaderboard.py
@@ -21,6 +21,7 @@ def get_args():
     parser.add_argument("--output_dir", type=Path, help="Directory to save the output plots."),
     parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results-gtranslate-v2", help="HuggingFace dataset that stores the eval results.")
     parser.add_argument("--force_download", action="store_true", help="If set, will redownload the dataset.")
+    parser.add_argument("--show_english_drop", action="store_true", help="If set, will show English drop.")
     # fmt: on
     return parser.parse_args()
 
@@ -59,7 +60,12 @@ def main():
     model_types = leaderboard_df["Type"].unique().tolist()
     for model_type in model_types:
         model_type_df = leaderboard_df[leaderboard_df["Type"] == model_type]
-        data = model_type_df.drop(["eng_Latn", "Type", "Std"], axis=1)
+        columns = ["Type", "Std"]
+        if "eng_Latn" not in model_type_df.columns:
+            logging.warning(f"Language 'eng_Latn' not found for {model_type}!")
+        else:
+            columns += ["eng_Latn"]
+        data = model_type_df.drop(columns, axis=1)
         avg_col = "Avg"
         data = data[[avg_col] + [c for c in data.columns if c != avg_col]]
         data = data.dropna()
@@ -76,28 +82,31 @@ def main():
         ax.set_yticklabels(ax.get_yticklabels(), fontsize=16)
         fig.tight_layout()
         output_file = output_dir / f"leaderboard-{model_type.replace(' ', '_')}.png"
+        csv_output_file = output_dir / f"leaderboard-{model_type.replace(' ', '_')}.csv"
+        data.to_csv(csv_output_file)
         fig.savefig(output_file, dpi=120)
         logging.info(f"Saved to {output_file}")
 
     # *** English drop ***
-    eng_drop_df = pd.DataFrame(
-        {
-            "Overall": get_eng_drop(leaderboard_df)["Percentage_Change"],
-            "Chat": get_eng_drop(chat_leaderboard_df)["Percentage_Change"],
-            "Chat Hard": get_eng_drop(chat_hard_leaderboard_df)["Percentage_Change"],
-            "Safety": get_eng_drop(safety_leaderboard_df)["Percentage_Change"],
-            "Reasoning": get_eng_drop(reasoning_leaderboard_df)["Percentage_Change"],
-        }
-    )
-    # Only get top-3 and bottom-3. Put bottom 3 at the top rows
-    top_bottom_n = pd.concat([eng_drop_df.nsmallest(3, "Overall"), eng_drop_df.nlargest(3, "Overall")])
-    fig, ax = plt.subplots(figsize=(9, 4))
-    sns.heatmap(top_bottom_n, annot=True, cmap="Reds_r", fmt=".1f", annot_kws={"size": 18}, cbar=False)
-    ax.xaxis.tick_top()
-    fig.tight_layout()
-    output_file = output_dir / "eng-drop-overall.png"
-    fig.savefig(output_file, dpi=120)
-    logging.info(f"Saved to {output_file}")
+    if args.show_english_drop:
+        eng_drop_df = pd.DataFrame(
+            {
+                "Overall": get_eng_drop(leaderboard_df)["Percentage_Change"],
+                "Chat": get_eng_drop(chat_leaderboard_df)["Percentage_Change"],
+                "Chat Hard": get_eng_drop(chat_hard_leaderboard_df)["Percentage_Change"],
+                "Safety": get_eng_drop(safety_leaderboard_df)["Percentage_Change"],
+                "Reasoning": get_eng_drop(reasoning_leaderboard_df)["Percentage_Change"],
+            }
+        )
+        # Only get top-3 and bottom-3. Put bottom 3 at the top rows
+        top_bottom_n = pd.concat([eng_drop_df.nsmallest(3, "Overall"), eng_drop_df.nlargest(3, "Overall")])
+        fig, ax = plt.subplots(figsize=(9, 4))
+        sns.heatmap(top_bottom_n, annot=True, cmap="Reds_r", fmt=".1f", annot_kws={"size": 18}, cbar=False)
+        ax.xaxis.tick_top()
+        fig.tight_layout()
+        output_file = output_dir / "eng-drop-overall.png"
+        fig.savefig(output_file, dpi=120)
+        logging.info(f"Saved to {output_file}")
 
 
 def get_eng_drop(df: pd.DataFrame) -> pd.DataFrame:
@@ -134,8 +143,9 @@ def get_leaderboard(dataset: str, force_download: bool, category: Optional[str]
     )
 
     # Get average but dont include eng_Latn
-    lang_scores_df["Avg"] = lang_scores_df.drop(["eng_Latn", "Type"], axis=1).mean(axis=1, skipna=False)
-    lang_scores_df["Std"] = lang_scores_df.drop(["eng_Latn", "Type"], axis=1).std(axis=1, skipna=False)
+    columns = ["Type"] if "eng_Latn" not in lang_scores_df else ["eng_Latn", "Type"]
+    lang_scores_df["Avg"] = lang_scores_df.drop(columns, axis=1).mean(axis=1, skipna=False)
+    lang_scores_df["Std"] = lang_scores_df.drop(columns, axis=1).std(axis=1, skipna=False)
     lang_scores_df = lang_scores_df.sort_values(by=["Type", "Avg"], ascending=False)
     return lang_scores_df
 

diff --git a/analysis/plot_utils.py b/analysis/plot_utils.py
@@ -7,7 +7,7 @@
 
 
 PLOT_PARAMS = {
-    "text.usetex": True,
+    "text.usetex": False,
     "xtick.labelsize": 18,
     "ytick.labelsize": 18,
     "legend.fontsize": 18,
@@ -50,6 +50,19 @@ def _compute_category_scores(results: Dict[str, float]) -> Dict[str, float]:
                     "subset_scores": result["subset"],
                 }
             )
+        elif result.get("ref_model"):
+            # Most likely DPO:
+            category_scores = _compute_category_scores(result["extra_results"])
+            model_scores.append(
+                {
+                    "model": result["model"],
+                    "model_type": "DPO",
+                    "chat_template": result["chat_template"],
+                    "score": sum(category_scores.values()) / len(category_scores),
+                    "category_scores": category_scores,
+                    "subset_scores": result["extra_results"],
+                }
+            )
         else:
             category_scores = _compute_category_scores(result["extra_results"])
             model_scores.append(