diff --git a/.gitignore b/.gitignore index 341a836..7918c23 100644 --- a/.gitignore +++ b/.gitignore @@ -164,4 +164,5 @@ cython_debug/ data/* output/* configs/*.yml -.DS_Store \ No newline at end of file +.DS_Store +plots/* \ No newline at end of file diff --git a/analysis/__init__.py b/analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/analysis/get_human_eval.py b/analysis/get_human_eval.py index 3d5a7fb..5dce713 100644 --- a/analysis/get_human_eval.py +++ b/analysis/get_human_eval.py @@ -1,10 +1,8 @@ import argparse -from pathlib import Path import logging +from pathlib import Path import pandas as pd -from pycm import ConfusionMatrix - logging.basicConfig(level=logging.INFO) @@ -62,6 +60,5 @@ def main(): disagree_human_vs_llm.to_csv(f"{lang_code}-disagreement-human-vs-llm.csv", index=False) - if __name__ == "__main__": main() diff --git a/analysis/plot_leaderboard.py b/analysis/plot_leaderboard.py new file mode 100644 index 0000000..72621b3 --- /dev/null +++ b/analysis/plot_leaderboard.py @@ -0,0 +1,84 @@ +import argparse +import logging +from pathlib import Path + +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +from huggingface_hub import snapshot_download + +from analysis.plot_utils import get_scores, PLOT_PARAMS + +logging.basicConfig(level=logging.INFO) + +plt.rcParams.update(PLOT_PARAMS) + + +def get_args(): + # fmt: off + parser = argparse.ArgumentParser() + parser.add_argument("--output_dir", type=Path, help="Directory to save the output plots."), + parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset that stores the eval results.") + parser.add_argument("--force_download", action="store_true", help="If set, will redownload the dataset.") + # fmt: on + return parser.parse_args() + + +def main(): + args = get_args() + output_dir = Path(args.output_dir) + leaderboard_df = get_leaderboard(dataset=args.dataset, force_download=args.force_download) + + # Get average of non eng_Latn + leaderboard_df["Avg"] = leaderboard_df.drop(["eng_Latn", "Type"], axis=1).mean(axis=1, skipna=False) + leaderboard_df["Std"] = leaderboard_df.drop(["eng_Latn", "Type"], axis=1).std(axis=1, skipna=False) + leaderboard_df = leaderboard_df.sort_values(by=["Type", "Avg"], ascending=False) + + # Save per model type + model_types = leaderboard_df["Type"].unique().tolist() + for model_type in model_types: + model_type_df = leaderboard_df[leaderboard_df["Type"] == model_type] + data = model_type_df.drop(["eng_Latn", "Type", "Std"], axis=1) + avg_col = "Avg" + data = data[[avg_col] + [c for c in data.columns if c != avg_col]] + data = data.dropna() + + if "Generative" in model_type: + figsize = (24, 8) + else: + figsize = (24, 3) + + fig, ax = plt.subplots(figsize=figsize) + sns.heatmap(data, annot=True, cmap="BuPu", ax=ax, annot_kws={"size": 14}) + ax.xaxis.tick_top() + ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="left", fontsize=16) + ax.set_yticklabels(ax.get_yticklabels(), fontsize=16) + fig.tight_layout() + fig.savefig(output_dir / f"leaderboard-{model_type.replace(' ', '_')}.png", dpi=120) + + +def get_leaderboard(dataset: str, force_download: bool) -> "pd.DataFrame": + dataset_dir = Path(snapshot_download(dataset, repo_type="dataset", force_download=force_download)) + lang_folders = [d for d in dataset_dir.iterdir() if d.is_dir()] + + lang_scores = {} + # Track model type + model_type = {} + for lang_dir in lang_folders: + model_scores = get_scores(lang_dir) + lang_scores[lang_dir.name] = {score["model"]: score["score"] for score in model_scores} + for model in model_scores: + model_name = model.get("model") + if model_name not in model_type.keys(): + model_type[model_name] = model.get("model_type") + + lang_scores_df = pd.DataFrame(lang_scores).merge( + pd.Series(model_type).rename("Type"), + left_index=True, + right_index=True, + ) + return lang_scores_df + + +if __name__ == "__main__": + main() diff --git a/scripts/get_results.py b/analysis/plot_utils.py similarity index 53% rename from scripts/get_results.py rename to analysis/plot_utils.py index b3f00c6..91addd4 100644 --- a/scripts/get_results.py +++ b/analysis/plot_utils.py @@ -1,54 +1,19 @@ -import argparse import json import logging from pathlib import Path from typing import Any, Dict, List -import pandas as pd -from huggingface_hub import snapshot_download from rewardbench.constants import EXAMPLE_COUNTS, SUBSET_MAPPING logging.basicConfig(level=logging.INFO) -def get_args(): - # fmt: off - parser = argparse.ArgumentParser(description="Get evaluation results") - parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset that stores the eval results.") - parser.add_argument("--langs", nargs="*", required=False, type=str, help="If set, will only show the results for the particular language codes provided.") - parser.add_argument("--show_subsets", action="store_true", help="If set, will show subset results instead of per-category results.") - parser.add_argument("--force_download", action="store_true", help="If set, will redownload the dataset.") - # fmt: on - return parser.parse_args() - - -def main(): - args = get_args() - dataset_dir = Path(snapshot_download(args.dataset, repo_type="dataset", force_download=args.force_download)) - lang_folders = [d for d in dataset_dir.iterdir() if d.is_dir()] - - if args.langs: - logging.info(f"Only showing detailed results for the ff languages: {','.join(args.langs)}") - for lang_dir in lang_folders: - if lang_dir.name in args.langs: - model_scores = get_scores(lang_dir) - df = pd.DataFrame(model_scores) - metadata_df = df[["model", "model_type", "score"]] - key = "subset_scores" if args.show_subsets else "category_scores" - scores_df = pd.DataFrame(df[key].tolist()) - lang_scores_df = pd.concat([metadata_df, scores_df], axis=1).sort_values(by="score", ascending=False) - print(f"\n*** Results for {lang_dir.name} ***\n") - print(lang_scores_df.to_markdown(tablefmt="github", index=False)) - - else: - logging.info("Showing m-rewardbench scores for all languages") - lang_scores = {} - for lang_dir in lang_folders: - model_scores = get_scores(lang_dir) - lang_scores[lang_dir.name] = {score["model"]: score["score"] for score in model_scores} - - lang_scores_df = pd.DataFrame(lang_scores) - print(lang_scores_df.to_markdown(tablefmt="github")) +PLOT_PARAMS = { + "text.usetex": True, + "xtick.labelsize": 18, + "ytick.labelsize": 18, + "legend.fontsize": 18, +} def get_scores(lang_dir: Path) -> List[Dict[str, Any]]: @@ -98,7 +63,3 @@ def _compute_category_scores(results: Dict[str, float]) -> Dict[str, float]: } ) return model_scores - - -if __name__ == "__main__": - main() diff --git a/analysis/print_results.py b/analysis/print_results.py new file mode 100644 index 0000000..e934057 --- /dev/null +++ b/analysis/print_results.py @@ -0,0 +1,52 @@ +import argparse +import logging +from pathlib import Path + +import pandas as pd +from huggingface_hub import snapshot_download + +from analysis.plot_utils import get_scores + + +def get_args(): + # fmt: off + parser = argparse.ArgumentParser(description="Get evaluation results") + parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset that stores the eval results.") + parser.add_argument("--langs", nargs="*", required=False, type=str, help="If set, will only show the results for the particular language codes provided.") + parser.add_argument("--show_subsets", action="store_true", help="If set, will show subset results instead of per-category results.") + parser.add_argument("--force_download", action="store_true", help="If set, will redownload the dataset.") + # fmt: on + return parser.parse_args() + + +def main(): + args = get_args() + dataset_dir = Path(snapshot_download(args.dataset, repo_type="dataset", force_download=args.force_download)) + lang_folders = [d for d in dataset_dir.iterdir() if d.is_dir()] + + if args.langs: + logging.info(f"Only showing detailed results for the ff languages: {','.join(args.langs)}") + for lang_dir in lang_folders: + if lang_dir.name in args.langs: + model_scores = get_scores(lang_dir) + df = pd.DataFrame(model_scores) + metadata_df = df[["model", "model_type", "score"]] + key = "subset_scores" if args.show_subsets else "category_scores" + scores_df = pd.DataFrame(df[key].tolist()) + lang_scores_df = pd.concat([metadata_df, scores_df], axis=1).sort_values(by="score", ascending=False) + print(f"\n*** Results for {lang_dir.name} ***\n") + print(lang_scores_df.to_markdown(tablefmt="github", index=False)) + + else: + logging.info("Showing m-rewardbench scores for all languages") + lang_scores = {} + for lang_dir in lang_folders: + model_scores = get_scores(lang_dir) + lang_scores[lang_dir.name] = {score["model"]: score["score"] for score in model_scores} + + lang_scores_df = pd.DataFrame(lang_scores) + print(lang_scores_df.to_markdown(tablefmt="github")) + + +if __name__ == "__main__": + main()