Add some charts (#26)
ljvmiranda921 authored Aug 23, 2024
1 parent 89b076e commit 2e757de
Showing 6 changed files with 145 additions and 50 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -164,4 +164,5 @@ cython_debug/
 data/*
 output/*
 configs/*.yml
-.DS_Store
+.DS_Store
+plots/*
Empty file added analysis/__init__.py
5 changes: 1 addition & 4 deletions analysis/get_human_eval.py
@@ -1,10 +1,8 @@
 import argparse
-from pathlib import Path
 import logging
+from pathlib import Path
 
 import pandas as pd
-from pycm import ConfusionMatrix
-
 
 logging.basicConfig(level=logging.INFO)
 

@@ -62,6 +60,5 @@ def main():
     disagree_human_vs_llm.to_csv(f"{lang_code}-disagreement-human-vs-llm.csv", index=False)
 
 
-
 if __name__ == "__main__":
     main()
84 changes: 84 additions & 0 deletions analysis/plot_leaderboard.py
@@ -0,0 +1,84 @@
import argparse
import logging
from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from huggingface_hub import snapshot_download

from analysis.plot_utils import get_scores, PLOT_PARAMS

logging.basicConfig(level=logging.INFO)

plt.rcParams.update(PLOT_PARAMS)


def get_args():
# fmt: off
parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", type=Path, help="Directory to save the output plots."),
parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset that stores the eval results.")
parser.add_argument("--force_download", action="store_true", help="If set, will redownload the dataset.")
# fmt: on
return parser.parse_args()


def main():
args = get_args()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)  # ensure the plot directory exists
leaderboard_df = get_leaderboard(dataset=args.dataset, force_download=args.force_download)

    # Average and standard deviation across all languages except eng_Latn
leaderboard_df["Avg"] = leaderboard_df.drop(["eng_Latn", "Type"], axis=1).mean(axis=1, skipna=False)
leaderboard_df["Std"] = leaderboard_df.drop(["eng_Latn", "Type"], axis=1).std(axis=1, skipna=False)
leaderboard_df = leaderboard_df.sort_values(by=["Type", "Avg"], ascending=False)

# Save per model type
model_types = leaderboard_df["Type"].unique().tolist()
for model_type in model_types:
model_type_df = leaderboard_df[leaderboard_df["Type"] == model_type]
data = model_type_df.drop(["eng_Latn", "Type", "Std"], axis=1)
avg_col = "Avg"
data = data[[avg_col] + [c for c in data.columns if c != avg_col]]
data = data.dropna()

if "Generative" in model_type:
figsize = (24, 8)
else:
figsize = (24, 3)

fig, ax = plt.subplots(figsize=figsize)
sns.heatmap(data, annot=True, cmap="BuPu", ax=ax, annot_kws={"size": 14})
ax.xaxis.tick_top()
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="left", fontsize=16)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=16)
fig.tight_layout()
fig.savefig(output_dir / f"leaderboard-{model_type.replace(' ', '_')}.png", dpi=120)


def get_leaderboard(dataset: str, force_download: bool) -> "pd.DataFrame":
dataset_dir = Path(snapshot_download(dataset, repo_type="dataset", force_download=force_download))
lang_folders = [d for d in dataset_dir.iterdir() if d.is_dir()]

lang_scores = {}
# Track model type
model_type = {}
for lang_dir in lang_folders:
model_scores = get_scores(lang_dir)
lang_scores[lang_dir.name] = {score["model"]: score["score"] for score in model_scores}
for model in model_scores:
model_name = model.get("model")
            if model_name not in model_type:
model_type[model_name] = model.get("model_type")

lang_scores_df = pd.DataFrame(lang_scores).merge(
pd.Series(model_type).rename("Type"),
left_index=True,
right_index=True,
)
return lang_scores_df


if __name__ == "__main__":
main()
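
For reference, a minimal sketch of the frame shape get_leaderboard is expected to produce and how the heatmap step consumes it. The model names, language columns, and scores below are made up for illustration:

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Hypothetical leaderboard frame: one row per model, one column per
# language code, plus a "Type" column -- mirroring get_leaderboard().
toy = pd.DataFrame(
    {
        "eng_Latn": [0.81, 0.74],
        "deu_Latn": [0.78, 0.70],
        "fra_Latn": [0.79, 0.69],
        "Type": ["Sequence Classifier", "Generative RM"],
    },
    index=["model-a", "model-b"],
)

# Same steps as main(): average the non-English columns, then plot.
toy["Avg"] = toy.drop(["eng_Latn", "Type"], axis=1).mean(axis=1, skipna=False)
data = toy.drop(["eng_Latn", "Type"], axis=1)

fig, ax = plt.subplots(figsize=(6, 2))
sns.heatmap(data, annot=True, cmap="BuPu", ax=ax)
fig.tight_layout()
fig.savefig("toy-leaderboard.png", dpi=120)
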
51 changes: 6 additions & 45 deletions scripts/get_results.py → analysis/plot_utils.py
@@ -1,54 +1,19 @@
-import argparse
 import json
 import logging
 from pathlib import Path
 from typing import Any, Dict, List
 
-import pandas as pd
-from huggingface_hub import snapshot_download
 from rewardbench.constants import EXAMPLE_COUNTS, SUBSET_MAPPING
 
 logging.basicConfig(level=logging.INFO)
 
 
-def get_args():
-    # fmt: off
-    parser = argparse.ArgumentParser(description="Get evaluation results")
-    parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset that stores the eval results.")
-    parser.add_argument("--langs", nargs="*", required=False, type=str, help="If set, will only show the results for the particular language codes provided.")
-    parser.add_argument("--show_subsets", action="store_true", help="If set, will show subset results instead of per-category results.")
-    parser.add_argument("--force_download", action="store_true", help="If set, will redownload the dataset.")
-    # fmt: on
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    dataset_dir = Path(snapshot_download(args.dataset, repo_type="dataset", force_download=args.force_download))
-    lang_folders = [d for d in dataset_dir.iterdir() if d.is_dir()]
-
-    if args.langs:
-        logging.info(f"Only showing detailed results for the ff languages: {','.join(args.langs)}")
-        for lang_dir in lang_folders:
-            if lang_dir.name in args.langs:
-                model_scores = get_scores(lang_dir)
-                df = pd.DataFrame(model_scores)
-                metadata_df = df[["model", "model_type", "score"]]
-                key = "subset_scores" if args.show_subsets else "category_scores"
-                scores_df = pd.DataFrame(df[key].tolist())
-                lang_scores_df = pd.concat([metadata_df, scores_df], axis=1).sort_values(by="score", ascending=False)
-                print(f"\n*** Results for {lang_dir.name} ***\n")
-                print(lang_scores_df.to_markdown(tablefmt="github", index=False))
-
-    else:
-        logging.info("Showing m-rewardbench scores for all languages")
-        lang_scores = {}
-        for lang_dir in lang_folders:
-            model_scores = get_scores(lang_dir)
-            lang_scores[lang_dir.name] = {score["model"]: score["score"] for score in model_scores}
-
-        lang_scores_df = pd.DataFrame(lang_scores)
-        print(lang_scores_df.to_markdown(tablefmt="github"))
+PLOT_PARAMS = {
+    "text.usetex": True,
+    "xtick.labelsize": 18,
+    "ytick.labelsize": 18,
+    "legend.fontsize": 18,
+}
 
 
 def get_scores(lang_dir: Path) -> List[Dict[str, Any]]:
@@ -98,7 +63,3 @@ def _compute_category_scores(results: Dict[str, float]) -> Dict[str, float]:
             }
         )
     return model_scores
-
-
-if __name__ == "__main__":
-    main()
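
Since PLOT_PARAMS enables text.usetex, any script that applies it needs a working LaTeX toolchain. A short sketch of the intended usage, with the usetex opt-out shown as an assumption rather than part of the committed config:

import matplotlib.pyplot as plt

from analysis.plot_utils import PLOT_PARAMS

# Apply the shared style globally; figures created afterwards inherit
# the tick and legend sizes and LaTeX text rendering.
plt.rcParams.update(PLOT_PARAMS)

# Without a LaTeX installation, rendering can be switched back to
# matplotlib's built-in mathtext (an assumption, not in the committed config):
# plt.rcParams["text.usetex"] = False

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], label="identity")
ax.legend()
fig.savefig("style-check.png", dpi=120)
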
52 changes: 52 additions & 0 deletions analysis/print_results.py
@@ -0,0 +1,52 @@
import argparse
import logging
from pathlib import Path

import pandas as pd
from huggingface_hub import snapshot_download

from analysis.plot_utils import get_scores


def get_args():
# fmt: off
parser = argparse.ArgumentParser(description="Get evaluation results")
parser.add_argument("--dataset", type=str, default="aya-rm-multilingual/eval-results", help="HuggingFace dataset that stores the eval results.")
parser.add_argument("--langs", nargs="*", required=False, type=str, help="If set, will only show the results for the particular language codes provided.")
parser.add_argument("--show_subsets", action="store_true", help="If set, will show subset results instead of per-category results.")
parser.add_argument("--force_download", action="store_true", help="If set, will redownload the dataset.")
# fmt: on
return parser.parse_args()


def main():
args = get_args()
dataset_dir = Path(snapshot_download(args.dataset, repo_type="dataset", force_download=args.force_download))
lang_folders = [d for d in dataset_dir.iterdir() if d.is_dir()]

if args.langs:
logging.info(f"Only showing detailed results for the ff languages: {','.join(args.langs)}")
for lang_dir in lang_folders:
if lang_dir.name in args.langs:
model_scores = get_scores(lang_dir)
df = pd.DataFrame(model_scores)
metadata_df = df[["model", "model_type", "score"]]
key = "subset_scores" if args.show_subsets else "category_scores"
scores_df = pd.DataFrame(df[key].tolist())
lang_scores_df = pd.concat([metadata_df, scores_df], axis=1).sort_values(by="score", ascending=False)
print(f"\n*** Results for {lang_dir.name} ***\n")
print(lang_scores_df.to_markdown(tablefmt="github", index=False))

else:
logging.info("Showing m-rewardbench scores for all languages")
lang_scores = {}
for lang_dir in lang_folders:
model_scores = get_scores(lang_dir)
lang_scores[lang_dir.name] = {score["model"]: score["score"] for score in model_scores}

lang_scores_df = pd.DataFrame(lang_scores)
print(lang_scores_df.to_markdown(tablefmt="github"))


if __name__ == "__main__":
main()
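
The records returned by get_scores are assumed to carry at least the keys print_results.py touches (model, model_type, score, category_scores, subset_scores). A self-contained sketch of how the detailed per-language table is assembled, with illustrative values:

import pandas as pd

# Hypothetical per-model records, mirroring the keys print_results.py
# relies on; the models, categories, and scores are made up.
model_scores = [
    {
        "model": "model-a",
        "model_type": "Sequence Classifier",
        "score": 0.81,
        "category_scores": {"Chat": 0.90, "Safety": 0.70},
        "subset_scores": {"alpacaeval-easy": 0.95},
    },
    {
        "model": "model-b",
        "model_type": "Generative RM",
        "score": 0.74,
        "category_scores": {"Chat": 0.80, "Safety": 0.68},
        "subset_scores": {"alpacaeval-easy": 0.88},
    },
]

# Same assembly as main(): metadata columns joined with the expanded
# score breakdown, sorted by overall score.
df = pd.DataFrame(model_scores)
metadata_df = df[["model", "model_type", "score"]]
scores_df = pd.DataFrame(df["category_scores"].tolist())
lang_scores_df = pd.concat([metadata_df, scores_df], axis=1).sort_values(by="score", ascending=False)
print(lang_scores_df.to_markdown(tablefmt="github", index=False))
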
