Commit

Update agreement (#25)
ljvmiranda921 authored Aug 20, 2024
1 parent 75c756e commit 89b076e
Showing 1 changed file with 29 additions and 17 deletions.
46 changes: 29 additions & 17 deletions analysis/get_human_eval.py
@@ -5,6 +5,7 @@
import pandas as pd
from pycm import ConfusionMatrix


logging.basicConfig(level=logging.INFO)


@@ -28,27 +29,38 @@ def main():
    annotations = annotations[["id", "human_preference", "notes"]]

    # Combine into a single dataframe and apply the random swaps
    # We swap the gold preferences because if they're all the same value,
    # it skews the chance correction in the IAA measures
    df = pd.merge(reference, annotations, on="id")
    df["gold_preference"] = df.apply(lambda row: "B" if row["swapped"] == 1 else "A", axis=1)
    df["llm_preference"] = df.apply(lambda row: "A" if row["scores"] == 1 else "B", axis=1)
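    # gold_preference presumably marks which position the preferred response ended up
    # in after the swap; llm_preference decodes the judge score (1 -> "A", otherwise "B").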
    df["human_preference"] = df.apply(
        lambda row: (
            # fmt: off
            "B" if row["human_preference"] == "A" and row["swapped"] == 1
            else "A" if row["human_preference"] == "B" and row["swapped"] == 1 else row["human_preference"]
            # fmt: on
        ),
        axis=1,
    )
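    # Human labels are flipped back the same way, presumably because swapped == 1 means
    # the two responses were shown in reverse order: "A" on a swapped row refers to the
    # original "B" and vice versa; unswapped rows keep their label as-is.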

    if args.dropna:
        df = df.dropna(subset=["human_preference"])
        logging.info(f"Dropped instances with no human annotations. No. of instances left: {len(df)}")

    # Compute accuracy
    acc_human_vs_gold_all = (df["human_preference"] == df["gold_preference"]).sum() / len(df)
    logging.info(f"Accuracy (human vs. gold): {acc_human_vs_gold_all * 100:.2f}")
    subset_df = df[df["scores"] != 0.5]  # get instances with LLM scores
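    # A score of 0.5 presumably encodes an LLM tie; restricting to committed preferences
    # keeps the human-vs-gold and human-vs-llm comparisons on the same set of rows.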
    acc_human_vs_gold_sub = (subset_df["human_preference"] == subset_df["gold_preference"]).sum() / len(subset_df)
    acc_human_vs_llm_sub = (subset_df["human_preference"] == subset_df["llm_preference"]).sum() / len(subset_df)
    logging.info(f"Accuracy (human vs. gold) subset: {acc_human_vs_gold_sub * 100:.2f}")
    logging.info(f"Accuracy (human vs. llm) subset: {acc_human_vs_llm_sub * 100:.2f}")

    # Save disagreements
    lang_code = df["language"].to_list()[0]
    disagree_human_vs_gold = df[df["human_preference"] != df["gold_preference"]]
    disagree_human_vs_llm = subset_df[subset_df["human_preference"] != subset_df["llm_preference"]]
    disagree_human_vs_gold.to_csv(f"{lang_code}-disagreement-human-vs-gold.csv", index=False)
    disagree_human_vs_llm.to_csv(f"{lang_code}-disagreement-human-vs-llm.csv", index=False)

    cm = ConfusionMatrix(
        actual_vector=df["gold_preference"].to_list(),
        predict_vector=df["human_preference"].to_list(),
    )
    print(
        f"*** Overall metrics ***\n",
        f"Accuracy: {cm.Overall_ACC}\n",
        f"F1-score: {cm.F1_Macro}\n",
        f"Per-class accuracy: {cm.ACC}\n",
        f"Cohen's Kappa: {cm.Kappa}\n",
        f"Krippendorff Alpha: {cm.Alpha}\n",
        f"Gwet's AC1: {cm.AC1}\n",
    )
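    # Note: Kappa, Krippendorff's Alpha, and Gwet's AC1 are chance-corrected agreement
    # measures, which is why the gold preferences were randomized via the swaps above;
    # a gold vector that is all one label would skew the chance agreement they correct for.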


if __name__ == "__main__":
