Commit

Update agreement (#25)
ljvmiranda921 authored Aug 20, 2024
1 parent 75c756e commit 89b076e
Showing 1 changed file with 29 additions and 17 deletions.
46 changes: 29 additions & 17 deletions analysis/get_human_eval.py
@@ -5,6 +5,7 @@
import pandas as pd
from pycm import ConfusionMatrix


logging.basicConfig(level=logging.INFO)


@@ -28,27 +29,38 @@ def main():
    annotations = annotations[["id", "human_preference", "notes"]]

    # Combine into a single dataframe and apply the random swaps
    # We swap the gold preferences because if they're all the same value,
    # it skews the chance correction in the IAA measures
    df = pd.merge(reference, annotations, on="id")
    df["gold_preference"] = df.apply(lambda row: "B" if row["swapped"] == 1 else "A", axis=1)
    df["llm_preference"] = df.apply(lambda row: "A" if row["scores"] == 1 else "B", axis=1)
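    # gold_preference presumably marks which position the preferred response ended up
    # in after the swap; llm_preference decodes the judge score (1 -> "A", otherwise "B").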
    df["human_preference"] = df.apply(
        lambda row: (
            # fmt: off
            "B" if row["human_preference"] == "A" and row["swapped"] == 1
            else "A" if row["human_preference"] == "B" and row["swapped"] == 1 else row["human_preference"]
            # fmt: on
        ),
        axis=1,
    )
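    # Human labels are flipped back the same way, presumably because swapped == 1 means
    # the two responses were shown in reverse order: "A" on a swapped row refers to the
    # original "B" and vice versa; unswapped rows keep their label as-is.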

    if args.dropna:
        df = df.dropna(subset=["human_preference"])
        logging.info(f"Dropped instances with no human annotations. No. of instances left: {len(df)}")

    # Compute accuracy
    acc_human_vs_gold_all = (df["human_preference"] == df["gold_preference"]).sum() / len(df)
    logging.info(f"Accuracy (human vs. gold): {acc_human_vs_gold_all * 100:.2f}")
    subset_df = df[df["scores"] != 0.5]  # get instances with LLM scores
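    # A score of 0.5 presumably encodes an LLM tie; restricting to committed preferences
    # keeps the human-vs-gold and human-vs-llm comparisons on the same set of rows.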
    acc_human_vs_gold_sub = (subset_df["human_preference"] == subset_df["gold_preference"]).sum() / len(subset_df)
    acc_human_vs_llm_sub = (subset_df["human_preference"] == subset_df["llm_preference"]).sum() / len(subset_df)
    logging.info(f"Accuracy (human vs. gold) subset: {acc_human_vs_gold_sub * 100:.2f}")
    logging.info(f"Accuracy (human vs. llm) subset: {acc_human_vs_llm_sub * 100:.2f}")

    # Save disagreements
    lang_code = df["language"].to_list()[0]
    disagree_human_vs_gold = df[df["human_preference"] != df["gold_preference"]]
    disagree_human_vs_llm = subset_df[subset_df["human_preference"] != subset_df["llm_preference"]]
    disagree_human_vs_gold.to_csv(f"{lang_code}-disagreement-human-vs-gold.csv", index=False)
    disagree_human_vs_llm.to_csv(f"{lang_code}-disagreement-human-vs-llm.csv", index=False)

    cm = ConfusionMatrix(
        actual_vector=df["gold_preference"].to_list(),
        predict_vector=df["human_preference"].to_list(),
    )
    print(
        f"*** Overall metrics ***\n",
        f"Accuracy: {cm.Overall_ACC}\n",
        f"F1-score: {cm.F1_Macro}\n",
        f"Per-class accuracy: {cm.ACC}\n",
        f"Cohen's Kappa: {cm.Kappa}\n",
        f"Krippendorff Alpha: {cm.Alpha}\n",
        f"Gwet's AC1: {cm.AC1}\n",
    )
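    # Note: Kappa, Krippendorff's Alpha, and Gwet's AC1 are chance-corrected agreement
    # measures, which is why the gold preferences were randomized via the swaps above;
    # a gold vector that is all one label would skew the chance agreement they correct for.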


if __name__ == "__main__":
