Improving tag filtering
Additional tag filtering now better detects unexpected levels of "incorrect" tags.
twitterbirdwatch committed Sep 1, 2023
1 parent 0ef11d9 commit 3ec3129
Showing 4 changed files with 36 additions and 50 deletions.
6 changes: 1 addition & 5 deletions sourcecode/scoring/constants.py
@@ -234,13 +234,9 @@ def rater_factor_key(i):

 incorrectFilterColumns = [
   "notHelpfulIncorrect_interval",
-  "cnt_interval",
+  "p_incorrect_user_interval",
   "num_voters_interval",
   "tf_idf_incorrect_interval",
-  "notHelpfulIncorrect_same",
-  "cnt_same",
-  "num_voters_same",
-  "tf_idf_incorrect_same",
 ]
 
 misleadingTags = [
67 changes: 30 additions & 37 deletions sourcecode/scoring/incorrect_filter.py
@@ -6,49 +6,40 @@
 import pandas as pd
 
 
-def _get_user_incorrect_ratio(ratings: pd.DataFrame) -> pd.DataFrame:
+def _get_user_incorrect_ratio(nhTagRatings: pd.DataFrame) -> pd.DataFrame:
   """Computes empirical p(incorrect | not helpful tags assigned) per rater.
 
   Args:
-    ratings: initial input ratings DF containing all ratings
+    nhTagRatings: DF containing all ratings with some NH tag
 
   Returns:
     pd.DataFrame containing one row per user who assigned not helpful tags with their empirical propensity
     to assign "incorrect" tag
   """
 
-  notHelpfulTaggedRatings = ratings.loc[ratings[c.notHelpfulTagsTSVOrder].sum(axis=1) > 0]
   user_incorrect = (
-    notHelpfulTaggedRatings[[c.raterParticipantIdKey, "notHelpfulIncorrect"]]
+    nhTagRatings[[c.raterParticipantIdKey, "notHelpfulIncorrect"]]
     .groupby(c.raterParticipantIdKey)
     .agg("sum")
   )
   user_nh_rating_count = (
-    notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey]]
+    nhTagRatings[[c.raterParticipantIdKey, c.noteIdKey]]
     .groupby(c.raterParticipantIdKey)
     .agg("count")
   )
   user_nh_rating_count.rename(columns={c.noteIdKey: "cnt"}, inplace=True)
   user_totals = user_incorrect.merge(user_nh_rating_count, on=c.raterParticipantIdKey)
 
-  note_totals = (
-    notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey]]
-    .groupby(c.noteIdKey)
-    .agg("count")
-    .reset_index()
-  )
-  note_totals.rename(columns={c.raterParticipantIdKey: "num_voters"}, inplace=True)
-  return user_totals, note_totals
+  return user_totals
 
 
 def _get_incorrect_tfidf_ratio(
-  augmented_ratings: pd.DataFrame, note_nh_count: pd.DataFrame, user_filter: bool, suffix: str
+  augmented_ratings: pd.DataFrame, user_filter: bool, suffix: str
 ) -> pd.DataFrame:
   """Computes empirical p(incorrect | note) / p(incorrect | raters over all notes) subject to rater-note inclusion function.
 
   Args:
     augmented_ratings: ratings DF with note and rater factors and user incorrect TF
-    note_nh_count: DF with total number of NH votes per note
     filter: inclusion criteria for "incorrect" voters
     suffix: suffix for incorrect and count column names for this filter
@@ -59,6 +50,14 @@ def _get_incorrect_tfidf_ratio(
 
   ratings_w_user_totals = augmented_ratings[user_filter]
 
+  note_nh_count = (
+    ratings_w_user_totals[[c.raterParticipantIdKey, c.noteIdKey]]
+    .groupby(c.noteIdKey)
+    .agg("count")
+    .reset_index()
+  )
+  note_nh_count.rename(columns={c.raterParticipantIdKey: "num_voters"}, inplace=True)
+
   columns_to_attempt_to_drop = [
     c.internalRaterFactor1Key,
     c.internalNoteFactor1Key,
@@ -67,15 +66,17 @@ def _get_incorrect_tfidf_ratio(
   columns_to_drop = ratings_w_user_totals.columns.intersection(columns_to_attempt_to_drop)
   ratings_w_user_totals.drop(columns_to_drop, inplace=True, axis=1)
 
+  ratings_w_user_totals["p_incorrect_user"] = (
+    ratings_w_user_totals["notHelpfulIncorrect_total"] / ratings_w_user_totals["cnt"]
+  )
+
   rating_aggs = ratings_w_user_totals.groupby(c.noteIdKey).agg("sum").reset_index()
   rating_aggs_w_cnt = rating_aggs.merge(note_nh_count, on=c.noteIdKey)
 
-  rating_aggs_w_cnt["tf_idf_incorrect"] = (
-    rating_aggs_w_cnt["notHelpfulIncorrect"] / rating_aggs_w_cnt["num_voters"]
-  ) / np.log(
-    1 + (rating_aggs_w_cnt["notHelpfulIncorrect_total"] / rating_aggs_w_cnt["cnt"])
+  rating_aggs_w_cnt["tf_idf_incorrect"] = (rating_aggs_w_cnt["notHelpfulIncorrect"]) / np.log(
+    1 + (rating_aggs_w_cnt["p_incorrect_user"])
   )  # p(incorrect over all rater ratings)
-  rating_aggs_w_cnt.drop("notHelpfulIncorrect_total", inplace=True, axis=1)
+  rating_aggs_w_cnt.drop(["notHelpfulIncorrect_total", "cnt"], inplace=True, axis=1)
   rating_aggs_w_cnt.columns = [c.noteIdKey] + [
     f"{col}{suffix}" for col in rating_aggs_w_cnt.columns[1:]
   ]
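For intuition (not part of the commit): after the per-note groupby-sum above, notHelpfulIncorrect is the number of "Incorrect" tags from the included raters and p_incorrect_user is the sum of those raters' individual incorrect propensities, so the revised aggregate works out as in this sketch with made-up numbers.

import numpy as np

# Hypothetical note, for illustration only: 3 in-interval raters, 2 "Incorrect" tags,
# and the raters' per-rater propensities (notHelpfulIncorrect_total / cnt) sum to 1.1.
notHelpfulIncorrect = 2
p_incorrect_user = 1.1
tf_idf_incorrect = notHelpfulIncorrect / np.log(1 + p_incorrect_user)
print(round(tf_idf_incorrect, 2))  # ~2.7, later compared against weightedTotalVotes

The numerator is no longer normalized by num_voters, which presumably is why the weightedTotalVotes default rises from 1.0 to 2.5 elsewhere in this commit.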
@@ -97,12 +98,14 @@ def get_incorrect_aggregates(
     aggregates for the Not-Helpful tags, including raw totals, totals adjusted based on the
     distance between the rater and the note and ratios based on the adjusted weight totals.
   """
+  # consider only ratings with some NH tag
+  notHelpfulTaggedRatings = ratings.loc[ratings[c.notHelpfulTagsTSVOrder].sum(axis=1) > 0]
 
   # get per user incorrect term frequency
-  user_totals, note_totals = _get_user_incorrect_ratio(ratings)
+  user_totals = _get_user_incorrect_ratio(notHelpfulTaggedRatings)
   # add user and note factors
   ratings_w_user_totals = (
-    ratings[[c.raterParticipantIdKey, c.noteIdKey, "notHelpfulIncorrect"]]
+    notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey, "notHelpfulIncorrect"]]
     .merge(user_totals, on=c.raterParticipantIdKey, suffixes=(None, "_total"))
     .merge(noteParams[[c.noteIdKey, c.internalNoteFactor1Key]], on=c.noteIdKey)
     .merge(
@@ -112,24 +115,14 @@ def get_incorrect_aggregates(
 
   interval_filter = (
     np.abs(
-      ratings_w_user_totals[c.internalRaterFactor1Key]
-      - ratings_w_user_totals[c.internalNoteFactor1Key]
+      ratings_w_user_totals[c.internalRaterFactor1Key].clip(-0.4, 0.4)
+      - ratings_w_user_totals[c.internalNoteFactor1Key].clip(-0.4, 0.4)
     )
     < c.intervalHalfWidth
   )
-  interval_scores = _get_incorrect_tfidf_ratio(
-    ratings_w_user_totals, note_totals, interval_filter, "_interval"
-  )
-  same_factor_filter = (
-    (ratings_w_user_totals[c.internalRaterFactor1Key] > 0)
-    & (ratings_w_user_totals[c.internalNoteFactor1Key] > 0)
-  ) | (
-    (ratings_w_user_totals[c.internalRaterFactor1Key] < 0)
-    & (ratings_w_user_totals[c.internalNoteFactor1Key] < 0)
-  )
-  same_scores = _get_incorrect_tfidf_ratio(
-    ratings_w_user_totals, note_totals, same_factor_filter, "_same"
+
+  incorrectAggregates = _get_incorrect_tfidf_ratio(
+    ratings_w_user_totals, interval_filter, "_interval"
   )
 
-  incorrectAggregates = interval_scores.merge(same_scores, on=c.noteIdKey)
   return incorrectAggregates
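Putting the file together (a standalone sketch, not code from the repository): restrict to NH-tagged ratings once, compute each rater's incorrect propensity, keep only raters whose clipped factor lies near the note's factor, and aggregate per note. Column names are simplified here (raterId, noteId, raterFactor, noteFactor instead of the c.* constants) and the 0.3 half-width is an assumed stand-in for c.intervalHalfWidth.

import numpy as np
import pandas as pd

# Toy NH-tagged ratings: one row per (rater, note) pair that carried some Not-Helpful tag.
ratings = pd.DataFrame({
  "raterId": ["a", "a", "b", "b", "c"],
  "noteId": [1, 2, 1, 2, 1],
  "notHelpfulIncorrect": [1, 0, 1, 1, 0],
  "raterFactor": [0.1, 0.1, -0.2, -0.2, 0.9],
  "noteFactor": [0.0, 0.5, 0.0, 0.5, 0.0],
})

# Per-rater incorrect propensity: p_incorrect_user = incorrect tags / NH-tagged ratings.
user = ratings.groupby("raterId").agg(
  notHelpfulIncorrect_total=("notHelpfulIncorrect", "sum"), cnt=("noteId", "count")
)
user["p_incorrect_user"] = user["notHelpfulIncorrect_total"] / user["cnt"]
ratings = ratings.merge(user, on="raterId")

# Interval filter on clipped factors (clip bounds mirror the diff; 0.3 is assumed).
in_interval = (
  np.abs(ratings["raterFactor"].clip(-0.4, 0.4) - ratings["noteFactor"].clip(-0.4, 0.4)) < 0.3
)

# Per-note aggregate over the included raters only.
agg = ratings[in_interval].groupby("noteId").agg(
  notHelpfulIncorrect=("notHelpfulIncorrect", "sum"),
  p_incorrect_user=("p_incorrect_user", "sum"),
  num_voters=("raterId", "count"),
)
agg["tf_idf_incorrect"] = agg["notHelpfulIncorrect"] / np.log(1 + agg["p_incorrect_user"])
print(agg)  # only note 1 survives the interval filter in this toy example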
2 changes: 1 addition & 1 deletion sourcecode/scoring/mf_base_scorer.py
@@ -35,7 +35,7 @@ def __init__(
     crhThresholdLCBIntercept: float = 0.35,
     crhSuperThreshold: float = 0.5,
     inertiaDelta: float = 0.01,
-    weightedTotalVotes: float = 1.0,
+    weightedTotalVotes: float = 2.5,
   ):
     """Configure MatrixFactorizationScorer object.
11 changes: 4 additions & 7 deletions sourcecode/scoring/scoring_rules.py
@@ -279,7 +279,7 @@ def __init__(
     ruleID: RuleID,
     dependencies: Set[RuleID],
     status: str,
-    weightedTotalVotes: float = 1.0,
+    weightedTotalVotes: float = 2.5,
   ):
     """Filter CRH notes for outliers with high levels of incorrect tag from similar factor raters.
@@ -303,13 +303,10 @@ def score_notes(
     crhStats = noteStats.merge(crhNotes, on=c.noteIdKey, how="inner")
 
     # Identify impacted notes.
-    crhStats["score"] = crhStats["tf_idf_incorrect_interval"] + crhStats["tf_idf_incorrect_same"]
-
     noteStatusUpdates = crhStats.loc[
-      ((crhStats["notHelpfulIncorrect_interval"] > 1) | (crhStats["notHelpfulIncorrect_same"] > 1))
-      & (crhStats["num_voters_interval"] > 2)
-      & (crhStats["num_voters_same"] > 2)
-      & (crhStats["score"] >= self.weightedTotalVotes)
+      (crhStats["notHelpfulIncorrect_interval"] >= 2)
+      & (crhStats["num_voters_interval"] >= 3)
+      & (crhStats["tf_idf_incorrect_interval"] >= self.weightedTotalVotes)
     ][[c.noteIdKey]]
 
     pd.testing.assert_frame_equal(noteStatusUpdates, noteStatusUpdates.drop_duplicates())
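For reference (an illustrative sketch, not repository code): the rewritten rule flags a CRH note only when all three interval-based conditions hold. Column names follow the diff; the noteId column and the numbers below are made up.

import pandas as pd

# Hypothetical CRH note stats; values are invented for illustration.
crhStats = pd.DataFrame({
  "noteId": [101, 102],
  "notHelpfulIncorrect_interval": [3, 1],
  "num_voters_interval": [5, 4],
  "tf_idf_incorrect_interval": [2.8, 3.0],
})
weightedTotalVotes = 2.5  # new default in this commit (was 1.0)

flagged = crhStats.loc[
  (crhStats["notHelpfulIncorrect_interval"] >= 2)
  & (crhStats["num_voters_interval"] >= 3)
  & (crhStats["tf_idf_incorrect_interval"] >= weightedTotalVotes)
][["noteId"]]
print(flagged)  # only note 101; note 102 fails the >= 2 incorrect-tag floor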
