diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py
index 6c7e91da..0748cb0c 100644
--- a/sourcecode/scoring/constants.py
+++ b/sourcecode/scoring/constants.py
@@ -234,13 +234,9 @@ def rater_factor_key(i):
 
 incorrectFilterColumns = [
   "notHelpfulIncorrect_interval",
-  "cnt_interval",
+  "p_incorrect_user_interval",
   "num_voters_interval",
   "tf_idf_incorrect_interval",
-  "notHelpfulIncorrect_same",
-  "cnt_same",
-  "num_voters_same",
-  "tf_idf_incorrect_same",
 ]
 
 misleadingTags = [
diff --git a/sourcecode/scoring/incorrect_filter.py b/sourcecode/scoring/incorrect_filter.py
index 760ab098..032f3e18 100644
--- a/sourcecode/scoring/incorrect_filter.py
+++ b/sourcecode/scoring/incorrect_filter.py
@@ -6,49 +6,40 @@
 import pandas as pd
 
 
-def _get_user_incorrect_ratio(ratings: pd.DataFrame) -> pd.DataFrame:
+def _get_user_incorrect_ratio(nhTagRatings: pd.DataFrame) -> pd.DataFrame:
   """Computes empirical p(incorrect | not helpful tags assigned) per rater.
 
   Args:
-    ratings: initial input ratings DF containing all ratings
+    nhTagRatings: DF containing all ratings with some NH tag
 
   Returns:
     pd.DataFrame containing one row per user who assigned not helpful tags
      with their empirical propensity to assign "incorrect" tag
   """
-  notHelpfulTaggedRatings = ratings.loc[ratings[c.notHelpfulTagsTSVOrder].sum(axis=1) > 0]
   user_incorrect = (
-    notHelpfulTaggedRatings[[c.raterParticipantIdKey, "notHelpfulIncorrect"]]
+    nhTagRatings[[c.raterParticipantIdKey, "notHelpfulIncorrect"]]
     .groupby(c.raterParticipantIdKey)
     .agg("sum")
   )
   user_nh_rating_count = (
-    notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey]]
+    nhTagRatings[[c.raterParticipantIdKey, c.noteIdKey]]
     .groupby(c.raterParticipantIdKey)
     .agg("count")
   )
   user_nh_rating_count.rename(columns={c.noteIdKey: "cnt"}, inplace=True)
   user_totals = user_incorrect.merge(user_nh_rating_count, on=c.raterParticipantIdKey)
-  note_totals = (
-    notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey]]
-    .groupby(c.noteIdKey)
-    .agg("count")
-    .reset_index()
-  )
-  note_totals.rename(columns={c.raterParticipantIdKey: "num_voters"}, inplace=True)
-  return user_totals, note_totals
+  return user_totals
 
 
 def _get_incorrect_tfidf_ratio(
-  augmented_ratings: pd.DataFrame, note_nh_count: pd.DataFrame, user_filter: bool, suffix: str
+  augmented_ratings: pd.DataFrame, user_filter: bool, suffix: str
 ) -> pd.DataFrame:
   """Computes empirical p(incorrect | note) / p(incorrect | raters over all notes) subject to
   rater-note inclusion function.
 
   Args:
     augmented_ratings: ratings DF with note and rater factors and user incorrect TF
-    note_nh_count: DF with total number of NH votes per note
     filter: inclusion criteria for "incorrect" voters
     suffix: suffix for incorrect and count column names for this filter
@@ -59,6 +50,14 @@ def _get_incorrect_tfidf_ratio(
 
   ratings_w_user_totals = augmented_ratings[user_filter]
 
+  note_nh_count = (
+    ratings_w_user_totals[[c.raterParticipantIdKey, c.noteIdKey]]
+    .groupby(c.noteIdKey)
+    .agg("count")
+    .reset_index()
+  )
+  note_nh_count.rename(columns={c.raterParticipantIdKey: "num_voters"}, inplace=True)
+
   columns_to_attempt_to_drop = [
     c.internalRaterFactor1Key,
     c.internalNoteFactor1Key,
@@ -67,15 +66,17 @@ def _get_incorrect_tfidf_ratio(
   columns_to_drop = ratings_w_user_totals.columns.intersection(columns_to_attempt_to_drop)
   ratings_w_user_totals.drop(columns_to_drop, inplace=True, axis=1)
 
+  ratings_w_user_totals["p_incorrect_user"] = (
+    ratings_w_user_totals["notHelpfulIncorrect_total"] / ratings_w_user_totals["cnt"]
+  )
+
   rating_aggs = ratings_w_user_totals.groupby(c.noteIdKey).agg("sum").reset_index()
   rating_aggs_w_cnt = rating_aggs.merge(note_nh_count, on=c.noteIdKey)
 
-  rating_aggs_w_cnt["tf_idf_incorrect"] = (
-    rating_aggs_w_cnt["notHelpfulIncorrect"] / rating_aggs_w_cnt["num_voters"]
-  ) / np.log(
-    1 + (rating_aggs_w_cnt["notHelpfulIncorrect_total"] / rating_aggs_w_cnt["cnt"])
+  rating_aggs_w_cnt["tf_idf_incorrect"] = (rating_aggs_w_cnt["notHelpfulIncorrect"]) / np.log(
+    1 + (rating_aggs_w_cnt["p_incorrect_user"])
   )  # p(incorrect over all rater ratings)
-  rating_aggs_w_cnt.drop("notHelpfulIncorrect_total", inplace=True, axis=1)
+  rating_aggs_w_cnt.drop(["notHelpfulIncorrect_total", "cnt"], inplace=True, axis=1)
   rating_aggs_w_cnt.columns = [c.noteIdKey] + [
     f"{col}{suffix}" for col in rating_aggs_w_cnt.columns[1:]
   ]
@@ -97,12 +98,14 @@ def get_incorrect_aggregates(
     aggregates for the Not-Helpful tags, including raw totals, totals adjusted based on the
     distance between the rater and the note and ratios based on the adjusted weight totals.
   """
+  # consider only ratings with some NH tag
+  notHelpfulTaggedRatings = ratings.loc[ratings[c.notHelpfulTagsTSVOrder].sum(axis=1) > 0]
   # get per user incorrect term frequency
-  user_totals, note_totals = _get_user_incorrect_ratio(ratings)
+  user_totals = _get_user_incorrect_ratio(notHelpfulTaggedRatings)
 
   # add user and note factors
   ratings_w_user_totals = (
-    ratings[[c.raterParticipantIdKey, c.noteIdKey, "notHelpfulIncorrect"]]
+    notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey, "notHelpfulIncorrect"]]
     .merge(user_totals, on=c.raterParticipantIdKey, suffixes=(None, "_total"))
     .merge(noteParams[[c.noteIdKey, c.internalNoteFactor1Key]], on=c.noteIdKey)
     .merge(
@@ -112,24 +115,14 @@ def get_incorrect_aggregates(
 
   interval_filter = (
     np.abs(
-      ratings_w_user_totals[c.internalRaterFactor1Key]
-      - ratings_w_user_totals[c.internalNoteFactor1Key]
+      ratings_w_user_totals[c.internalRaterFactor1Key].clip(-0.4, 0.4)
+      - ratings_w_user_totals[c.internalNoteFactor1Key].clip(-0.4, 0.4)
     )
     < c.intervalHalfWidth
   )
-  interval_scores = _get_incorrect_tfidf_ratio(
-    ratings_w_user_totals, note_totals, interval_filter, "_interval"
-  )
-  same_factor_filter = (
-    (ratings_w_user_totals[c.internalRaterFactor1Key] > 0)
-    & (ratings_w_user_totals[c.internalNoteFactor1Key] > 0)
-  ) | (
-    (ratings_w_user_totals[c.internalRaterFactor1Key] < 0)
-    & (ratings_w_user_totals[c.internalNoteFactor1Key] < 0)
-  )
-  same_scores = _get_incorrect_tfidf_ratio(
-    ratings_w_user_totals, note_totals, same_factor_filter, "_same"
+
+  incorrectAggregates = _get_incorrect_tfidf_ratio(
+    ratings_w_user_totals, interval_filter, "_interval"
   )
-  incorrectAggregates = interval_scores.merge(same_scores, on=c.noteIdKey)
 
   return incorrectAggregates
diff --git a/sourcecode/scoring/mf_base_scorer.py b/sourcecode/scoring/mf_base_scorer.py
index da776174..b035cf63 100644
--- a/sourcecode/scoring/mf_base_scorer.py
+++ b/sourcecode/scoring/mf_base_scorer.py
@@ -35,7 +35,7 @@ def __init__(
     crhThresholdLCBIntercept: float = 0.35,
     crhSuperThreshold: float = 0.5,
     inertiaDelta: float = 0.01,
-    weightedTotalVotes: float = 1.0,
+    weightedTotalVotes: float = 2.5,
   ):
     """Configure MatrixFactorizationScorer object.
 
diff --git a/sourcecode/scoring/scoring_rules.py b/sourcecode/scoring/scoring_rules.py
index ea9bda94..33c785e1 100644
--- a/sourcecode/scoring/scoring_rules.py
+++ b/sourcecode/scoring/scoring_rules.py
@@ -279,7 +279,7 @@ def __init__(
     ruleID: RuleID,
     dependencies: Set[RuleID],
     status: str,
-    weightedTotalVotes: float = 1.0,
+    weightedTotalVotes: float = 2.5,
   ):
     """Filter CRH notes for outliers with high levels of incorrect tag from similar factor raters.
 
@@ -303,13 +303,10 @@ def score_notes(
     crhStats = noteStats.merge(crhNotes, on=c.noteIdKey, how="inner")
 
     # Identify impacted notes.
-    crhStats["score"] = crhStats["tf_idf_incorrect_interval"] + crhStats["tf_idf_incorrect_same"]
     noteStatusUpdates = crhStats.loc[
-      ((crhStats["notHelpfulIncorrect_interval"] > 1) | (crhStats["notHelpfulIncorrect_same"] > 1))
-      & (crhStats["num_voters_interval"] > 2)
-      & (crhStats["num_voters_same"] > 2)
-      & (crhStats["score"] >= self.weightedTotalVotes)
+      (crhStats["notHelpfulIncorrect_interval"] >= 2)
+      & (crhStats["num_voters_interval"] >= 3)
+      & (crhStats["tf_idf_incorrect_interval"] >= self.weightedTotalVotes)
     ][[c.noteIdKey]]
 
     pd.testing.assert_frame_equal(noteStatusUpdates, noteStatusUpdates.drop_duplicates())