From 696215a26f1957624bb4f5c3797c43823965f557 Mon Sep 17 00:00:00 2001 From: Brad Miller Date: Fri, 20 Oct 2023 20:47:34 -0700 Subject: [PATCH] augment incorrect filtering to improve correctness --- sourcecode/scoring/constants.py | 12 ++++-- sourcecode/scoring/incorrect_filter.py | 55 ++++++++++++++++++++------ sourcecode/scoring/mf_base_scorer.py | 5 --- sourcecode/scoring/note_ratings.py | 30 ++++++++++++-- sourcecode/scoring/scoring_rules.py | 28 ++++++++++--- 5 files changed, 101 insertions(+), 29 deletions(-) diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py index 42a46a07..331d81bf 100644 --- a/sourcecode/scoring/constants.py +++ b/sourcecode/scoring/constants.py @@ -200,11 +200,13 @@ def rater_factor_key(i): notHelpfulArgumentativeOrBiasedTagKey = "notHelpfulArgumentativeOrBiased" notHelpfulHardToUnderstandKey = "notHelpfulHardToUnderstand" notHelpfulNoteNotNeededKey = "notHelpfulNoteNotNeeded" +notHelpfulSourcesMissingOrUnreliableTagKey = "notHelpfulSourcesMissingOrUnreliable" +notHelpfulIrrelevantSourcesTagKey = "notHelpfulIrrelevantSources" notHelpfulTagsAndTieBreakOrder = [ (0, notHelpfulOtherTagKey), (8, notHelpfulIncorrectTagKey), - (2, "notHelpfulSourcesMissingOrUnreliable"), + (2, notHelpfulSourcesMissingOrUnreliableTagKey), (4, "notHelpfulOpinionSpeculationOrBias"), (5, "notHelpfulMissingKeyPoints"), (12, "notHelpfulOutdated"), @@ -212,7 +214,7 @@ def rater_factor_key(i): (7, notHelpfulArgumentativeOrBiasedTagKey), (9, "notHelpfulOffTopic"), (11, notHelpfulSpamHarassmentOrAbuseTagKey), - (1, "notHelpfulIrrelevantSources"), + (1, notHelpfulIrrelevantSourcesTagKey), (3, "notHelpfulOpinionSpeculation"), (6, notHelpfulNoteNotNeededKey), ] @@ -235,12 +237,16 @@ def rater_factor_key(i): ] ratingWeightKey = "ratingWeight" -incorrectFilterColumns = [ +wideIncorrectFilterSuffix = "_wide" +_incorrectFilterColumns = [ "notHelpfulIncorrect_interval", "p_incorrect_user_interval", "num_voters_interval", "tf_idf_incorrect_interval", ] 
+incorrectFilterColumns = _incorrectFilterColumns + [ + f"{col}{wideIncorrectFilterSuffix}" for col in _incorrectFilterColumns +] misleadingTags = [ "misleadingOther", diff --git a/sourcecode/scoring/incorrect_filter.py b/sourcecode/scoring/incorrect_filter.py index 032f3e18..8c715545 100644 --- a/sourcecode/scoring/incorrect_filter.py +++ b/sourcecode/scoring/incorrect_filter.py @@ -1,5 +1,7 @@ """Utilites for tag based scoring logic.""" +from typing import List, Optional + from . import constants as c import numpy as np @@ -18,7 +20,7 @@ def _get_user_incorrect_ratio(nhTagRatings: pd.DataFrame) -> pd.DataFrame: """ user_incorrect = ( - nhTagRatings[[c.raterParticipantIdKey, "notHelpfulIncorrect"]] + nhTagRatings[[c.raterParticipantIdKey, c.notHelpfulIncorrectTagKey]] .groupby(c.raterParticipantIdKey) .agg("sum") ) @@ -34,7 +36,7 @@ def _get_user_incorrect_ratio(nhTagRatings: pd.DataFrame) -> pd.DataFrame: def _get_incorrect_tfidf_ratio( - augmented_ratings: pd.DataFrame, user_filter: bool, suffix: str + augmented_ratings: pd.DataFrame, user_filter: Optional[bool], suffix: str ) -> pd.DataFrame: """Computes empirical p(incorrect | note) / p(incorrect | raters over all notes) subject to rater-note inclusion function. 
@@ -47,8 +49,10 @@ def _get_incorrect_tfidf_ratio( pd.DataFrame with one row for each note, with computed sum(tf_idf_incorrect) score for raters included in filter """ - - ratings_w_user_totals = augmented_ratings[user_filter] + if user_filter is not None: + ratings_w_user_totals = augmented_ratings[user_filter] + else: + ratings_w_user_totals = augmented_ratings note_nh_count = ( ratings_w_user_totals[[c.raterParticipantIdKey, c.noteIdKey]] @@ -73,7 +77,7 @@ def _get_incorrect_tfidf_ratio( rating_aggs = ratings_w_user_totals.groupby(c.noteIdKey).agg("sum").reset_index() rating_aggs_w_cnt = rating_aggs.merge(note_nh_count, on=c.noteIdKey) - rating_aggs_w_cnt["tf_idf_incorrect"] = (rating_aggs_w_cnt["notHelpfulIncorrect"]) / np.log( + rating_aggs_w_cnt["tf_idf_incorrect"] = (rating_aggs_w_cnt[c.notHelpfulIncorrectTagKey]) / np.log( 1 + (rating_aggs_w_cnt["p_incorrect_user"]) ) # p(incorrect over all rater ratings) rating_aggs_w_cnt.drop(["notHelpfulIncorrect_total", "cnt"], inplace=True, axis=1) @@ -84,7 +88,12 @@ def _get_incorrect_tfidf_ratio( def get_incorrect_aggregates( - ratings: pd.DataFrame, noteParams: pd.DataFrame, raterParams: pd.DataFrame + ratingsOrig: pd.DataFrame, + noteParams: pd.DataFrame, + raterParams: pd.DataFrame, + applyFilter: bool = True, + extraCols: List[str] = [], + colSuffix: str = "", ) -> pd.DataFrame: """Computes non-helpful tag aggregates for each note. @@ -92,12 +101,25 @@ def get_incorrect_aggregates( ratings: initial input ratings DF containing all ratings noteParams: MF results for notes raterParams: MF results for raters + applyFilter: bool indicating whether to filter included ratings based on factor + extraCols: list of tags to include along with notHelpfulIncorrect + colSuffix: str which will be added to the end of each column other than noteId Returns: pd.DataFrame containing one row per note that was scored during MF. 
Columns correspond to aggregates for the Not-Helpful tags, including raw totals, totals adjusted based on the distance between the rater and the note and ratios based on the adjusted weight totals. """ + # augment notHelpfulIncorrect with any additional columns + ratings = ratingsOrig.copy() + if extraCols: + for column in extraCols: + assert column is not c.notHelpfulIncorrectTagKey + ratings[c.notHelpfulIncorrectTagKey] += ratings[column] + ratings[c.notHelpfulIncorrectTagKey] = ( + ratings[c.notHelpfulIncorrectTagKey].clip(0, 1).astype(np.int64) + ) + # consider only ratings with some NH tag notHelpfulTaggedRatings = ratings.loc[ratings[c.notHelpfulTagsTSVOrder].sum(axis=1) > 0] @@ -105,7 +127,7 @@ def get_incorrect_aggregates( user_totals = _get_user_incorrect_ratio(notHelpfulTaggedRatings) # add user and note factors ratings_w_user_totals = ( - notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey, "notHelpfulIncorrect"]] + notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey, c.notHelpfulIncorrectTagKey]] .merge(user_totals, on=c.raterParticipantIdKey, suffixes=(None, "_total")) .merge(noteParams[[c.noteIdKey, c.internalNoteFactor1Key]], on=c.noteIdKey) .merge( @@ -113,16 +135,23 @@ def get_incorrect_aggregates( ) ) - interval_filter = ( - np.abs( - ratings_w_user_totals[c.internalRaterFactor1Key].clip(-0.4, 0.4) - - ratings_w_user_totals[c.internalNoteFactor1Key].clip(-0.4, 0.4) + interval_filter = None + if applyFilter: + interval_filter = ( + np.abs( + ratings_w_user_totals[c.internalRaterFactor1Key].clip(-0.4, 0.4) + - ratings_w_user_totals[c.internalNoteFactor1Key].clip(-0.4, 0.4) + ) + < c.intervalHalfWidth ) - < c.intervalHalfWidth - ) incorrectAggregates = _get_incorrect_tfidf_ratio( ratings_w_user_totals, interval_filter, "_interval" ) + # apply column suffix + columns = incorrectAggregates.columns + cols = [f"{col}{colSuffix}" if col is not c.noteIdKey else col for col in columns] + incorrectAggregates.columns = cols + return 
incorrectAggregates diff --git a/sourcecode/scoring/mf_base_scorer.py b/sourcecode/scoring/mf_base_scorer.py index f843df0a..a7bc988d 100644 --- a/sourcecode/scoring/mf_base_scorer.py +++ b/sourcecode/scoring/mf_base_scorer.py @@ -35,7 +35,6 @@ def __init__( crhThresholdLCBIntercept: float = 0.35, crhSuperThreshold: float = 0.5, inertiaDelta: float = 0.01, - weightedTotalVotes: float = 2.5, useStableInitialization: bool = True, ): """Configure MatrixFactorizationScorer object. @@ -70,7 +69,6 @@ def __init__( repeated reason tags in not-helpful ratings to achieve CRH status. inertiaDelta: Minimum amount which a note that has achieve CRH status must drop below the applicable threshold to lose CRH status. - weightedTotalVotes: Minimum number of weighted incorrect votes required to lose CRH status. useStableInitialization: whether to use a specific modeling group of users to stably initialize """ super().__init__(seed) @@ -89,7 +87,6 @@ def __init__( self._crhThresholdLCBIntercept = crhThresholdLCBIntercept self._crhSuperThreshold = crhSuperThreshold self._inertiaDelta = inertiaDelta - self._weightedTotalVotes = weightedTotalVotes self._modelingGroupToInitializeForStability = 13 if useStableInitialization else None self._mfRanker = MatrixFactorization() @@ -305,7 +302,6 @@ def _score_notes_and_users( crhThresholdLCBIntercept=self._crhThresholdLCBIntercept, crhSuperThreshold=self._crhSuperThreshold, inertiaDelta=self._inertiaDelta, - weightedTotalVotes=self._weightedTotalVotes, ) # Determine "valid" ratings @@ -378,7 +374,6 @@ def _score_notes_and_users( crhThresholdLCBIntercept=self._crhThresholdLCBIntercept, crhSuperThreshold=self._crhSuperThreshold, inertiaDelta=self._inertiaDelta, - weightedTotalVotes=self._weightedTotalVotes, finalRound=True, ) # Takes raterParams from most recent MF run, but use the pre-computed diff --git a/sourcecode/scoring/note_ratings.py b/sourcecode/scoring/note_ratings.py index b602602d..ae42b4fd 100644 --- 
a/sourcecode/scoring/note_ratings.py +++ b/sourcecode/scoring/note_ratings.py @@ -379,7 +379,6 @@ def compute_scored_notes( crhThresholdLCBIntercept: float, crhSuperThreshold: float, inertiaDelta: float, - weightedTotalVotes: float, finalRound: bool = False, # TODO: We might want to consider inputing only the series here, instead of the whole callable is_crh_function: Callable[..., pd.Series] = is_crh, @@ -412,7 +411,6 @@ def compute_scored_notes( repeated reason tags in not-helpful ratings to achieve CRH status. inertiaDelta: Minimum amount which a note that has achieve CRH status must drop below the applicable threshold to lose CRH status. - weightedTotalVotes: Minimum number of weighted incorrect votes required to lose CRH status. finalRound: If true, enable additional status assignment logic which is only applied when determining final status. Given that these mechanisms add complexity we don't apply them in earlier rounds. @@ -485,6 +483,15 @@ def compute_scored_notes( ratings, noteParams, raterParams ) noteStats = noteStats.merge(incorrectAggregates, on=c.noteIdKey, how="outer") + incorrectAggregatesWide = incorrect_filter.get_incorrect_aggregates( + ratings, + noteParams, + raterParams, + applyFilter=False, + extraCols=[c.notHelpfulSourcesMissingOrUnreliableTagKey, c.notHelpfulIrrelevantSourcesTagKey], + colSuffix=c.wideIncorrectFilterSuffix, + ) + noteStats = noteStats.merge(incorrectAggregatesWide, on=c.noteIdKey, how="outer") # Add tag filtering and sticky scoring logic. 
rules.extend( @@ -528,7 +535,24 @@ def compute_scored_notes( minRatingsNeeded, ), scoring_rules.FilterIncorrect( - RuleID.INCORRECT_OUTLIER, {RuleID.TAG_OUTLIER}, c.needsMoreRatings, weightedTotalVotes + RuleID.INCORRECT_OUTLIER, + {RuleID.TAG_OUTLIER}, + c.needsMoreRatings, + tagThreshold=2, + voteThreshold=3, + weightedTotalVotes=2.5, + superThreshold=None, + colSuffix="", + ), + scoring_rules.FilterIncorrect( + RuleID.INCORRECT_OUTLIER_WIDE, + {RuleID.TAG_OUTLIER}, + c.needsMoreRatings, + tagThreshold=4, + voteThreshold=5, + weightedTotalVotes=4.0, + superThreshold=0.5, + colSuffix=c.wideIncorrectFilterSuffix, ), ] ) diff --git a/sourcecode/scoring/scoring_rules.py b/sourcecode/scoring/scoring_rules.py index 75778475..959792dd 100644 --- a/sourcecode/scoring/scoring_rules.py +++ b/sourcecode/scoring/scoring_rules.py @@ -30,6 +30,7 @@ class RuleID(Enum): ELEVATED_CRH_INERTIA = RuleAndVersion("ElevatedCRHInertia", "1.0", False) LCB_INERTIA = RuleAndVersion("LcbCRHInertia", "1.0", False) INCORRECT_OUTLIER = RuleAndVersion("FilterIncorrect", "1.0", False) + INCORRECT_OUTLIER_WIDE = RuleAndVersion("FilterIncorrectWide", "1.0", False) # Rules used in _meta_score. META_INITIAL_NMR = RuleAndVersion("MetaInitialNMR", "1.0", False) @@ -280,7 +281,11 @@ def __init__( ruleID: RuleID, dependencies: Set[RuleID], status: str, - weightedTotalVotes: float = 2.5, + tagThreshold: int, + voteThreshold: int, + weightedTotalVotes: float, + superThreshold: Optional[float], + colSuffix: str, ): """Filter CRH notes for outliers with high levels of incorrect tag from similar factor raters. @@ -288,12 +293,20 @@ def __init__( rule: enum corresponding to a namedtuple defining a rule name and version string for the ScoringRule. dependencies: Rules which must run before this rule can run. status: the status which each note should be set to (e.g. 
CRH, CRNH, NMR) + tagThreshold: threshold for number of included raters to issue a tag + voteThreshold: threshold for number of included raters (raters must have issued a NH tag to be included) weightedTotalVotes: For the filter to trigger, the sum of weighted incorrect votes must exceed the minAdjustedTotal. + superThreshold: if set, allow notes with an intercept above threshold to bypass the filter. + colSuffix: string suffix to apply to lookup columns """ super().__init__(ruleID, dependencies) self._status = status - self.weightedTotalVotes = weightedTotalVotes + self._tagThreshold = tagThreshold + self._voteThreshold = voteThreshold + self._weightedTotalVotes = weightedTotalVotes + self._superThreshold = superThreshold + self._colSuffix = colSuffix def score_notes( self, noteStats: pd.DataFrame, currentLabels: pd.DataFrame, statusColumn: str @@ -305,9 +318,14 @@ def score_notes( # Identify impacted notes. noteStatusUpdates = crhStats.loc[ - (crhStats["notHelpfulIncorrect_interval"] >= 2) - & (crhStats["num_voters_interval"] >= 3) - & (crhStats["tf_idf_incorrect_interval"] >= self.weightedTotalVotes) + (crhStats[f"notHelpfulIncorrect_interval{self._colSuffix}"] >= self._tagThreshold) + & (crhStats[f"num_voters_interval{self._colSuffix}"] >= self._voteThreshold) + & (crhStats[f"tf_idf_incorrect_interval{self._colSuffix}"] >= self._weightedTotalVotes) + & ( + True + if self._superThreshold is None + else crhStats[c.internalNoteInterceptKey] < self._superThreshold + ) ][[c.noteIdKey]] pd.testing.assert_frame_equal(noteStatusUpdates, noteStatusUpdates.drop_duplicates())