Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Augment incorrect filtering to improve correctness #158

Merged
merged 1 commit into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions sourcecode/scoring/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,19 +200,21 @@ def rater_factor_key(i):
notHelpfulArgumentativeOrBiasedTagKey = "notHelpfulArgumentativeOrBiased"
notHelpfulHardToUnderstandKey = "notHelpfulHardToUnderstand"
notHelpfulNoteNotNeededKey = "notHelpfulNoteNotNeeded"
notHelpfulSourcesMissingOrUnreliableTagKey = "notHelpfulSourcesMissingOrUnreliable"
notHelpfulIrrelevantSourcesTagKey = "notHelpfulIrrelevantSources"

notHelpfulTagsAndTieBreakOrder = [
(0, notHelpfulOtherTagKey),
(8, notHelpfulIncorrectTagKey),
(2, "notHelpfulSourcesMissingOrUnreliable"),
(2, notHelpfulSourcesMissingOrUnreliableTagKey),
(4, "notHelpfulOpinionSpeculationOrBias"),
(5, "notHelpfulMissingKeyPoints"),
(12, "notHelpfulOutdated"),
(10, notHelpfulHardToUnderstandKey),
(7, notHelpfulArgumentativeOrBiasedTagKey),
(9, "notHelpfulOffTopic"),
(11, notHelpfulSpamHarassmentOrAbuseTagKey),
(1, "notHelpfulIrrelevantSources"),
(1, notHelpfulIrrelevantSourcesTagKey),
(3, "notHelpfulOpinionSpeculation"),
(6, notHelpfulNoteNotNeededKey),
]
Expand All @@ -235,12 +237,16 @@ def rater_factor_key(i):
]
ratingWeightKey = "ratingWeight"

incorrectFilterColumns = [
wideIncorrectFilterSuffix = "_wide"
_incorrectFilterColumns = [
"notHelpfulIncorrect_interval",
"p_incorrect_user_interval",
"num_voters_interval",
"tf_idf_incorrect_interval",
]
incorrectFilterColumns = _incorrectFilterColumns + [
f"{col}{wideIncorrectFilterSuffix}" for col in _incorrectFilterColumns
]

misleadingTags = [
"misleadingOther",
Expand Down
55 changes: 42 additions & 13 deletions sourcecode/scoring/incorrect_filter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Utilities for tag-based scoring logic."""

from typing import List, Optional

from . import constants as c

import numpy as np
Expand All @@ -18,7 +20,7 @@ def _get_user_incorrect_ratio(nhTagRatings: pd.DataFrame) -> pd.DataFrame:
"""

user_incorrect = (
nhTagRatings[[c.raterParticipantIdKey, "notHelpfulIncorrect"]]
nhTagRatings[[c.raterParticipantIdKey, c.notHelpfulIncorrectTagKey]]
.groupby(c.raterParticipantIdKey)
.agg("sum")
)
Expand All @@ -34,7 +36,7 @@ def _get_user_incorrect_ratio(nhTagRatings: pd.DataFrame) -> pd.DataFrame:


def _get_incorrect_tfidf_ratio(
augmented_ratings: pd.DataFrame, user_filter: bool, suffix: str
augmented_ratings: pd.DataFrame, user_filter: Optional[bool], suffix: str
) -> pd.DataFrame:
"""Computes empirical p(incorrect | note) / p(incorrect | raters over all notes) subject to rater-note inclusion function.

Expand All @@ -47,8 +49,10 @@ def _get_incorrect_tfidf_ratio(
pd.DataFrame with one row for each note, with computed sum(tf_idf_incorrect) score for raters
included in filter
"""

ratings_w_user_totals = augmented_ratings[user_filter]
if user_filter is not None:
ratings_w_user_totals = augmented_ratings[user_filter]
else:
ratings_w_user_totals = augmented_ratings

note_nh_count = (
ratings_w_user_totals[[c.raterParticipantIdKey, c.noteIdKey]]
Expand All @@ -73,7 +77,7 @@ def _get_incorrect_tfidf_ratio(
rating_aggs = ratings_w_user_totals.groupby(c.noteIdKey).agg("sum").reset_index()
rating_aggs_w_cnt = rating_aggs.merge(note_nh_count, on=c.noteIdKey)

rating_aggs_w_cnt["tf_idf_incorrect"] = (rating_aggs_w_cnt["notHelpfulIncorrect"]) / np.log(
rating_aggs_w_cnt["tf_idf_incorrect"] = (rating_aggs_w_cnt[c.notHelpfulIncorrectTagKey]) / np.log(
1 + (rating_aggs_w_cnt["p_incorrect_user"])
) # p(incorrect over all rater ratings)
rating_aggs_w_cnt.drop(["notHelpfulIncorrect_total", "cnt"], inplace=True, axis=1)
Expand All @@ -84,45 +88,70 @@ def _get_incorrect_tfidf_ratio(


def get_incorrect_aggregates(
  ratingsOrig: pd.DataFrame,
  noteParams: pd.DataFrame,
  raterParams: pd.DataFrame,
  applyFilter: bool = True,
  extraCols: Optional[List[str]] = None,
  colSuffix: str = "",
) -> pd.DataFrame:
  """Computes non-helpful tag aggregates for each note.

  Args:
    ratingsOrig: initial input ratings DF containing all ratings.  This frame is copied before
      any column is modified, so the caller's DataFrame is left untouched.
    noteParams: MF results for notes
    raterParams: MF results for raters
    applyFilter: bool indicating whether to filter included ratings based on factor
    extraCols: optional list of tags to include along with notHelpfulIncorrect.  None or an
      empty list leaves the notHelpfulIncorrect column as-is.
    colSuffix: str which will be added to the end of each column other than noteId

  Returns:
    pd.DataFrame containing one row per note that was scored during MF. Columns correspond to
    aggregates for the Not-Helpful tags, including raw totals, totals adjusted based on the
    distance between the rater and the note and ratios based on the adjusted weight totals.
  """
  # augment notHelpfulIncorrect with any additional columns
  ratings = ratingsOrig.copy()
  if extraCols:
    for column in extraCols:
      # Compare by value: two equal strings are not guaranteed to be the same object, so an
      # identity check (`is not`) could let notHelpfulIncorrect be added to itself.
      assert (
        column != c.notHelpfulIncorrectTagKey
      ), f"extraCols must not contain {c.notHelpfulIncorrectTagKey}"
      ratings[c.notHelpfulIncorrectTagKey] += ratings[column]
    # Summing several tag columns can push the value above 1; clip so the augmented column
    # stays within [0, 1] before it is aggregated.
    ratings[c.notHelpfulIncorrectTagKey] = (
      ratings[c.notHelpfulIncorrectTagKey].clip(0, 1).astype(np.int64)
    )

  # consider only ratings with some NH tag
  notHelpfulTaggedRatings = ratings.loc[ratings[c.notHelpfulTagsTSVOrder].sum(axis=1) > 0]

  # get per user incorrect term frequency
  user_totals = _get_user_incorrect_ratio(notHelpfulTaggedRatings)
  # add user and note factors
  ratings_w_user_totals = (
    notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey, c.notHelpfulIncorrectTagKey]]
    .merge(user_totals, on=c.raterParticipantIdKey, suffixes=(None, "_total"))
    .merge(noteParams[[c.noteIdKey, c.internalNoteFactor1Key]], on=c.noteIdKey)
    .merge(
      raterParams[[c.raterParticipantIdKey, c.internalRaterFactor1Key]], on=c.raterParticipantIdKey
    )
  )

  # A filter of None tells _get_incorrect_tfidf_ratio to keep every rating.
  interval_filter = None
  if applyFilter:
    # Keep only ratings where the clipped rater and note factors lie within
    # c.intervalHalfWidth of each other.
    interval_filter = (
      np.abs(
        ratings_w_user_totals[c.internalRaterFactor1Key].clip(-0.4, 0.4)
        - ratings_w_user_totals[c.internalNoteFactor1Key].clip(-0.4, 0.4)
      )
      < c.intervalHalfWidth
    )

  incorrectAggregates = _get_incorrect_tfidf_ratio(
    ratings_w_user_totals, interval_filter, "_interval"
  )

  # apply column suffix to everything except the note ID, which must keep its name so callers
  # can still merge on it.  Equality (not identity) is required here: column labels read back
  # from a DataFrame need not be the same string object as c.noteIdKey.
  incorrectAggregates.columns = [
    col if col == c.noteIdKey else f"{col}{colSuffix}" for col in incorrectAggregates.columns
  ]

  return incorrectAggregates
5 changes: 0 additions & 5 deletions sourcecode/scoring/mf_base_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def __init__(
crhThresholdLCBIntercept: float = 0.35,
crhSuperThreshold: float = 0.5,
inertiaDelta: float = 0.01,
weightedTotalVotes: float = 2.5,
useStableInitialization: bool = True,
):
"""Configure MatrixFactorizationScorer object.
Expand Down Expand Up @@ -70,7 +69,6 @@ def __init__(
repeated reason tags in not-helpful ratings to achieve CRH status.
inertiaDelta: Minimum amount which a note that has achieved CRH status must drop below the
applicable threshold to lose CRH status.
weightedTotalVotes: Minimum number of weighted incorrect votes required to lose CRH status.
useStableInitialization: whether to use a specific modeling group of users to stably initialize
"""
super().__init__(seed)
Expand All @@ -89,7 +87,6 @@ def __init__(
self._crhThresholdLCBIntercept = crhThresholdLCBIntercept
self._crhSuperThreshold = crhSuperThreshold
self._inertiaDelta = inertiaDelta
self._weightedTotalVotes = weightedTotalVotes
self._modelingGroupToInitializeForStability = 13 if useStableInitialization else None
self._mfRanker = MatrixFactorization()

Expand Down Expand Up @@ -305,7 +302,6 @@ def _score_notes_and_users(
crhThresholdLCBIntercept=self._crhThresholdLCBIntercept,
crhSuperThreshold=self._crhSuperThreshold,
inertiaDelta=self._inertiaDelta,
weightedTotalVotes=self._weightedTotalVotes,
)

# Determine "valid" ratings
Expand Down Expand Up @@ -378,7 +374,6 @@ def _score_notes_and_users(
crhThresholdLCBIntercept=self._crhThresholdLCBIntercept,
crhSuperThreshold=self._crhSuperThreshold,
inertiaDelta=self._inertiaDelta,
weightedTotalVotes=self._weightedTotalVotes,
finalRound=True,
)
# Takes raterParams from most recent MF run, but use the pre-computed
Expand Down
30 changes: 27 additions & 3 deletions sourcecode/scoring/note_ratings.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,6 @@ def compute_scored_notes(
crhThresholdLCBIntercept: float,
crhSuperThreshold: float,
inertiaDelta: float,
weightedTotalVotes: float,
finalRound: bool = False,
# TODO: We might want to consider passing in only the series here, instead of the whole callable
is_crh_function: Callable[..., pd.Series] = is_crh,
Expand Down Expand Up @@ -412,7 +411,6 @@ def compute_scored_notes(
repeated reason tags in not-helpful ratings to achieve CRH status.
inertiaDelta: Minimum amount which a note that has achieved CRH status must drop below the
applicable threshold to lose CRH status.
weightedTotalVotes: Minimum number of weighted incorrect votes required to lose CRH status.
finalRound: If true, enable additional status assignment logic which is only applied when
determining final status. Given that these mechanisms add complexity we don't apply them
in earlier rounds.
Expand Down Expand Up @@ -485,6 +483,15 @@ def compute_scored_notes(
ratings, noteParams, raterParams
)
noteStats = noteStats.merge(incorrectAggregates, on=c.noteIdKey, how="outer")
incorrectAggregatesWide = incorrect_filter.get_incorrect_aggregates(
ratings,
noteParams,
raterParams,
applyFilter=False,
extraCols=[c.notHelpfulSourcesMissingOrUnreliableTagKey, c.notHelpfulIrrelevantSourcesTagKey],
colSuffix=c.wideIncorrectFilterSuffix,
)
noteStats = noteStats.merge(incorrectAggregatesWide, on=c.noteIdKey, how="outer")

# Add tag filtering and sticky scoring logic.
rules.extend(
Expand Down Expand Up @@ -528,7 +535,24 @@ def compute_scored_notes(
minRatingsNeeded,
),
scoring_rules.FilterIncorrect(
RuleID.INCORRECT_OUTLIER, {RuleID.TAG_OUTLIER}, c.needsMoreRatings, weightedTotalVotes
RuleID.INCORRECT_OUTLIER,
{RuleID.TAG_OUTLIER},
c.needsMoreRatings,
tagThreshold=2,
voteThreshold=3,
weightedTotalVotes=2.5,
superThreshold=None,
colSuffix="",
),
scoring_rules.FilterIncorrect(
RuleID.INCORRECT_OUTLIER_WIDE,
{RuleID.TAG_OUTLIER},
c.needsMoreRatings,
tagThreshold=4,
voteThreshold=5,
weightedTotalVotes=4.0,
superThreshold=0.5,
colSuffix=c.wideIncorrectFilterSuffix,
),
]
)
Expand Down
28 changes: 23 additions & 5 deletions sourcecode/scoring/scoring_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class RuleID(Enum):
ELEVATED_CRH_INERTIA = RuleAndVersion("ElevatedCRHInertia", "1.0", False)
LCB_INERTIA = RuleAndVersion("LcbCRHInertia", "1.0", False)
INCORRECT_OUTLIER = RuleAndVersion("FilterIncorrect", "1.0", False)
INCORRECT_OUTLIER_WIDE = RuleAndVersion("FilterIncorrectWide", "1.0", False)

# Rules used in _meta_score.
META_INITIAL_NMR = RuleAndVersion("MetaInitialNMR", "1.0", False)
Expand Down Expand Up @@ -280,20 +281,32 @@ def __init__(
ruleID: RuleID,
dependencies: Set[RuleID],
status: str,
weightedTotalVotes: float = 2.5,
tagThreshold: int,
voteThreshold: int,
weightedTotalVotes: float,
superThreshold: Optional[float],
colSuffix: str,
):
"""Filter CRH notes for outliers with high levels of incorrect tag from similar factor raters.

Args:
rule: enum corresponding to a namedtuple defining a rule name and version string for the ScoringRule.
dependencies: Rules which must run before this rule can run.
status: the status which each note should be set to (e.g. CRH, CRNH, NMR)
tagThreshold: threshold for number of included raters to issue a tag
voteThreshold: threshold for number of included raters (raters must have issued a NH tag to be included)
weightedTotalVotes: For the filter to trigger, the sum of weighted incorrect votes must
be greater than or equal to weightedTotalVotes.
superThreshold: if set, allow notes with an intercept above threshold to bypass the filter.
colSuffix: string suffix to apply to lookup columns
"""
super().__init__(ruleID, dependencies)
self._status = status
self.weightedTotalVotes = weightedTotalVotes
self._tagThreshold = tagThreshold
self._voteThreshold = voteThreshold
self._weightedTotalVotes = weightedTotalVotes
self._superThreshold = superThreshold
self._colSuffix = colSuffix

def score_notes(
self, noteStats: pd.DataFrame, currentLabels: pd.DataFrame, statusColumn: str
Expand All @@ -305,9 +318,14 @@ def score_notes(

# Identify impacted notes.
noteStatusUpdates = crhStats.loc[
(crhStats["notHelpfulIncorrect_interval"] >= 2)
& (crhStats["num_voters_interval"] >= 3)
& (crhStats["tf_idf_incorrect_interval"] >= self.weightedTotalVotes)
(crhStats[f"notHelpfulIncorrect_interval{self._colSuffix}"] >= self._tagThreshold)
& (crhStats[f"num_voters_interval{self._colSuffix}"] >= self._voteThreshold)
& (crhStats[f"tf_idf_incorrect_interval{self._colSuffix}"] >= self._weightedTotalVotes)
& (
True
if self._superThreshold is None
else crhStats[c.internalNoteInterceptKey] < self._superThreshold
)
][[c.noteIdKey]]

pd.testing.assert_frame_equal(noteStatusUpdates, noteStatusUpdates.drop_duplicates())
Expand Down
Loading