Skip to content

Commit

Permalink
Improve optimization stability and note status history timestamps
Browse files Browse the repository at this point in the history
1. Improve optimization stability (reduce factor flips from run to run)
2. Note status history timestamps were previously taken from the start of the scoring run rather than the end. They are now computed at the end of the scoring run, so they more closely reflect the time at which updated statuses are shown on X.
  • Loading branch information
jbaxter committed Oct 10, 2023
1 parent c6aa718 commit 9f2af9b
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 5 deletions.
7 changes: 5 additions & 2 deletions sourcecode/scoring/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#
# https://docs.python.org/3/tutorial/modules.html#more-on-modules
epochMillis = 1000 * time.time()
useCurrentTimeInsteadOfEpochMillisForNoteStatusHistory = True

maxTrainError = 0.09

Expand Down Expand Up @@ -150,6 +151,8 @@ def rater_factor_key(i):
ratingCountKey = "ratingCount"
numRatingsKey = "numRatings"
numRatingsLast28DaysKey = "numRatingsLast28"
ratingFromInitialModelingGroupKey = "ratingFromInitialModelingGroup"
percentFromInitialModelingGroupKey = "percentFromInitialModelingGroup"

# Helpfulness Score Keys
crhRatioKey = "CRHRatio"
Expand Down Expand Up @@ -526,12 +529,12 @@ def rater_factor_key(i):
(notesCurrentlyRatedHelpful, pd.Int64Dtype()),
(notesCurrentlyRatedNotHelpful, pd.Int64Dtype()),
(notesAwaitingMoreRatings, pd.Int64Dtype()),
(enrollmentState, np.int32),
(enrollmentState, pd.Int64Dtype()),
(successfulRatingNeededToEarnIn, pd.Int64Dtype()),
(authorTopNotHelpfulTagValues, str),
(timestampOfLastStateChange, np.double),
(aboveHelpfulnessThresholdKey, np.float64), # nullable bool
(isEmergingWriterKey, np.bool_),
(isEmergingWriterKey, pd.BooleanDtype()),
(aggregateRatingReceivedTotal, pd.Int64Dtype()),
(timestampOfLastEarnOut, np.double),
(groupRaterInterceptKey, np.double),
Expand Down
39 changes: 38 additions & 1 deletion sourcecode/scoring/mf_base_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ def _run_stable_matrix_factorization(
self,
ratingsForTraining: pd.DataFrame,
userEnrollmentRaw: pd.DataFrame,
minPercentRatingsFromModelingGroup: float = 0.75,
minNumRatingsToIncludeInStableInitialization: int = 5,
):
"""Train a matrix factorization model on the ratingsForTraining data.
Due to stability issues when trained on the entire dataset with no initialization, this is done in
Expand All @@ -202,10 +204,45 @@ def _run_stable_matrix_factorization(
left_on=c.raterParticipantIdKey,
right_on=c.participantIdKey,
)
ratingsForStableInitialization = ratingsForTrainingWithModelingGroup[

ratingsForTrainingWithModelingGroup[c.ratingFromInitialModelingGroupKey] = (
ratingsForTrainingWithModelingGroup[c.modelingGroupKey]
== self._modelingGroupToInitializeForStability
)

# Only include ratings made by raters in the modeling group used to initialize for stability
ratingsForStableInitialization = ratingsForTrainingWithModelingGroup[
ratingsForTrainingWithModelingGroup[c.ratingFromInitialModelingGroupKey]
]

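# Only include notes that have received at least minPercentRatingsFromModelingGroup (default 75%)
# of their ratings from the modeling group, and at least
# minNumRatingsToIncludeInStableInitialization (default 5) ratings in total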
ratingsForTrainingWithModelingGroup[c.ratingCountKey] = 1
noteStatsByRatedModelingGroup = (
ratingsForTrainingWithModelingGroup.groupby(c.noteIdKey)
.sum()[[c.ratingFromInitialModelingGroupKey, c.ratingCountKey]]
.reset_index()
)
noteStatsByRatedModelingGroup[c.percentFromInitialModelingGroupKey] = (
noteStatsByRatedModelingGroup[c.ratingFromInitialModelingGroupKey]
/ noteStatsByRatedModelingGroup[c.ratingCountKey]
)
noteStatsByRatedModelingGroup[
c.percentFromInitialModelingGroupKey
] = noteStatsByRatedModelingGroup[c.percentFromInitialModelingGroupKey].fillna(0)
notesRatedMostlyByInitialModelingGroup = noteStatsByRatedModelingGroup[
(
noteStatsByRatedModelingGroup[c.percentFromInitialModelingGroupKey]
>= minPercentRatingsFromModelingGroup
)
& (
noteStatsByRatedModelingGroup[c.ratingCountKey]
>= minNumRatingsToIncludeInStableInitialization
)
]
ratingsForStableInitialization = ratingsForStableInitialization.merge(
notesRatedMostlyByInitialModelingGroup[[c.noteIdKey]], on=c.noteIdKey
)

assert (
len(ratingsForStableInitialization) > 0
), "No ratings from stable initialization modeling group."
Expand Down
2 changes: 1 addition & 1 deletion sourcecode/scoring/note_ratings.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def get_ratings_with_scores(
)

ratingsWithScores = ratingsBeforeNoteStatus[
[c.raterParticipantIdKey, c.helpfulNumKey, c.noteIdKey]
[c.raterParticipantIdKey, c.helpfulNumKey, c.noteIdKey, c.createdAtMillisKey]
].merge(
scoredNotes[
[
Expand Down
10 changes: 9 additions & 1 deletion sourcecode/scoring/note_status_history.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import time

from . import constants as c
from .scoring_rules import RuleID

Expand Down Expand Up @@ -166,7 +168,13 @@ def update_note_status_history(
Returns:
pd.DataFrame: noteStatusHistory
"""
currentTimeMillis = c.epochMillis
if c.useCurrentTimeInsteadOfEpochMillisForNoteStatusHistory:
# When running in prod, we use the latest time possible, so as to include as many valid ratings
# as possible, and be closest to the time the new note statuses are user-visible.
currentTimeMillis = 1000 * time.time()
else:
# When epochMillis has been explicitly overridden (e.g. in tests, or via the runner's
# epoch_millis argument), we use the overridable epochMillis constant.
currentTimeMillis = c.epochMillis
newScoredNotesSuffix = "_sn"
mergedStatuses = oldNoteStatusHistory.merge(
scoredNotes[
Expand Down
1 change: 1 addition & 0 deletions sourcecode/scoring/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def main():
args = parse_args()
if args.epoch_millis:
c.epochMillis = args.epoch_millis
c.useCurrentTimeInsteadOfEpochMillisForNoteStatusHistory = False

# Load input dataframes.
dataLoader = LocalDataLoader(args.notes, args.ratings, args.status, args.enrollment, args.headers)
Expand Down

0 comments on commit 9f2af9b

Please sign in to comment.