Improve optimization stability and note status history timestamps

1. Improve optimization stability (reduce factor flips from run to run).
2. Note status history timestamps were previously taken from the start of the scoring run rather than the end. They are now computed at the end of the scoring run, so they more closely reflect the time at which updated statuses are shown on X.
twitter · Oct 10, 2023 · 9f2af9b · 9f2af9b
1 parent c6aa718
commit 9f2af9b
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 5 deletions.
diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py
@@ -13,6 +13,7 @@
 #
 # https://docs.python.org/3/tutorial/modules.html#more-on-modules
 epochMillis = 1000 * time.time()
+useCurrentTimeInsteadOfEpochMillisForNoteStatusHistory = True
 
 maxTrainError = 0.09
 
@@ -150,6 +151,8 @@ def rater_factor_key(i):
 ratingCountKey = "ratingCount"
 numRatingsKey = "numRatings"
 numRatingsLast28DaysKey = "numRatingsLast28"
+ratingFromInitialModelingGroupKey = "ratingFromInitialModelingGroup"
+percentFromInitialModelingGroupKey = "percentFromInitialModelingGroup"
 
 # Helpfulness Score Keys
 crhRatioKey = "CRHRatio"
@@ -526,12 +529,12 @@ def rater_factor_key(i):
   (notesCurrentlyRatedHelpful, pd.Int64Dtype()),
   (notesCurrentlyRatedNotHelpful, pd.Int64Dtype()),
   (notesAwaitingMoreRatings, pd.Int64Dtype()),
-  (enrollmentState, np.int32),
+  (enrollmentState, pd.Int64Dtype()),
   (successfulRatingNeededToEarnIn, pd.Int64Dtype()),
   (authorTopNotHelpfulTagValues, str),
   (timestampOfLastStateChange, np.double),
   (aboveHelpfulnessThresholdKey, np.float64),  # nullable bool
-  (isEmergingWriterKey, np.bool_),
+  (isEmergingWriterKey, pd.BooleanDtype()),
   (aggregateRatingReceivedTotal, pd.Int64Dtype()),
   (timestampOfLastEarnOut, np.double),
   (groupRaterInterceptKey, np.double),

diff --git a/sourcecode/scoring/mf_base_scorer.py b/sourcecode/scoring/mf_base_scorer.py
@@ -176,6 +176,8 @@ def _run_stable_matrix_factorization(
     self,
     ratingsForTraining: pd.DataFrame,
     userEnrollmentRaw: pd.DataFrame,
+    minPercentRatingsFromModelingGroup: float = 0.75,
+    minNumRatingsToIncludeInStableInitialization: int = 5,
   ):
     """Train a matrix factorization model on the ratingsForTraining data.
     Due to stability issues when trained on the entire dataset with no initialization, this is done in
@@ -202,10 +204,45 @@ def _run_stable_matrix_factorization(
       left_on=c.raterParticipantIdKey,
       right_on=c.participantIdKey,
     )
-    ratingsForStableInitialization = ratingsForTrainingWithModelingGroup[
+
+    ratingsForTrainingWithModelingGroup[c.ratingFromInitialModelingGroupKey] = (
       ratingsForTrainingWithModelingGroup[c.modelingGroupKey]
       == self._modelingGroupToInitializeForStability
+    )
+
+    # Only include ratings from the modeling group
+    ratingsForStableInitialization = ratingsForTrainingWithModelingGroup[
+      ratingsForTrainingWithModelingGroup[c.ratingFromInitialModelingGroupKey]
     ]
+
+    # Only include notes that have received at least 75% of their ratings from the modeling group (and 5 total)
+    ratingsForTrainingWithModelingGroup[c.ratingCountKey] = 1
+    noteStatsByRatedModelingGroup = (
+      ratingsForTrainingWithModelingGroup.groupby(c.noteIdKey)
+      .sum()[[c.ratingFromInitialModelingGroupKey, c.ratingCountKey]]
+      .reset_index()
+    )
+    noteStatsByRatedModelingGroup[c.percentFromInitialModelingGroupKey] = (
+      noteStatsByRatedModelingGroup[c.ratingFromInitialModelingGroupKey]
+      / noteStatsByRatedModelingGroup[c.ratingCountKey]
+    )
+    noteStatsByRatedModelingGroup[
+      c.percentFromInitialModelingGroupKey
+    ] = noteStatsByRatedModelingGroup[c.percentFromInitialModelingGroupKey].fillna(0)
+    notesRatedMostlyByInitialModelingGroup = noteStatsByRatedModelingGroup[
+      (
+        noteStatsByRatedModelingGroup[c.percentFromInitialModelingGroupKey]
+        >= minPercentRatingsFromModelingGroup
+      )
+      & (
+        noteStatsByRatedModelingGroup[c.ratingCountKey]
+        >= minNumRatingsToIncludeInStableInitialization
+      )
+    ]
+    ratingsForStableInitialization = ratingsForStableInitialization.merge(
+      notesRatedMostlyByInitialModelingGroup[[c.noteIdKey]], on=c.noteIdKey
+    )
+
     assert (
       len(ratingsForStableInitialization) > 0
     ), "No ratings from stable initialization modeling group."

diff --git a/sourcecode/scoring/note_ratings.py b/sourcecode/scoring/note_ratings.py
@@ -190,7 +190,7 @@ def get_ratings_with_scores(
   )
 
   ratingsWithScores = ratingsBeforeNoteStatus[
-    [c.raterParticipantIdKey, c.helpfulNumKey, c.noteIdKey]
+    [c.raterParticipantIdKey, c.helpfulNumKey, c.noteIdKey, c.createdAtMillisKey]
   ].merge(
     scoredNotes[
       [

diff --git a/sourcecode/scoring/note_status_history.py b/sourcecode/scoring/note_status_history.py
@@ -1,3 +1,5 @@
+import time
+
 from . import constants as c
 from .scoring_rules import RuleID
 
@@ -166,7 +168,13 @@ def update_note_status_history(
   Returns:
       pd.DataFrame: noteStatusHistory
   """
-  currentTimeMillis = c.epochMillis
+  if c.useCurrentTimeInsteadOfEpochMillisForNoteStatusHistory:
+    # When running in prod, we use the latest time possible, so as to include as many valid ratings
+    # as possible, and be closest to the time the new note statuses are user-visible.
+    currentTimeMillis = 1000 * time.time()
+  else:
+    # When running in test, we use the overridable epochMillis constant.
+    currentTimeMillis = c.epochMillis
   newScoredNotesSuffix = "_sn"
   mergedStatuses = oldNoteStatusHistory.merge(
     scoredNotes[

diff --git a/sourcecode/scoring/runner.py b/sourcecode/scoring/runner.py
@@ -86,6 +86,7 @@ def main():
   args = parse_args()
   if args.epoch_millis:
     c.epochMillis = args.epoch_millis
+    c.useCurrentTimeInsteadOfEpochMillisForNoteStatusHistory = False
 
   # Load input dataframes.
   dataLoader = LocalDataLoader(args.notes, args.ratings, args.status, args.enrollment, args.headers)