Multiple scorer updates, mostly aimed at reducing note status flipping #144

Merged · 1 commit · Aug 28, 2023
18 changes: 13 additions & 5 deletions sourcecode/main.py
@@ -7,15 +7,15 @@
--notes data/notes-00000.tsv \
--ratings data/ratings-00000.tsv \
--status data/noteStatusHistory-00000.tsv \
--outdir data/scored_notes.tsv
--outdir data
"""

import argparse
import os

import scoring.constants as c
from scoring.enums import scorers_from_csv
from scoring.process_data import get_data, write_tsv_local
from scoring.process_data import LocalDataLoader, write_tsv_local
from scoring.run_scoring import run_scoring


@@ -82,6 +82,13 @@ def parse_args():
dest="strict_columns",
)
parser.set_defaults(strict_columns=True)
parser.add_argument(
"--parallel",
help="Disable parallel run of algorithm.",
action="store_true",
dest="parallel",
)
parser.set_defaults(parallel=False)

return parser.parse_args()

@@ -93,9 +100,8 @@ def main():
c.epochMillis = args.epoch_millis

# Load input dataframes.
_, ratings, statusHistory, userEnrollment = get_data(
args.notes, args.ratings, args.status, args.enrollment, args.headers
)
dataLoader = LocalDataLoader(args.notes, args.ratings, args.status, args.enrollment, args.headers)
_, ratings, statusHistory, userEnrollment = dataLoader.get_data()

# Invoke scoring and user contribution algorithms.
scoredNotes, helpfulnessScores, newStatus, auxNoteInfo = run_scoring(
@@ -106,6 +112,8 @@
pseudoraters=args.pseudoraters,
enabledScorers=args.scorers,
strictColumns=args.strict_columns,
runParallel=args.parallel,
dataLoader=dataLoader if args.parallel == True else None,
)

# Write outputs to local disk.
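For orientation, here is a minimal sketch of how main.py flows after this change: the LocalDataLoader is built once up front, and is only handed to run_scoring when --parallel is set, presumably so that parallel scorers can reload the input TSVs in worker processes. Anything not visible in the diff (the positional arguments, other keywords, and the output writing) is inferred or elided.

```python
# Sketch only: abridged main() after this PR. LocalDataLoader and run_scoring are
# the imports shown in the diff; argument names outside the diff are inferred.
from scoring.process_data import LocalDataLoader
from scoring.run_scoring import run_scoring

def main(args):
  # Build the loader once; get_data() yields notes, ratings, status history and
  # user enrollment (the notes frame is unused here, as in the diff).
  dataLoader = LocalDataLoader(
    args.notes, args.ratings, args.status, args.enrollment, args.headers
  )
  _, ratings, statusHistory, userEnrollment = dataLoader.get_data()

  scoredNotes, helpfulnessScores, newStatus, auxNoteInfo = run_scoring(
    ratings,
    statusHistory,
    userEnrollment,
    pseudoraters=args.pseudoraters,
    enabledScorers=args.scorers,
    strictColumns=args.strict_columns,
    runParallel=args.parallel,
    # The loader is only passed along for parallel runs; serial runs keep the old path.
    dataLoader=dataLoader if args.parallel else None,
  )
  # Outputs are then written to args.outdir via write_tsv_local (not shown).
```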
2 changes: 1 addition & 1 deletion sourcecode/scoring/constants.py
@@ -16,7 +16,7 @@

maxTrainError = 0.09

coreFlipPct = 0.175
coreFlipPct = 0.09
expansionFlipPct = 0.19
maxReruns = 5

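The constants.py change tightens coreFlipPct from 0.175 to 0.09. Together with maxReruns, these values appear to bound how large a fraction of core-scope notes may change status between reruns before the run is retried, so lowering the threshold makes the core scorer less tolerant of status flipping. The guard below is illustrative only, an assumption about how such a constant would be used rather than the repository's actual check.

```python
# Illustrative only: a flip-rate guard of the kind coreFlipPct/maxReruns suggest.
# This is not the repository's actual logic.
coreFlipPct = 0.09  # lowered from 0.175 in this PR; maxReruns (5) caps retries

def too_many_flips(prevStatus: dict, newStatus: dict, flipPct: float = coreFlipPct) -> bool:
  """Return True when the share of notes whose status changed exceeds flipPct."""
  common = prevStatus.keys() & newStatus.keys()
  if not common:
    return False
  flips = sum(prevStatus[noteId] != newStatus[noteId] for noteId in common)
  return flips / len(common) > flipPct
```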
13 changes: 9 additions & 4 deletions sourcecode/scoring/matrix_factorization/matrix_factorization.py
@@ -132,7 +132,8 @@ def _initialize_parameters(
"""
assert self.mf_model is not None
if noteInit is not None:
print("initializing notes")
if self._logging:
print("initializing notes")
noteInit = self.noteIdMap.merge(noteInit, on=c.noteIdKey, how="left")
self.mf_model.note_intercepts.weight.data = torch.tensor(
np.expand_dims(noteInit[c.internalNoteInterceptKey].astype(np.float32).values, axis=1)
@@ -144,7 +145,8 @@
)

if userInit is not None:
print("initializing users")
if self._logging:
print("initializing users")
userInit = self.raterIdMap.merge(userInit, on=c.raterParticipantIdKey, how="left")
self.mf_model.user_intercepts.weight.data = torch.tensor(
np.expand_dims(userInit[c.internalRaterInterceptKey].astype(np.float32).values, axis=1)
@@ -156,7 +158,8 @@
)

if globalInterceptInit is not None:
print("initialized global intercept")
if self._logging:
print("initialized global intercept")
self.mf_model.global_intercept = torch.nn.parameter.Parameter(
torch.ones(1, 1) * globalInterceptInit
)
@@ -214,7 +217,8 @@ def _create_mf_model(
) # smaller learning rate
else:
self.optimizer = torch.optim.Adam(self.mf_model.parameters(), lr=self._noInitLearningRate)
print(self.mf_model.device)
if self._logging:
print(self.mf_model.device)
self.mf_model.to(self.mf_model.device)

def _instantiate_biased_mf_model(self):
@@ -225,6 +229,7 @@ def _instantiate_biased_mf_model(self):
n_notes,
use_global_intercept=self._useGlobalIntercept,
n_factors=self._numFactors,
logging=self._logging,
)
if self._logging:
print("------------------")
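The matrix_factorization.py changes all follow one pattern: raw print calls are gated behind the ranker's existing _logging flag, and that flag is now forwarded when the biased MF model is constructed, keeping quiet (for example, parallel) runs from interleaving output across scorers. A compressed sketch of the pattern, with everything outside the diff elided:

```python
# Compressed sketch of the logging gate used throughout this file; the real
# methods also copy the supplied intercepts/factors into the torch model.
class MatrixFactorizationSketch:
  def __init__(self, logging: bool = True):
    self._logging = logging

  def _initialize_parameters(self, noteInit=None, userInit=None, globalInterceptInit=None):
    if noteInit is not None:
      if self._logging:
        print("initializing notes")
      # ... merge noteInit into the note id map and copy weights ...
    if userInit is not None:
      if self._logging:
        print("initializing users")
      # ... merge userInit into the rater id map and copy weights ...
    if globalInterceptInit is not None:
      if self._logging:
        print("initialized global intercept")
      # ... set the global intercept parameter ...
```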
6 changes: 5 additions & 1 deletion sourcecode/scoring/matrix_factorization/model.py
@@ -20,6 +20,7 @@ def __init__(
n_notes: int,
n_factors: int = 1,
use_global_intercept: bool = True,
logging: bool = True,
) -> None:
"""Initialize matrix factorization model using xavier_uniform for factors
and zeros for intercepts.
@@ -32,6 +33,8 @@
"""
super().__init__()

self._logging = logging

self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=False)
self.note_factors = torch.nn.Embedding(n_notes, n_factors, sparse=False)

@@ -78,5 +81,6 @@ def _freeze_parameters(self, words_to_freeze: set):
for name, param in self.named_parameters():
for word in words_to_freeze:
if word in name:
print("Freezing parameter: ", name)
if self._logging:
print("Freezing parameter: ", name)
param.requires_grad_(False)
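model.py gets the matching change: the torch module now takes a logging flag at construction (threaded through from _instantiate_biased_mf_model above) and uses it to silence the "Freezing parameter" messages. Below is a cut-down, runnable analogue assuming only what the diff shows; the real module also has intercept embeddings, a global intercept, and a forward pass.

```python
import torch

class TinyBiasedMF(torch.nn.Module):
  """Cut-down analogue of the model.py module: factor embeddings plus a logging flag."""

  def __init__(self, n_users: int, n_notes: int, n_factors: int = 1, logging: bool = True):
    super().__init__()
    self._logging = logging
    self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=False)
    self.note_factors = torch.nn.Embedding(n_notes, n_factors, sparse=False)

  def _freeze_parameters(self, words_to_freeze: set):
    # Freeze any parameter whose name contains one of the given words, printing
    # only when logging was enabled at construction time.
    for name, param in self.named_parameters():
      if any(word in name for word in words_to_freeze):
        if self._logging:
          print("Freezing parameter: ", name)
        param.requires_grad_(False)

model = TinyBiasedMF(n_users=10, n_notes=5, logging=False)
model._freeze_parameters({"note_factors"})  # silent because logging=False
```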
79 changes: 47 additions & 32 deletions sourcecode/scoring/matrix_factorization/pseudo_raters.py
@@ -28,9 +28,11 @@ def __init__(
globalBias: float,
mfRanker: MatrixFactorization,
logging=True,
checkParamsSame=True,
):
self._logging = logging
self._mfRanker = mfRanker
self._checkParamsSame = checkParamsSame
self.ratings = ratings
(
self.noteIdMap,
@@ -43,9 +45,12 @@

def compute_note_parameter_confidence_bounds_with_pseudo_raters(self):
self._make_extreme_raters(self.raterParams, self.raterIdMap)
self._add_extreme_raters_to_id_maps_and_params()
self._create_extreme_ratings()

notesWithConfidenceBounds = self._fit_note_params_for_each_dataset_with_extreme_ratings()
noteParamsList = self._fit_note_params_for_each_dataset_with_extreme_ratings()

notesWithConfidenceBounds = self._aggregate_note_params(noteParamsList)
return self.noteParams.merge(
notesWithConfidenceBounds.reset_index(), on=c.noteIdKey, how="left"
)
@@ -170,8 +175,9 @@ def _create_new_model_with_extreme_raters_from_original_params(
lr=newExtremeMF._initLearningRate,
)

self._check_note_parameters_same(newExtremeMF)
self._check_rater_parameters_same(newExtremeMF)
if self._checkParamsSame:
self._check_note_parameters_same(newExtremeMF)
self._check_rater_parameters_same(newExtremeMF)

return newExtremeMF

@@ -183,15 +189,15 @@ def _fit_all_notes_with_raters_constant(self, ratingFeaturesAndLabelsWithExtreme
newExtremeMF._fit_model()

# Double check that we kept rater parameters fixed during re-training of note parameters.
self._check_rater_parameters_same(newExtremeMF)
if self._checkParamsSame:
self._check_rater_parameters_same(newExtremeMF)

fitNoteParams, fitRaterParams = newExtremeMF._get_parameters_from_trained_model()
return fitNoteParams

def _fit_note_params_for_each_dataset_with_extreme_ratings(self, joinOrig=False):
self._add_extreme_raters_to_id_maps_and_params()
extremeRatingsToAddWithoutNotes = []
extremeRatingsToAddWithoutNotes.append(
def _create_extreme_ratings(self):
self.extremeRatingsToAddWithoutNotes = []
self.extremeRatingsToAddWithoutNotes.append(
{
c.internalRaterInterceptKey: None,
c.internalRaterFactor1Key: None,
@@ -203,37 +209,44 @@ def _fit_note_params_for_each_dataset_with_extreme_ratings(self, joinOrig=False)

for helpfulNum in (0.0, 1.0):
extremeRater[c.helpfulNumKey] = helpfulNum
extremeRatingsToAddWithoutNotes.append(extremeRater.copy())
self.extremeRatingsToAddWithoutNotes.append(extremeRater.copy())

def _create_dataset_with_extreme_rating_on_each_note(self, ratingToAddWithoutNoteId):
## for each rating (ided by raterParticipantId and raterIndex)
if ratingToAddWithoutNoteId[c.helpfulNumKey] is not None:
ratingsWithNoteIds = []
for i, noteRow in (
self.ratingFeaturesAndLabels[[c.noteIdKey, mf_c.noteIndexKey]].drop_duplicates().iterrows()
):
ratingToAdd = ratingToAddWithoutNoteId.copy()
ratingToAdd[c.noteIdKey] = noteRow[c.noteIdKey]
ratingToAdd[mf_c.noteIndexKey] = noteRow[mf_c.noteIndexKey]
ratingsWithNoteIds.append(ratingToAdd)
extremeRatingsToAdd = pd.DataFrame(ratingsWithNoteIds).drop(
[c.internalRaterInterceptKey, c.internalRaterFactor1Key], axis=1
)
ratingFeaturesAndLabelsWithExtremeRatings = pd.concat(
[self.ratingFeaturesAndLabels, extremeRatingsToAdd]
)
else:
ratingFeaturesAndLabelsWithExtremeRatings = self.ratingFeaturesAndLabels
return ratingFeaturesAndLabelsWithExtremeRatings

def _fit_note_params_for_each_dataset_with_extreme_ratings(self):
noteParamsList = []
for ratingToAddWithoutNoteId in extremeRatingsToAddWithoutNotes:
## for each rating (ided by raterParticipantId and raterIndex)
if ratingToAddWithoutNoteId[c.helpfulNumKey] is not None:
ratingsWithNoteIds = []
for i, noteRow in (
self.ratingFeaturesAndLabels[[c.noteIdKey, mf_c.noteIndexKey]]
.drop_duplicates()
.iterrows()
):
ratingToAdd = ratingToAddWithoutNoteId.copy()
ratingToAdd[c.noteIdKey] = noteRow[c.noteIdKey]
ratingToAdd[mf_c.noteIndexKey] = noteRow[mf_c.noteIndexKey]
ratingsWithNoteIds.append(ratingToAdd)
extremeRatingsToAdd = pd.DataFrame(ratingsWithNoteIds).drop(
[c.internalRaterInterceptKey, c.internalRaterFactor1Key], axis=1
)
ratingFeaturesAndLabelsWithExtremeRatings = pd.concat(
[self.ratingFeaturesAndLabels, extremeRatingsToAdd]
)
else:
ratingFeaturesAndLabelsWithExtremeRatings = self.ratingFeaturesAndLabels
for ratingToAddWithoutNoteId in self.extremeRatingsToAddWithoutNotes:
ratingFeaturesAndLabelsWithExtremeRatings = (
self._create_dataset_with_extreme_rating_on_each_note(ratingToAddWithoutNoteId)
)

if self._logging:
print("------------------")
print(f"Re-scoring all notes with extra rating added: {ratingToAddWithoutNoteId}")

fitNoteParams = self._fit_all_notes_with_raters_constant(
ratingFeaturesAndLabelsWithExtremeRatings
)

fitNoteParams[Constants.extraRaterInterceptKey] = ratingToAddWithoutNoteId[
c.internalRaterInterceptKey
]
@@ -242,7 +255,9 @@ def _fit_note_params_for_each_dataset_with_extreme_ratings(self, joinOrig=False)
]
fitNoteParams[Constants.extraRatingHelpfulNumKey] = ratingToAddWithoutNoteId[c.helpfulNumKey]
noteParamsList.append(fitNoteParams)
return noteParamsList

def _aggregate_note_params(self, noteParamsList, joinOrig=False):
rawRescoredNotesWithEachExtraRater = pd.concat(noteParamsList)
rawRescoredNotesWithEachExtraRater.drop(mf_c.noteIndexKey, axis=1, inplace=True)
rawRescoredNotesWithEachExtraRater = rawRescoredNotesWithEachExtraRater.sort_values(
@@ -278,9 +293,9 @@ def _fit_note_params_for_each_dataset_with_extreme_ratings(self, joinOrig=False)
raterFacs[Constants.allKey] = 1
raterFacs[Constants.negFacKey] = raterFacs[c.internalRaterFactor1Key] < 0
raterFacs[Constants.posFacKey] = raterFacs[c.internalRaterFactor1Key] > 0
r = raterFacs.groupby(c.noteIdKey).sum()[
r = raterFacs.groupby(c.noteIdKey)[
[Constants.allKey, Constants.negFacKey, Constants.posFacKey]
]
].sum()
r.columns = pd.MultiIndex.from_product([[c.ratingCountKey], r.columns])
notesWithConfidenceBounds = notesWithConfidenceBounds.join(r)

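Most of pseudo_raters.py is a mechanical refactor: the old monolithic _fit_note_params_for_each_dataset_with_extreme_ratings is split into _create_extreme_ratings, _create_dataset_with_extreme_rating_on_each_note, a slimmer fitting loop, and _aggregate_note_params, and the parameter-equality assertions can now be skipped via the new checkParamsSame flag. One behavioral detail worth noting is the groupby change near the end of the file, which selects the three counter columns before summing rather than summing every column and then selecting. A small self-contained pandas illustration (the column names stand in for the c.* and Constants.* keys):

```python
import pandas as pd

# Stand-in for raterFacs: one row per rating, with counter columns per note.
raterFacs = pd.DataFrame(
  {
    "noteId": [1, 1, 2],
    "internalRaterFactor1": [-0.4, 0.2, 0.1],
    "all": [1, 1, 1],
    "negFac": [True, False, False],
    "posFac": [False, True, True],
  }
)

# Old: raterFacs.groupby("noteId").sum()[["all", "negFac", "posFac"]]
# New: select the columns first, then sum, so only the counters are aggregated.
r = raterFacs.groupby("noteId")[["all", "negFac", "posFac"]].sum()
print(r)
```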
2 changes: 1 addition & 1 deletion sourcecode/scoring/mf_base_scorer.py
@@ -32,7 +32,7 @@ def __init__(
crnhThresholdNoteFactorMultiplier: float = -0.8,
crnhThresholdNMIntercept: float = -0.15,
crnhThresholdUCBIntercept: float = -0.04,
crhThresholdLCBIntercept: float = 0.32,
crhThresholdLCBIntercept: float = 0.35,
crhSuperThreshold: float = 0.5,
inertiaDelta: float = 0.01,
weightedTotalVotes: float = 1.0,
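Finally, mf_base_scorer.py raises crhThresholdLCBIntercept from 0.32 to 0.35. Assuming this threshold is compared against the lower confidence bound on a note's intercept from the pseudo-rater pass, raising it means borderline notes need a higher lower bound before earning CRH status, which again works against status flipping. The check below is illustrative only, not the scorer's actual rule.

```python
# Illustrative only: the kind of comparison the threshold name suggests.
crhThresholdLCBIntercept = 0.35  # raised from 0.32 in this PR

def passes_lcb_gate(noteInterceptLowerBound: float) -> bool:
  """True when the note's lower-bound intercept clears the CRH LCB threshold."""
  return noteInterceptLowerBound >= crhThresholdLCBIntercept
```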