twitter · jbaxter · Aug 15, 2023 · Aug 15, 2023 · Aug 15, 2023
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ settings.json
 .gitignore
 *.tsv
 __pycache__
+.vscode*
diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py
@@ -267,17 +267,17 @@ def rater_factor_key(i):
 noteTSVColumnsAndTypes = (
   [
     (noteIdKey, np.int64),
-    (noteAuthorParticipantIdKey, np.object),
+    (noteAuthorParticipantIdKey, object),
     (createdAtMillisKey, np.int64),
     (tweetIdKey, np.int64),
-    (classificationKey, np.object),
-    ("believable", np.object),
-    ("harmful", np.object),
-    ("validationDifficulty", np.object),
+    (classificationKey, object),
+    ("believable", object),
+    ("harmful", object),
+    ("validationDifficulty", object),
   ]
   + misleadingTagsAndTypes
   + notMisleadingTagsAndTypes
-  + [("trustworthySources", np.int64), (summaryKey, np.object), ("isMediaNote", np.int64)]
+  + [("trustworthySources", np.int64), (summaryKey, object), ("isMediaNote", np.int64)]
 )
 noteTSVColumns = [col for (col, dtype) in noteTSVColumnsAndTypes]
 noteTSVTypes = [dtype for (col, dtype) in noteTSVColumnsAndTypes]
@@ -286,14 +286,14 @@ def rater_factor_key(i):
 ratingTSVColumnsAndTypes = (
   [
     (noteIdKey, np.int64),
-    (raterParticipantIdKey, np.object),
+    (raterParticipantIdKey, object),
     (createdAtMillisKey, np.int64),
     ("version", np.int64),
     ("agree", np.int64),
     ("disagree", np.int64),
     (helpfulKey, np.int64),
     (notHelpfulKey, np.int64),
-    (helpfulnessLevelKey, np.object),
+    (helpfulnessLevelKey, object),
   ]
   + helpfulTagsAndTypesTSVOrder
   + notHelpfulTagsAndTypesTSVOrder
@@ -317,16 +317,16 @@ def rater_factor_key(i):
 
 noteStatusHistoryTSVColumnsAndTypes = [
   (noteIdKey, np.int64),
-  (noteAuthorParticipantIdKey, np.object),
+  (noteAuthorParticipantIdKey, object),
   (createdAtMillisKey, np.int64),
   (timestampMillisOfNoteFirstNonNMRLabelKey, np.double),  # double because nullable.
-  (firstNonNMRLabelKey, np.object),
+  (firstNonNMRLabelKey, object),
   (timestampMillisOfNoteCurrentLabelKey, np.double),  # double because nullable.
-  (currentLabelKey, np.object),
+  (currentLabelKey, object),
   (timestampMillisOfNoteMostRecentNonNMRLabelKey, np.double),  # double because nullable.
-  (mostRecentNonNMRLabelKey, np.object),
+  (mostRecentNonNMRLabelKey, object),
   (timestampMillisOfStatusLockKey, np.double),  # double because nullable.
-  (lockedStatusKey, np.object),
+  (lockedStatusKey, object),
   (timestampMillisOfRetroLockKey, np.double),  # double because nullable.
 ]
 noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
@@ -367,12 +367,12 @@ def rater_factor_key(i):
 expansion = "EXPANSION"
 
 userEnrollmentTSVColumnsAndTypes = [
-  (participantIdKey, np.str),
-  (enrollmentState, np.str),
+  (participantIdKey, str),
+  (enrollmentState, str),
   (successfulRatingNeededToEarnIn, np.int64),
   (timestampOfLastStateChange, np.int64),
   (timestampOfLastEarnOut, np.double),  # double because nullable.
-  (modelingPopulationKey, np.str),
+  (modelingPopulationKey, str),
 ]
 userEnrollmentTSVColumns = [col for (col, _) in userEnrollmentTSVColumnsAndTypes]
 userEnrollmentTSVTypes = [dtype for (_, dtype) in userEnrollmentTSVColumnsAndTypes]
@@ -458,26 +458,26 @@ def rater_factor_key(i):
   (noteIdKey, np.int64),
   (coreNoteInterceptKey, np.double),
   (coreNoteFactor1Key, np.double),
-  (finalRatingStatusKey, np.str),
-  (firstTagKey, np.str),
-  (secondTagKey, np.str),
+  (finalRatingStatusKey, str),
+  (firstTagKey, str),
+  (secondTagKey, str),
   # Note that this column was formerly named "activeRules" and the name is now
   # updated to "coreActiveRules".  The data values remain the compatible,
   # but the new column only contains rules that ran when deciding status based on
   # the core model.
-  (coreActiveRulesKey, np.str),
-  (activeFilterTagsKey, np.str),
-  (classificationKey, np.str),
+  (coreActiveRulesKey, str),
+  (activeFilterTagsKey, str),
+  (classificationKey, str),
   (createdAtMillisKey, np.int64),
-  (coreRatingStatusKey, np.str),
-  (metaScorerActiveRulesKey, np.str),
-  (decidedByKey, np.str),
+  (coreRatingStatusKey, str),
+  (metaScorerActiveRulesKey, str),
+  (decidedByKey, str),
   (expansionNoteInterceptKey, np.double),
   (expansionNoteFactor1Key, np.double),
-  (expansionRatingStatusKey, np.str),
+  (expansionRatingStatusKey, str),
   (coverageNoteInterceptKey, np.double),
   (coverageNoteFactor1Key, np.double),
-  (coverageRatingStatusKey, np.str),
+  (coverageRatingStatusKey, str),
   (coreNoteInterceptMinKey, np.double),
   (coreNoteInterceptMaxKey, np.double),
   (expansionNoteInterceptMinKey, np.double),
@@ -486,7 +486,7 @@ def rater_factor_key(i):
   (coverageNoteInterceptMaxKey, np.double),
   (groupNoteInterceptKey, np.double),
   (groupNoteFactor1Key, np.double),
-  (groupRatingStatusKey, np.str),
+  (groupRatingStatusKey, str),
   (groupNoteInterceptMaxKey, np.double),
   (groupNoteInterceptMinKey, np.double),
   (modelingGroupKey, np.float64),
@@ -520,7 +520,7 @@ def rater_factor_key(i):
   (notesAwaitingMoreRatings, pd.Int64Dtype()),
   (enrollmentState, np.int32),
   (successfulRatingNeededToEarnIn, pd.Int64Dtype()),
-  (authorTopNotHelpfulTagValues, np.str),
+  (authorTopNotHelpfulTagValues, str),
   (timestampOfLastStateChange, np.double),
   (aboveHelpfulnessThresholdKey, np.float64),  # nullable bool
   (isEmergingWriterKey, np.bool_),

diff --git a/sourcecode/scoring/incorrect_filter.py b/sourcecode/scoring/incorrect_filter.py
@@ -58,9 +58,15 @@ def _get_incorrect_tfidf_ratio(
   """
 
   ratings_w_user_totals = augmented_ratings[user_filter]
-  ratings_w_user_totals.drop(
-    [c.internalRaterFactor1Key, c.internalNoteFactor1Key], inplace=True, axis=1
-  )
+
+  columns_to_attempt_to_drop = [
+    c.internalRaterFactor1Key,
+    c.internalNoteFactor1Key,
+    c.raterParticipantIdKey,
+  ]
+  columns_to_drop = ratings_w_user_totals.columns.intersection(columns_to_attempt_to_drop)
+  ratings_w_user_totals.drop(columns_to_drop, inplace=True, axis=1)
+
   rating_aggs = ratings_w_user_totals.groupby(c.noteIdKey).agg("sum").reset_index()
   rating_aggs_w_cnt = rating_aggs.merge(note_nh_count, on=c.noteIdKey)
 

diff --git a/sourcecode/scoring/note_ratings.py b/sourcecode/scoring/note_ratings.py
@@ -91,7 +91,7 @@ def get_ratings_before_note_status_and_public_tsv(
   # c.timestampMillisOfNoteMostRecentNonNMRLabelKey are determined at runtime and cannot be statically
   # determined from the code above.  If noteStatusHistory is missing any noteIdKey which is found in
   # ratings, then the missing rows will have NaN values for c.createdAtMillisKey and
-  # c.timestampMillisOfNoteMostRecentNonNMRLabelKey, forcing the entire colum to have type np.float.
+  # c.timestampMillisOfNoteMostRecentNonNMRLabelKey, forcing the entire column to have type float.
   # However, if there are no missing values in column noteIdKey then c.createdAtMillisKey and
   # c.timestampMillisOfNoteMostRecentNonNMRLabelKey will retain their int64 types.  The code below
   # coerces both columns to always have float types so the typecheck below will pass.
@@ -103,11 +103,11 @@ def get_ratings_before_note_status_and_public_tsv(
     ratingsWithNoteLabelInfoTypes = c.ratingTSVTypeMapping
     ratingsWithNoteLabelInfoTypes[
       c.createdAtMillisKey + "_note"
-    ] = np.float  # float because nullable after merge.
+    ] = float  # float because nullable after merge.
     ratingsWithNoteLabelInfoTypes[
       c.timestampMillisOfNoteMostRecentNonNMRLabelKey
-    ] = np.float  # float because nullable.
-    ratingsWithNoteLabelInfoTypes[c.helpfulNumKey] = np.float
+    ] = float  # float because nullable.
+    ratingsWithNoteLabelInfoTypes[c.helpfulNumKey] = float
 
     assert len(ratingsWithNoteLabelInfo) == len(ratings)
     mismatches = [

diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py
@@ -368,7 +368,7 @@ def preprocess_data(
   ratings.loc[ratings[c.helpfulnessLevelKey] == c.helpfulValueTsv, c.helpfulNumKey] = 1
   ratings = ratings.loc[~pd.isna(ratings[c.helpfulNumKey])]
 
-  notes[c.tweetIdKey] = notes[c.tweetIdKey].astype(np.str)
+  notes[c.tweetIdKey] = notes[c.tweetIdKey].astype(str)
 
   noteStatusHistory = note_status_history.merge_note_info(noteStatusHistory, notes)