diff --git a/.gitignore b/.gitignore index f3b18553..35aea257 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ settings.json .gitignore *.tsv __pycache__ +.vscode* \ No newline at end of file diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py index ef7f1237..a0975a19 100644 --- a/sourcecode/scoring/constants.py +++ b/sourcecode/scoring/constants.py @@ -267,17 +267,17 @@ def rater_factor_key(i): noteTSVColumnsAndTypes = ( [ (noteIdKey, np.int64), - (noteAuthorParticipantIdKey, np.object), + (noteAuthorParticipantIdKey, object), (createdAtMillisKey, np.int64), (tweetIdKey, np.int64), - (classificationKey, np.object), - ("believable", np.object), - ("harmful", np.object), - ("validationDifficulty", np.object), + (classificationKey, object), + ("believable", object), + ("harmful", object), + ("validationDifficulty", object), ] + misleadingTagsAndTypes + notMisleadingTagsAndTypes - + [("trustworthySources", np.int64), (summaryKey, np.object), ("isMediaNote", np.int64)] + + [("trustworthySources", np.int64), (summaryKey, object), ("isMediaNote", np.int64)] ) noteTSVColumns = [col for (col, dtype) in noteTSVColumnsAndTypes] noteTSVTypes = [dtype for (col, dtype) in noteTSVColumnsAndTypes] @@ -286,14 +286,14 @@ def rater_factor_key(i): ratingTSVColumnsAndTypes = ( [ (noteIdKey, np.int64), - (raterParticipantIdKey, np.object), + (raterParticipantIdKey, object), (createdAtMillisKey, np.int64), ("version", np.int64), ("agree", np.int64), ("disagree", np.int64), (helpfulKey, np.int64), (notHelpfulKey, np.int64), - (helpfulnessLevelKey, np.object), + (helpfulnessLevelKey, object), ] + helpfulTagsAndTypesTSVOrder + notHelpfulTagsAndTypesTSVOrder @@ -317,16 +317,16 @@ def rater_factor_key(i): noteStatusHistoryTSVColumnsAndTypes = [ (noteIdKey, np.int64), - (noteAuthorParticipantIdKey, np.object), + (noteAuthorParticipantIdKey, object), (createdAtMillisKey, np.int64), (timestampMillisOfNoteFirstNonNMRLabelKey, np.double), # double because 
nullable. - (firstNonNMRLabelKey, np.object), + (firstNonNMRLabelKey, object), (timestampMillisOfNoteCurrentLabelKey, np.double), # double because nullable. - (currentLabelKey, np.object), + (currentLabelKey, object), (timestampMillisOfNoteMostRecentNonNMRLabelKey, np.double), # double because nullable. - (mostRecentNonNMRLabelKey, np.object), + (mostRecentNonNMRLabelKey, object), (timestampMillisOfStatusLockKey, np.double), # double because nullable. - (lockedStatusKey, np.object), + (lockedStatusKey, object), (timestampMillisOfRetroLockKey, np.double), # double because nullable. ] noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes] @@ -367,12 +367,12 @@ def rater_factor_key(i): expansion = "EXPANSION" userEnrollmentTSVColumnsAndTypes = [ - (participantIdKey, np.str), - (enrollmentState, np.str), + (participantIdKey, str), + (enrollmentState, str), (successfulRatingNeededToEarnIn, np.int64), (timestampOfLastStateChange, np.int64), (timestampOfLastEarnOut, np.double), # double because nullable. - (modelingPopulationKey, np.str), + (modelingPopulationKey, str), ] userEnrollmentTSVColumns = [col for (col, _) in userEnrollmentTSVColumnsAndTypes] userEnrollmentTSVTypes = [dtype for (_, dtype) in userEnrollmentTSVColumnsAndTypes] @@ -458,26 +458,26 @@ def rater_factor_key(i): (noteIdKey, np.int64), (coreNoteInterceptKey, np.double), (coreNoteFactor1Key, np.double), - (finalRatingStatusKey, np.str), - (firstTagKey, np.str), - (secondTagKey, np.str), + (finalRatingStatusKey, str), + (firstTagKey, str), + (secondTagKey, str), # Note that this column was formerly named "activeRules" and the name is now # updated to "coreActiveRules". The data values remain the compatible, # but the new column only contains rules that ran when deciding status based on # the core model. 
- (coreActiveRulesKey, np.str), - (activeFilterTagsKey, np.str), - (classificationKey, np.str), + (coreActiveRulesKey, str), + (activeFilterTagsKey, str), + (classificationKey, str), (createdAtMillisKey, np.int64), - (coreRatingStatusKey, np.str), - (metaScorerActiveRulesKey, np.str), - (decidedByKey, np.str), + (coreRatingStatusKey, str), + (metaScorerActiveRulesKey, str), + (decidedByKey, str), (expansionNoteInterceptKey, np.double), (expansionNoteFactor1Key, np.double), - (expansionRatingStatusKey, np.str), + (expansionRatingStatusKey, str), (coverageNoteInterceptKey, np.double), (coverageNoteFactor1Key, np.double), - (coverageRatingStatusKey, np.str), + (coverageRatingStatusKey, str), (coreNoteInterceptMinKey, np.double), (coreNoteInterceptMaxKey, np.double), (expansionNoteInterceptMinKey, np.double), @@ -486,7 +486,7 @@ def rater_factor_key(i): (coverageNoteInterceptMaxKey, np.double), (groupNoteInterceptKey, np.double), (groupNoteFactor1Key, np.double), - (groupRatingStatusKey, np.str), + (groupRatingStatusKey, str), (groupNoteInterceptMaxKey, np.double), (groupNoteInterceptMinKey, np.double), (modelingGroupKey, np.float64), @@ -520,7 +520,7 @@ def rater_factor_key(i): (notesAwaitingMoreRatings, pd.Int64Dtype()), (enrollmentState, np.int32), (successfulRatingNeededToEarnIn, pd.Int64Dtype()), - (authorTopNotHelpfulTagValues, np.str), + (authorTopNotHelpfulTagValues, str), (timestampOfLastStateChange, np.double), (aboveHelpfulnessThresholdKey, np.float64), # nullable bool (isEmergingWriterKey, np.bool_), diff --git a/sourcecode/scoring/incorrect_filter.py b/sourcecode/scoring/incorrect_filter.py index f8e3e6c2..760ab098 100644 --- a/sourcecode/scoring/incorrect_filter.py +++ b/sourcecode/scoring/incorrect_filter.py @@ -58,9 +58,15 @@ def _get_incorrect_tfidf_ratio( """ ratings_w_user_totals = augmented_ratings[user_filter] - ratings_w_user_totals.drop( - [c.internalRaterFactor1Key, c.internalNoteFactor1Key], inplace=True, axis=1 - ) + + 
+ columns_to_attempt_to_drop = [ + c.internalRaterFactor1Key, + c.internalNoteFactor1Key, + c.raterParticipantIdKey, + ] + columns_to_drop = ratings_w_user_totals.columns.intersection(columns_to_attempt_to_drop) + ratings_w_user_totals.drop(columns_to_drop, inplace=True, axis=1) + rating_aggs = ratings_w_user_totals.groupby(c.noteIdKey).agg("sum").reset_index() rating_aggs_w_cnt = rating_aggs.merge(note_nh_count, on=c.noteIdKey) diff --git a/sourcecode/scoring/note_ratings.py b/sourcecode/scoring/note_ratings.py index df12a23f..5f9a9537 100644 --- a/sourcecode/scoring/note_ratings.py +++ b/sourcecode/scoring/note_ratings.py @@ -91,7 +91,7 @@ def get_ratings_before_note_status_and_public_tsv( # c.timestampMillisOfNoteMostRecentNonNMRLabelKey are determined at runtime and cannot be statically # determined from the code above. If noteStatusHistory is missing any noteIdKey which is found in # ratings, then the missing rows will have NaN values for c.createdAtMillisKey and - # c.timestampMillisOfNoteMostRecentNonNMRLabelKey, forcing the entire colum to have type np.float. + # c.timestampMillisOfNoteMostRecentNonNMRLabelKey, forcing the entire column to have type float. # However, if there are no missing values in column noteIdKey then c.createdAtMillisKey and # c.timestampMillisOfNoteMostRecentNonNMRLabelKey will retain their int64 types. The code below # coerces both columns to always have float types so the typecheck below will pass. @@ -103,11 +103,11 @@ def get_ratings_before_note_status_and_public_tsv( ratingsWithNoteLabelInfoTypes = c.ratingTSVTypeMapping ratingsWithNoteLabelInfoTypes[ c.createdAtMillisKey + "_note" - ] = np.float # float because nullable after merge. + ] = float # float because nullable after merge. ratingsWithNoteLabelInfoTypes[ c.timestampMillisOfNoteMostRecentNonNMRLabelKey - ] = np.float # float because nullable. - ratingsWithNoteLabelInfoTypes[c.helpfulNumKey] = np.float + ] = float # float because nullable. 
+ ratingsWithNoteLabelInfoTypes[c.helpfulNumKey] = float assert len(ratingsWithNoteLabelInfo) == len(ratings) mismatches = [ diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py index d62cd9c6..61ffabba 100644 --- a/sourcecode/scoring/process_data.py +++ b/sourcecode/scoring/process_data.py @@ -368,7 +368,7 @@ def preprocess_data( ratings.loc[ratings[c.helpfulnessLevelKey] == c.helpfulValueTsv, c.helpfulNumKey] = 1 ratings = ratings.loc[~pd.isna(ratings[c.helpfulNumKey])] - notes[c.tweetIdKey] = notes[c.tweetIdKey].astype(np.str) + notes[c.tweetIdKey] = notes[c.tweetIdKey].astype(str) noteStatusHistory = note_status_history.merge_note_info(noteStatusHistory, notes)