diff --git a/sourcecode/main.py b/sourcecode/main.py
index a10a68ba..fcf3bd75 100644
--- a/sourcecode/main.py
+++ b/sourcecode/main.py
@@ -14,6 +14,7 @@
 import os
 
 import scoring.constants as c
+from scoring.enums import scorers_from_csv
 from scoring.process_data import get_data, write_tsv_local
 from scoring.run_scoring import run_scoring
 
@@ -59,12 +60,28 @@ def parse_args():
   )
   parser.set_defaults(pseudoraters=True)
   parser.add_argument("-r", "--ratings", default=c.ratingsInputPath, help="rating dataset")
+  parser.add_argument(
+    "--scorers", default=None, type=scorers_from_csv, help="CSV list of scorers to enable."
+  )
   parser.add_argument(
     "--seed", default=None, type=int, help="set to an int to seed matrix factorization"
   )
   parser.add_argument(
     "-s", "--status", default=c.noteStatusHistoryInputPath, help="note status history dataset"
   )
+  parser.add_argument(
+    "--strict-columns",
+    dest="strict_columns",
+    help="Explicitly select columns and require that expected columns are present.",
+    action="store_true",
+  )
+  parser.add_argument(
+    "--nostrict-columns",
+    help="Disable validation of expected columns and allow unexpected columns.",
+    action="store_false",
+    dest="strict_columns",
+  )
+  parser.set_defaults(strict_columns=True)
 
   return parser.parse_args()
 
@@ -82,7 +99,13 @@ def main():
 
   # Invoke scoring and user contribution algorithms.
   scoredNotes, helpfulnessScores, newStatus, auxNoteInfo = run_scoring(
-    ratings, statusHistory, userEnrollment, seed=args.seed, pseudoraters=args.pseudoraters
+    ratings,
+    statusHistory,
+    userEnrollment,
+    seed=args.seed,
+    pseudoraters=args.pseudoraters,
+    enabledScorers=args.scorers,
+    strictColumns=args.strict_columns,
   )
 
   # Write outputs to local disk.
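Note on the main.py change above: --scorers is parsed with scorers_from_csv (added in scoring/enums.py below), so the flag takes a comma-separated list of scorer names, while --strict-columns/--nostrict-columns both write to a single strict_columns destination that defaults to True. A hypothetical invocation using only the flags introduced here, assuming the input datasets fall back to the default paths in scoring.constants:

  python main.py --seed 42 --scorers MFCoreScorer,MFExpansionScorer --nostrict-columns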
diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py
index 497c3d44..a3f16649 100644
--- a/sourcecode/scoring/constants.py
+++ b/sourcecode/scoring/constants.py
@@ -1,6 +1,7 @@
 import time
 
 import numpy as np
+import pandas as pd
 
 # Store the timestamp at which the constants module is initialized. Note
@@ -15,6 +16,9 @@
 maxTrainError = 0.09
 
+expansionFlipPct = 0.19
+maxReruns = 5
+
 # Explanation Tags
 minRatingsToGetTag = 2
 minTagsNeededForStatus = 2
@@ -216,12 +220,10 @@ def rater_factor_key(i):
 incorrectFilterColumns = [
   "notHelpfulIncorrect_interval",
-  "notHelpfulIncorrect_total_interval",
   "cnt_interval",
   "num_voters_interval",
   "tf_idf_incorrect_interval",
   "notHelpfulIncorrect_same",
-  "notHelpfulIncorrect_total_same",
   "cnt_same",
   "num_voters_same",
   "tf_idf_incorrect_same",
@@ -261,11 +263,7 @@ def rater_factor_key(i):
   ]
   + misleadingTagsAndTypes
   + notMisleadingTagsAndTypes
-  + [
-    ("trustworthySources", np.int64),
-    (summaryKey, np.object),
-    ("isMediaNote", np.int64)
-  ]
+  + [("trustworthySources", np.int64), (summaryKey, np.object), ("isMediaNote", np.int64)]
 )
 noteTSVColumns = [col for (col, dtype) in noteTSVColumnsAndTypes]
 noteTSVTypes = [dtype for (col, dtype) in noteTSVColumnsAndTypes]
@@ -464,24 +462,24 @@ def rater_factor_key(i):
   (crhCrnhRatioDifferenceKey, np.double),
   (meanNoteScoreKey, np.double),
   (raterAgreeRatioKey, np.double),
-  (successfulRatingHelpfulCount, np.int64),
-  (successfulRatingNotHelpfulCount, np.int64),
-  (successfulRatingTotal, np.int64),
-  (unsuccessfulRatingHelpfulCount, np.int64),
-  (unsuccessfulRatingNotHelpfulCount, np.int64),
-  (unsuccessfulRatingTotal, np.int64),
-  (ratingsAwaitingMoreRatings, np.int64),
-  (ratedAfterDecision, np.int64),
-  (notesCurrentlyRatedHelpful, np.int64),
-  (notesCurrentlyRatedNotHelpful, np.int64),
-  (notesAwaitingMoreRatings, np.int64),
+  (successfulRatingHelpfulCount, pd.Int64Dtype()),
+  (successfulRatingNotHelpfulCount, pd.Int64Dtype()),
+  (successfulRatingTotal, pd.Int64Dtype()),
+  (unsuccessfulRatingHelpfulCount, pd.Int64Dtype()),
+  (unsuccessfulRatingNotHelpfulCount, pd.Int64Dtype()),
+  (unsuccessfulRatingTotal, pd.Int64Dtype()),
+  (ratingsAwaitingMoreRatings, pd.Int64Dtype()),
+  (ratedAfterDecision, pd.Int64Dtype()),
+  (notesCurrentlyRatedHelpful, pd.Int64Dtype()),
+  (notesCurrentlyRatedNotHelpful, pd.Int64Dtype()),
+  (notesAwaitingMoreRatings, pd.Int64Dtype()),
   (enrollmentState, np.int32),
-  (successfulRatingNeededToEarnIn, np.int64),
+  (successfulRatingNeededToEarnIn, pd.Int64Dtype()),
   (authorTopNotHelpfulTagValues, np.str),
-  (timestampOfLastStateChange, np.int64),
-  (aboveHelpfulnessThresholdKey, np.bool_),
+  (timestampOfLastStateChange, np.double),
+  (aboveHelpfulnessThresholdKey, np.float64),  # nullable bool
   (isEmergingWriterKey, np.bool_),
-  (aggregateRatingReceivedTotal, np.int64),
+  (aggregateRatingReceivedTotal, pd.Int64Dtype()),
   (timestampOfLastEarnOut, np.double),
 ]
 raterModelOutputTSVColumns = [col for (col, dtype) in raterModelOutputTSVColumnsAndTypes]
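Note on the constants.py change above: the count-style rater output columns switch from np.int64 to pandas' nullable integer dtype, so rows with missing counts stay integer-typed instead of forcing the whole column to float. A minimal standalone illustration (plain pandas, not code from this patch):

  import pandas as pd
  s = pd.Series([1, 2, None], dtype=pd.Int64Dtype())
  # values are 1, 2, <NA>; the dtype stays Int64 rather than being upcast to float64

expansionFlipPct and maxReruns are consumed by the status-flip rerun check added to run_scoring.py further down.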
+ """ + values = [] + for value in csv.split(","): + try: + values.append(Scorers[value]) + except KeyError: + raise ValueError(f"Unknown value {value}") + return set(values) diff --git a/sourcecode/scoring/explanation_tags.py b/sourcecode/scoring/explanation_tags.py index 36991a08..663be136 100644 --- a/sourcecode/scoring/explanation_tags.py +++ b/sourcecode/scoring/explanation_tags.py @@ -82,7 +82,7 @@ def get_top_nonhelpful_tags_per_author( ), axis=1, )[[c.noteIdKey, c.firstTagKey, c.secondTagKey]] - # Aggregates top two tags per author. + # Aggreagtes top two tags per author. notesToUse = noteStatusHistory[[c.noteAuthorParticipantIdKey, c.noteIdKey]] authorTagsAgg = ( notesToUse.merge(noteTopTags, on=[c.noteIdKey]) diff --git a/sourcecode/scoring/incorrect_filter.py b/sourcecode/scoring/incorrect_filter.py index 3706087b..f8e3e6c2 100644 --- a/sourcecode/scoring/incorrect_filter.py +++ b/sourcecode/scoring/incorrect_filter.py @@ -53,6 +53,8 @@ def _get_incorrect_tfidf_ratio( suffix: suffix for incorrect and count column names for this filter Returns: + pd.DataFrame with one row for each note, with computed sum(tf_idf_incorrect) score for raters + included in filter """ ratings_w_user_totals = augmented_ratings[user_filter] @@ -67,7 +69,7 @@ def _get_incorrect_tfidf_ratio( ) / np.log( 1 + (rating_aggs_w_cnt["notHelpfulIncorrect_total"] / rating_aggs_w_cnt["cnt"]) ) # p(incorrect over all rater ratings) - rating_aggs_w_cnt.drop("notHelpfulIncorrect_total", axis=1) + rating_aggs_w_cnt.drop("notHelpfulIncorrect_total", inplace=True, axis=1) rating_aggs_w_cnt.columns = [c.noteIdKey] + [ f"{col}{suffix}" for col in rating_aggs_w_cnt.columns[1:] ] diff --git a/sourcecode/scoring/matrix_factorization.py b/sourcecode/scoring/matrix_factorization.py index 5ce33899..5bd1dac6 100644 --- a/sourcecode/scoring/matrix_factorization.py +++ b/sourcecode/scoring/matrix_factorization.py @@ -251,6 +251,72 @@ def _create_mf_model( return mf_model, optimizer, criterion + def _get_train_validate_sets( + self, + row: torch.LongTensor, + col: torch.LongTensor, + rating: torch.FloatTensor, + validate_percent: Optional[float] = None, + ) -> Tuple[ + torch.LongTensor, + torch.LongTensor, + torch.FloatTensor, + torch.LongTensor, + torch.LongTensor, + torch.FloatTensor, + ]: + if validate_percent is not None: + random_indices = np.random.permutation(np.arange(len(row))) + validate_indices = random_indices[: int(validate_percent * len(row))] + train_indices = random_indices[int(validate_percent * len(row)) :] + row_validate = row[validate_indices] + col_validate = col[validate_indices] + rating_validate = rating[validate_indices] + row_train = row[train_indices] + col_train = col[train_indices] + rating_train = rating[train_indices] + else: + row_train = row + col_train = col + rating_train = rating + row_validate = None + col_validate = None + rating_validate = None + return row_train, col_train, rating_train, row_validate, col_validate, rating_validate + + def _compute_and_print_loss( + self, + mf_model: BiasedMatrixFactorization, + loss_value: float, + criterion: torch.nn.modules.loss._Loss, + row_train: torch.LongTensor, + col_train: torch.LongTensor, + rating_train: torch.FloatTensor, + row_validate: Optional[torch.LongTensor], + col_validate: Optional[torch.LongTensor], + rating_validate: Optional[torch.FloatTensor], + epoch: int, + final: bool = False, + ) -> Tuple[float, float, Optional[float]]: + y_pred = mf_model(row_train, col_train) + train_loss_value = criterion(y_pred, rating_train).item() + if 
diff --git a/sourcecode/scoring/matrix_factorization.py b/sourcecode/scoring/matrix_factorization.py
index 5ce33899..5bd1dac6 100644
--- a/sourcecode/scoring/matrix_factorization.py
+++ b/sourcecode/scoring/matrix_factorization.py
@@ -251,6 +251,72 @@ def _create_mf_model(
 
     return mf_model, optimizer, criterion
 
+  def _get_train_validate_sets(
+    self,
+    row: torch.LongTensor,
+    col: torch.LongTensor,
+    rating: torch.FloatTensor,
+    validate_percent: Optional[float] = None,
+  ) -> Tuple[
+    torch.LongTensor,
+    torch.LongTensor,
+    torch.FloatTensor,
+    torch.LongTensor,
+    torch.LongTensor,
+    torch.FloatTensor,
+  ]:
+    if validate_percent is not None:
+      random_indices = np.random.permutation(np.arange(len(row)))
+      validate_indices = random_indices[: int(validate_percent * len(row))]
+      train_indices = random_indices[int(validate_percent * len(row)) :]
+      row_validate = row[validate_indices]
+      col_validate = col[validate_indices]
+      rating_validate = rating[validate_indices]
+      row_train = row[train_indices]
+      col_train = col[train_indices]
+      rating_train = rating[train_indices]
+    else:
+      row_train = row
+      col_train = col
+      rating_train = rating
+      row_validate = None
+      col_validate = None
+      rating_validate = None
+    return row_train, col_train, rating_train, row_validate, col_validate, rating_validate
+
+  def _compute_and_print_loss(
+    self,
+    mf_model: BiasedMatrixFactorization,
+    loss_value: float,
+    criterion: torch.nn.modules.loss._Loss,
+    row_train: torch.LongTensor,
+    col_train: torch.LongTensor,
+    rating_train: torch.FloatTensor,
+    row_validate: Optional[torch.LongTensor],
+    col_validate: Optional[torch.LongTensor],
+    rating_validate: Optional[torch.FloatTensor],
+    epoch: int,
+    final: bool = False,
+  ) -> Tuple[float, float, Optional[float]]:
+    y_pred = mf_model(row_train, col_train)
+    train_loss_value = criterion(y_pred, rating_train).item()
+    if row_validate is not None:
+      y_pred_validate = mf_model(row_validate, col_validate)
+      validate_loss_value = criterion(y_pred_validate, rating_validate).item()
+    else:
+      validate_loss_value = None
+
+    if self._logging:
+      print("epoch", epoch, loss_value)
+      print("TRAIN FIT LOSS: ", train_loss_value)
+      if validate_loss_value is not None:
+        print("VALIDATE FIT LOSS: ", validate_loss_value)
+
+    if final == True:
+      self.test_errors.append(loss_value)
+      self.train_errors.append(train_loss_value)
+    return train_loss_value, loss_value, validate_loss_value
+
   def _fit_model(
     self,
     mf_model: BiasedMatrixFactorization,
     optimizer: torch.optim.Optimizer,
     criterion: torch.nn.modules.loss._Loss,
@@ -259,7 +325,8 @@ def _fit_model(
     row: torch.LongTensor,
     col: torch.LongTensor,
     rating: torch.FloatTensor,
-  ) -> None:
+    validate_percent: Optional[float] = None,
+  ) -> Tuple[float, float, Optional[float]]:
     """Run gradient descent to train the model.
 
     Args:
@@ -270,23 +337,20 @@ def _fit_model(
       col (torch.LongTensor)
       rating (torch.FloatTensor)
     """
-    l2_lambda_intercept = self._l2_lambda * self._l2_intercept_multiplier
-
-    def print_loss(final=False):
-      y_pred = mf_model(row, col)
-      train_loss = criterion(y_pred, rating)
-
-      if self._logging:
-        print("epoch", epoch, loss.item())
-        print("TRAIN FIT LOSS: ", train_loss.item())
-      if final == True:
-        self.test_errors.append(loss.item())
-        self.train_errors.append(train_loss.item())
+    (
+      row_train,
+      col_train,
+      rating_train,
+      row_validate,
+      col_validate,
+      rating_validate,
+    ) = self._get_train_validate_sets(row, col, rating, validate_percent)
+    l2_lambda_intercept = self._l2_lambda * self._l2_intercept_multiplier
 
     prev_loss = 1e10
 
-    y_pred = mf_model(row, col)
-    loss = criterion(y_pred, rating)
+    y_pred = mf_model(row_train, col_train)
+    loss = criterion(y_pred, rating_train)
 
     l2_reg_loss = torch.tensor(0.0).to(mf_model.device)
     for name, param in mf_model.named_parameters():
@@ -299,7 +363,9 @@ def print_loss(final=False):
 
     epoch = 0
 
-    while abs(prev_loss - loss.item()) > self._convergence:
+    while (abs(loss.item() - prev_loss) > self._convergence) and (
+      not (epoch > 100 and loss.item() > prev_loss)
+    ):
 
       prev_loss = loss.item()
 
@@ -313,8 +379,8 @@ def print_loss(final=False):
       optimizer.zero_grad()
 
       # Predict and calculate loss
-      y_pred = mf_model(row, col)
-      loss = criterion(y_pred, rating)
+      y_pred = mf_model(row_train, col_train)
+      loss = criterion(y_pred, rating_train)
 
       l2_reg_loss = torch.tensor(0.0).to(mf_model.device)
       for name, param in mf_model.named_parameters():
@@ -326,13 +392,37 @@ def print_loss(final=False):
       loss += l2_reg_loss
 
       if epoch % 50 == 0:
-        print_loss()
+        self._compute_and_print_loss(
+          mf_model,
+          loss.item(),
+          criterion,
+          row_train,
+          col_train,
+          rating_train,
+          row_validate,
+          col_validate,
+          rating_validate,
+          epoch,
+          final=False,
+        )
 
       epoch += 1
 
     if self._logging:
       print("Num epochs:", epoch)
-      print_loss(final=True)
+    return self._compute_and_print_loss(
+      mf_model,
+      loss.item(),
+      criterion,
+      row_train,
+      col_train,
+      rating_train,
+      row_validate,
+      col_validate,
+      rating_validate,
+      epoch,
+      final=True,
+    )
 
   def run_mf(
     self,
@@ -341,7 +431,8 @@ def run_mf(
     userInit: pd.DataFrame = None,
     globalInterceptInit: Optional[float] = None,
     specificNoteId: Optional[int] = None,
-  ) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[float]]:
+    validatePercent: Optional[float] = None,
+  ):
     """Train matrix factorization model.
 
     See https://twitter.github.io/communitynotes/ranking-notes/#matrix-factorization
@@ -387,13 +478,8 @@ def run_mf(
     row = torch.LongTensor(noteRatingIds[_raterIndexKey].values).to(mf_model.device)
     col = torch.LongTensor(noteRatingIds[_noteIndexKey].values).to(mf_model.device)
 
-    self._fit_model(
-      mf_model,
-      optimizer,
-      criterion,
-      row,
-      col,
-      rating,
+    train_loss, loss, validate_loss = self._fit_model(
+      mf_model, optimizer, criterion, row, col, rating, validatePercent
     )
 
     assert mf_model.item_factors.weight.data.cpu().numpy().shape[0] == noteIdMap.shape[0]
@@ -409,7 +495,10 @@ def run_mf(
     )
     fitRaterParams.drop(_raterIndexKey, axis=1, inplace=True)
 
-    return fitNoteParams, fitRaterParams, globalIntercept
+    if validatePercent is None:
+      return fitNoteParams, fitRaterParams, globalIntercept
+    else:
+      return fitNoteParams, fitRaterParams, globalIntercept, train_loss, loss, validate_loss
 
   def _flip_factors_for_identification(
     self, noteIdMap: pd.DataFrame, raterIdMap: pd.DataFrame
diff --git a/sourcecode/scoring/mf_base_scorer.py b/sourcecode/scoring/mf_base_scorer.py
index a005f7dc..4acd52d1 100644
--- a/sourcecode/scoring/mf_base_scorer.py
+++ b/sourcecode/scoring/mf_base_scorer.py
@@ -30,7 +30,7 @@ def __init__(
     crnhThresholdNoteFactorMultiplier: float = -0.8,
     crnhThresholdNMIntercept: float = -0.15,
     crnhThresholdUCBIntercept: float = -0.04,
-    crhThresholdLCBIntercept: float = 0.31,
+    crhThresholdLCBIntercept: float = 0.32,
     crhSuperThreshold: float = 0.5,
     inertiaDelta: float = 0.01,
     weightedTotalVotes: float = 1.0,
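Note on the matrix_factorization.py change above: _fit_model can now hold out a validation split (validate_percent) and fit on the remainder, the training loop additionally stops once the loss starts rising after epoch 100, and run_mf appends the train/validate losses to its return value when validatePercent is supplied. A sketch of the two call shapes, assuming an already-constructed matrix factorization ranker (the object exposed to scorers as scorer._mfRanker) named mf, a prepared ratings frame, and all other arguments left at their defaults:

  noteParams, raterParams, globalIntercept = mf.run_mf(ratings)
  noteParams, raterParams, globalIntercept, train_loss, loss, validate_loss = mf.run_mf(
    ratings, validatePercent=0.1
  )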
+ """ + noteScores = pd.DataFrame( + { + c.noteIdKey: pd.Series(dtype="int"), + c.internalNoteInterceptKey: pd.Series(dtype="double"), + c.internalNoteFactor1Key: pd.Series(dtype="double"), + c.noteInterceptMinKey: pd.Series(dtype="double"), + c.noteInterceptMaxKey: pd.Series(dtype="double"), + c.internalRatingStatusKey: pd.Series(dtype="str"), + } + ) + noteScores[c.noteIdKey] = ratings[c.noteIdKey].drop_duplicates() + userScores = pd.DataFrame() + return noteScores, userScores + + def _get_dropped_note_cols(self) -> List[str]: + """Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo.""" + return [] + + def _get_dropped_user_cols(self) -> List[str]: + """Returns a list of columns which should be excluded from helpfulnessScores output.""" + return [] + + def _filter_input( + self, ratings: pd.DataFrame, noteStatusHistory: pd.DataFrame, userEnrollment: pd.DataFrame + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + return ratings, noteStatusHistory diff --git a/sourcecode/scoring/note_ratings.py b/sourcecode/scoring/note_ratings.py index dad9dcfc..df12a23f 100644 --- a/sourcecode/scoring/note_ratings.py +++ b/sourcecode/scoring/note_ratings.py @@ -21,10 +21,18 @@ def is_crh(scoredNotes, minRatingsNeeded, crhThreshold) -> pd.Series: ) -def is_crh_lcb(scoredNotes, minRatingsNeeded, crhThresholdLCBIntercept) -> pd.Series: +def is_crh_lcb(scoredNotes, minRatingsNeeded, crhThresholdLCBIntercept, inertia=False) -> pd.Series: enoughRatings = scoredNotes[c.numRatingsKey] >= minRatingsNeeded + if c.noteInterceptMinKey in scoredNotes.columns: - return enoughRatings & (scoredNotes[c.noteInterceptMinKey] >= crhThresholdLCBIntercept) + if inertia == False: + return enoughRatings & (scoredNotes[c.noteInterceptMinKey] >= crhThresholdLCBIntercept) + else: + return ( + enoughRatings + & (scoredNotes[c.noteInterceptMinKey] >= crhThresholdLCBIntercept) + & (scoredNotes[c.currentLabelKey] == c.currentlyRatedHelpful) + ) else: # all False return enoughRatings & (~enoughRatings) @@ -489,6 +497,15 @@ def compute_scored_notes( crhThreshold, minRatingsNeeded, ), + scoring_rules.RuleFromFunction( + RuleID.LCB_INERTIA, + {RuleID.GENERAL_CRH}, + c.currentlyRatedHelpful, + lambda noteStats: is_crh_lcb_function( + noteStats, minRatingsNeeded, crhThresholdLCBIntercept - inertiaDelta, True + ), + onlyApplyToNotesThatSayTweetIsMisleading=True, + ), scoring_rules.FilterTagOutliers( RuleID.TAG_OUTLIER, {RuleID.GENERAL_CRH}, diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py index bc74a22a..20c923b4 100644 --- a/sourcecode/scoring/process_data.py +++ b/sourcecode/scoring/process_data.py @@ -87,7 +87,7 @@ def tsv_parser( firstLine = rawTSV.split("\n")[0] if len(firstLine.split("\t")) != len(columns): raise ValueError - return pd.read_csv( + data = pd.read_csv( StringIO(rawTSV), sep="\t", names=columns, @@ -95,6 +95,7 @@ def tsv_parser( header=0 if header else None, index_col=[], ) + return data except (ValueError, IndexError): raise ValueError("invalid input") diff --git a/sourcecode/scoring/run_scoring.py b/sourcecode/scoring/run_scoring.py index b6e02faa..622fb9f5 100644 --- a/sourcecode/scoring/run_scoring.py +++ b/sourcecode/scoring/run_scoring.py @@ -9,12 +9,13 @@ import concurrent.futures import multiprocessing import time -from typing import List, Optional, Tuple +from typing import List, Optional, Set, Tuple from . 
diff --git a/sourcecode/scoring/run_scoring.py b/sourcecode/scoring/run_scoring.py
index b6e02faa..622fb9f5 100644
--- a/sourcecode/scoring/run_scoring.py
+++ b/sourcecode/scoring/run_scoring.py
@@ -9,12 +9,13 @@
 import concurrent.futures
 import multiprocessing
 import time
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
 
 from . import constants as c, contributor_state, note_ratings, note_status_history, scoring_rules
+from .enums import Scorers
 from .mf_base_scorer import MFBaseScorer
 from .mf_core_scorer import MFCoreScorer
-from .mf_coverage_scorer import MFCoverageScorer
+from .mf_coverage_scorer import MFCoverageScorer, MFDummyCoverageScorer
 from .mf_expansion_scorer import MFExpansionScorer
 from .scorer import Scorer
 from .scoring_rules import RuleID
@@ -26,17 +27,86 @@
 ModelResult = namedtuple("ModelResult", ["scoredNotes", "helpfulnessScores", "auxiliaryNoteInfo"])
 
 
-def _get_scorers(seed: Optional[int], pseudoraters: Optional[bool]) -> List[Scorer]:
+def _check_flips(
+  scoredNotes: pd.DataFrame,
+  noteStatusHistory: pd.DataFrame,
+  userEnrollment: pd.DataFrame,
+  modelingPopulations: List[str],
+  pctFlips: float,
+  resultCol: str,
+) -> bool:
+  """
+  Args:
+    scoredNotes (pd.DataFrame): notes with scores returned by MF scoring algorithm to evaluate against NSH for flips
+    noteStatusHistory (pd.DataFrame): one row per note; history of when note had each status
+    userEnrollment (pd.DataFrame): The enrollment state for each contributor
+    modelingPopulations (List[str]): names of the modelingPopulation(s) of authors for which to evaluate CRH flips
+    pctFlips (float): percent of flips of unlocked notes that triggers a rerun
+    resultCol (str): name of column with noteStatus for model
+
+  Returns:
+    bool whether unlocked note flips exceed acceptable level, triggering a rerun
+  """
+  # combine noteStatusHistory with userEnrollment modelingPopulation
+  popNsh = noteStatusHistory.merge(
+    userEnrollment[[c.participantIdKey, c.modelingPopulationKey]],
+    left_on=c.noteAuthorParticipantIdKey,
+    right_on=c.participantIdKey,
+    how="inner",
+  )
+  unlockedPopNsh = popNsh.loc[
+    (popNsh[c.modelingPopulationKey].isin(modelingPopulations))
+    & (popNsh[c.timestampMillisOfStatusLockKey].isna())
+  ]
+
+  # check how many unlocked notes that appeared in previous run have now flipped status
+  unlockedPopNsh = unlockedPopNsh.merge(
+    scoredNotes[[c.noteIdKey, resultCol]], on=c.noteIdKey, how="inner"
+  )
+  statusFlips = len(
+    unlockedPopNsh.loc[
+      (unlockedPopNsh[c.currentLabelKey] != unlockedPopNsh[resultCol])
+      & (
+        (unlockedPopNsh[c.currentLabelKey] == c.currentlyRatedHelpful)
+        | (unlockedPopNsh[resultCol] == c.currentlyRatedHelpful)
+      )
+    ]
+  )
+  # take max w/1 to avoid divide by zero error when no prevCRH
+  prevCrh = max(
+    1, len(unlockedPopNsh.loc[unlockedPopNsh[c.currentLabelKey] == c.currentlyRatedHelpful])
+  )
+
+  print(f"Percentage of previous CRH flipping status: {(statusFlips / prevCrh)}")
+
+  return (statusFlips / prevCrh) > pctFlips
+
+
+def _get_scorers(
+  seed: Optional[int], pseudoraters: Optional[bool], enabledScorers: Optional[Set[Scorers]]
+) -> List[Scorer]:
   """Instantiate all Scorer objects which should be used for note ranking.
 
   Args:
     seed (int, optional): if not None, base distinct seeds for the first and second MF rounds on this value
     pseudoraters (bool, optional): if True, compute optional pseudorater confidence intervals
+    enabledScorers: if not None, set of which scorers should be instantiated and enabled
 
   Returns:
     List[Scorer] containing instantiated Scorer objects for note ranking.
   """
-  return [MFCoreScorer(seed, pseudoraters), MFExpansionScorer(seed), MFCoverageScorer(seed)]
+  scorers: List[Scorer] = []
+
+  if enabledScorers is None or Scorers.MFCoreScorer in enabledScorers:
+    scorers.append(MFCoreScorer(seed, pseudoraters))
+  if enabledScorers is None or Scorers.MFExpansionScorer in enabledScorers:
+    scorers.append(MFExpansionScorer(seed))
+  if enabledScorers is not None and Scorers.MFCoverageScorer in enabledScorers:
+    scorers.append(MFCoverageScorer(seed))
+  else:
+    scorers.append(MFDummyCoverageScorer())
+
+  return scorers
 
 
 def _merge_results(
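Note on the run_scoring.py change above: _check_flips compares the number of unlocked notes (for the given modeling populations) whose CRH status would change against the number previously rated CRH, guarding against division by zero with max(1, prevCrh). Illustrative numbers only:

  statusFlips, prevCrh = 40, 200
  needs_rerun = (statusFlips / max(1, prevCrh)) > 0.19  # c.expansionFlipPct
  # 0.20 > 0.19, so the expansion scorer would be re-run (bounded by c.maxReruns)

_get_scorers now treats enabledScorers=None as "all scorers" for the core and expansion models, while the full coverage scorer only runs when explicitly requested; otherwise the dummy coverage scorer is substituted.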
""" - return [MFCoreScorer(seed, pseudoraters), MFExpansionScorer(seed), MFCoverageScorer(seed)] + scorers: List[Scorer] = [] + + if enabledScorers is None or Scorers.MFCoreScorer in enabledScorers: + scorers.append(MFCoreScorer(seed, pseudoraters)) + if enabledScorers is None or Scorers.MFExpansionScorer in enabledScorers: + scorers.append(MFExpansionScorer(seed)) + if enabledScorers is not None and Scorers.MFCoverageScorer in enabledScorers: + scorers.append(MFCoverageScorer(seed)) + else: + scorers.append(MFDummyCoverageScorer()) + + return scorers def _merge_results( @@ -94,13 +164,28 @@ def _run_scorer_parallelizable(scorer, ratings, noteStatusHistory, userEnrollmen runCounter = 0 scorerStartTime = time.perf_counter() result = None - - if isinstance(scorer, (MFBaseScorer, MFCoreScorer, MFCoverageScorer, MFExpansionScorer)): - while (len(scorer._mfRanker.train_errors) == 0) or ( - scorer._mfRanker.train_errors[-1] > c.maxTrainError + flips = False + + if isinstance( + scorer, (MFBaseScorer, MFCoreScorer, MFCoverageScorer, MFExpansionScorer) + ) and not isinstance(scorer, MFDummyCoverageScorer): + while ( + (len(scorer._mfRanker.train_errors) == 0) + or (scorer._mfRanker.train_errors[-1] > c.maxTrainError) + or (flips and runCounter < c.maxReruns) ): runCounter += 1 result = ModelResult(*scorer.score(ratings, noteStatusHistory, userEnrollment)) + # check for notes createdat > lock time, not more than x% have changed status from nsh + if isinstance(scorer, MFExpansionScorer): + flips = _check_flips( + result.scoredNotes, + noteStatusHistory, + userEnrollment, + [c.expansion], + c.expansionFlipPct, + c.expansionRatingStatusKey, + ) else: result = ModelResult(*scorer.score(ratings, noteStatusHistory, userEnrollment)) runCounter += 1 @@ -183,7 +268,10 @@ def _run_scorers( def meta_score( - scoredNotes: pd.DataFrame, auxiliaryNoteInfo: pd.DataFrame, lockedStatus: pd.DataFrame + scoredNotes: pd.DataFrame, + auxiliaryNoteInfo: pd.DataFrame, + lockedStatus: pd.DataFrame, + enabledScorers: Optional[Set[Scorers]], ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Determine final note status based on individual scoring results. @@ -196,6 +284,7 @@ def meta_score( scoredNotes: pd.DataFrame containing all scored note results. auxiliaryNoteInfo: pd.DataFrame containing tag aggregates lockedStatus: pd.DataFrame containing {noteId, status} pairs for all notes + enabledScorers: if not None, set of which scorers should be instantiated and enabled Returns: Tuple[pd.DataFrame, pd.DataFrame]: @@ -209,32 +298,48 @@ def meta_score( on=c.noteIdKey, ) assert len(scoredNotes) == len(auxiliaryNoteInfo) - rules = [ - scoring_rules.DefaultRule(RuleID.META_INITIAL_NMR, set(), c.needsMoreRatings), - scoring_rules.ApplyModelResult( - RuleID.EXPANSION_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionRatingStatusKey - ), - scoring_rules.ApplyModelResult( - RuleID.CORE_MODEL, {RuleID.EXPANSION_MODEL}, c.coreRatingStatusKey - ), - scoring_rules.ApplyAdditiveModelResult( - RuleID.COVERAGE_MODEL, {RuleID.CORE_MODEL}, c.coverageRatingStatusKey, 0.38 - ), - scoring_rules.ScoringDriftGuard(RuleID.SCORING_DRIFT_GUARD, {RuleID.CORE_MODEL}, lockedStatus), - # TODO: The rule below both sets tags for notes which are CRH / CRNH and unsets status for - # any notes which are CRH / CRNH but don't have enough ratings to assign two tags. The later - # behavior can lead to unsetting locked status. 
We should refactor this code to (1) remove - # the behavior which unsets status (instead tags will be assigned on a best effort basis) and - # (2) set tags in logic which is not run as a ScoringRule (since ScoringRules function to - # update note status). - scoring_rules.InsufficientExplanation( - RuleID.INSUFFICIENT_EXPLANATION, - {RuleID.CORE_MODEL}, - c.needsMoreRatings, - c.minRatingsToGetTag, - c.minTagsNeededForStatus, - ), + rules: List[scoring_rules.ScoringRule] = [ + scoring_rules.DefaultRule(RuleID.META_INITIAL_NMR, set(), c.needsMoreRatings) ] + # Only attach meta-scoring rules for models which actually run. + if enabledScorers is None or Scorers.MFExpansionScorer in enabledScorers: + rules.append( + scoring_rules.ApplyModelResult( + RuleID.EXPANSION_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionRatingStatusKey + ) + ) + if enabledScorers is None or Scorers.MFCoreScorer in enabledScorers: + rules.append( + scoring_rules.ApplyModelResult( + RuleID.CORE_MODEL, {RuleID.META_INITIAL_NMR}, c.coreRatingStatusKey + ) + ) + if enabledScorers is not None and Scorers.MFCoverageScorer in enabledScorers: + rules.append( + scoring_rules.ApplyAdditiveModelResult( + RuleID.COVERAGE_MODEL, {RuleID.META_INITIAL_NMR}, c.coverageRatingStatusKey, 0.38 + ) + ) + rules.extend( + [ + scoring_rules.ScoringDriftGuard( + RuleID.SCORING_DRIFT_GUARD, {RuleID.CORE_MODEL}, lockedStatus + ), + # TODO: The rule below both sets tags for notes which are CRH / CRNH and unsets status for + # any notes which are CRH / CRNH but don't have enough ratings to assign two tags. The later + # behavior can lead to unsetting locked status. We should refactor this code to (1) remove + # the behavior which unsets status (instead tags will be assigned on a best effort basis) and + # (2) set tags in logic which is not run as a ScoringRule (since ScoringRules function to + # update note status). + scoring_rules.InsufficientExplanation( + RuleID.INSUFFICIENT_EXPLANATION, + {RuleID.CORE_MODEL}, + c.needsMoreRatings, + c.minRatingsToGetTag, + c.minTagsNeededForStatus, + ), + ] + ) scoredNotes[c.firstTagKey] = np.nan scoredNotes[c.secondTagKey] = np.nan scoringResult = scoring_rules.apply_scoring_rules( @@ -447,6 +552,8 @@ def run_scoring( userEnrollment: pd.DataFrame, seed: Optional[int] = None, pseudoraters: Optional[bool] = True, + enabledScorers: Optional[Set[Scorers]] = None, + strictColumns: bool = True, ): """Invokes note scoring algorithms, merges results and computes user stats. @@ -456,6 +563,8 @@ def run_scoring( userEnrollment (pd.DataFrame): The enrollment state for each contributor seed (int, optional): if not None, base distinct seeds for the first and second MF rounds on this value pseudoraters (bool, optional): if True, compute optional pseudorater confidence intervals + enabledScorers (Set[Scorers], optional): Scorers which should be instantiated + strictColumns (bool, optional): if True, validate which columns are present Returns: Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: @@ -465,7 +574,7 @@ def run_scoring( auxiliaryNoteInfo: one row per note containing adjusted and ratio tag values """ # Apply individual scoring models and obtained merged result. - scorers = _get_scorers(seed, pseudoraters) + scorers = _get_scorers(seed, pseudoraters, enabledScorers) scoredNotes, helpfulnessScores, auxiliaryNoteInfo = _run_scorers( scorers, ratings, noteStatusHistory, userEnrollment ) @@ -479,7 +588,10 @@ def run_scoring( # Assign final status to notes based on individual model scores and note attributes. 
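Note on the run_scoring.py changes above: meta-scoring rules are now attached only for models that actually ran, and the core rule's dependency moves from EXPANSION_MODEL to META_INITIAL_NMR (likewise for the additive coverage rule, which is attached only when MFCoverageScorer is explicitly enabled), so a reduced set of scorers still yields a valid rule graph. From Python, the equivalent of the new CLI flags would be roughly the following, mirroring the argument and return names used in main.py and in the run_scoring signature above:

  from scoring.enums import Scorers
  scoredNotes, helpfulnessScores, newStatus, auxNoteInfo = run_scoring(
    ratings, noteStatusHistory, userEnrollment,
    enabledScorers={Scorers.MFCoreScorer}, strictColumns=False,
  )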
@@ -479,7 +588,10 @@ def run_scoring(
 
   # Assign final status to notes based on individual model scores and note attributes.
   scoredNotesCols, auxiliaryNoteInfoCols = meta_score(
-    scoredNotes, auxiliaryNoteInfo, noteStatusHistory[[c.noteIdKey, c.lockedStatusKey]]
+    scoredNotes,
+    auxiliaryNoteInfo,
+    noteStatusHistory[[c.noteIdKey, c.lockedStatusKey]],
+    enabledScorers,
   )
   scoredNotes = scoredNotes.merge(scoredNotesCols, on=c.noteIdKey)
   auxiliaryNoteInfo = auxiliaryNoteInfo.merge(auxiliaryNoteInfoCols, on=c.noteIdKey)
@@ -505,5 +617,9 @@ def run_scoring(
     noteStatusHistory
   ), "noteStatusHistory should contain all notes after preprocessing"
 
-  # Finalize output dataframes with correct columns.
-  return _validate(scoredNotes, helpfulnessScores, newNoteStatusHistory, auxiliaryNoteInfo)
+  # Skip validation and selection of output columns if the set of scorers is overridden.
+  if strictColumns:
+    scoredNotes, helpfulnessScores, newNoteStatusHistory, auxiliaryNoteInfo = _validate(
+      scoredNotes, helpfulnessScores, newNoteStatusHistory, auxiliaryNoteInfo
+    )
+  return scoredNotes, helpfulnessScores, newNoteStatusHistory, auxiliaryNoteInfo
diff --git a/sourcecode/scoring/scorer.py b/sourcecode/scoring/scorer.py
index 20d0ff8e..5353c72c 100644
--- a/sourcecode/scoring/scorer.py
+++ b/sourcecode/scoring/scorer.py
@@ -103,6 +103,7 @@ def score(
     noteScores, userScores = self._score_notes_and_users(ratings, noteStatusHistory)
     noteScores = noteScores.rename(columns=self._get_note_col_mapping())
     userScores = userScores.rename(columns=self._get_user_col_mapping())
+    # TODO: Tolerate unexpected columns if --nostrict-columns is set.
     # Process noteScores
     noteScores = noteScores.drop(columns=self._get_dropped_note_cols())
     assert set(noteScores.columns) == set(
@@ -112,7 +113,12 @@ def score(
       Missing expected columns that should've been in noteScores: {set(self.get_scored_notes_cols() + self.get_auxiliary_note_info_cols()) - set(noteScores.columns)}"""
     # Process userScores
     userScores = userScores.drop(columns=self._get_dropped_user_cols())
-    assert set(userScores.columns) == set(self.get_helpfulness_scores_cols())
+    assert set(userScores.columns) == set(
+      self.get_helpfulness_scores_cols()
+    ), f"""all columns must be either dropped or explicitly defined in an output.
+      Extra columns that were in userScores: {set(userScores.columns) - set(self.get_helpfulness_scores_cols())}
+      Missing expected columns that should've been in userScores: {set(self.get_helpfulness_scores_cols()) - set(userScores.columns)}"""
+
     # Return dataframes with specified columns in specified order
     return (
       noteScores[self.get_scored_notes_cols()],
diff --git a/sourcecode/scoring/scoring_rules.py b/sourcecode/scoring/scoring_rules.py
index 583b3b1a..319c8fe7 100644
--- a/sourcecode/scoring/scoring_rules.py
+++ b/sourcecode/scoring/scoring_rules.py
@@ -28,6 +28,7 @@ class RuleID(Enum):
   NM_CRNH = RuleAndVersion("NmCRNH", "1.0", False)
   GENERAL_CRH_INERTIA = RuleAndVersion("GeneralCRHInertia", "1.0", False)
   ELEVATED_CRH_INERTIA = RuleAndVersion("ElevatedCRHInertia", "1.0", False)
+  LCB_INERTIA = RuleAndVersion("LcbCRHInertia", "1.0", False)
   INCORRECT_OUTLIER = RuleAndVersion("FilterIncorrect", "1.0", False)
 
   # Rules used in _meta_score.
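Note on the final run_scoring.py and scorer.py changes above: when strictColumns is False the _validate step is skipped, so the returned frames may carry extra or reordered columns, and the TODO added in scorer.py tracks relaxing the per-scorer column asserts in the same situation. Downstream consumers that still want fixed columns under --nostrict-columns could subset defensively; a sketch with a hypothetical expectedCols list, not a helper defined in this patch:

  expectedCols = [...]  # whatever column set the consumer requires
  scoredNotes = scoredNotes[[col for col in expectedCols if col in scoredNotes.columns]]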