Algorithm updates #264

Merged
merged 1 commit into from Sep 17, 2024
19 changes: 12 additions & 7 deletions sourcecode/scoring/matrix_factorization/matrix_factorization.py
@@ -30,7 +30,6 @@ def __init__(
     numFactors=1,
     useGlobalIntercept=True,
     log=True,
-    flipFactorsForIdentification=True,
     model: Optional[BiasedMatrixFactorization] = None,
     featureCols: List[str] = [c.noteIdKey, c.raterParticipantIdKey],
     labelCol: str = c.helpfulNumKey,
@@ -51,7 +50,6 @@ def __init__(
     self._numFactors = numFactors
     self._useGlobalIntercept = useGlobalIntercept
     self._log = log
-    self._flipFactorsForIdentification = flipFactorsForIdentification
     self._featureCols = featureCols
     self._labelCol = labelCol
     self._useSigmoidCrossEntropy = useSigmoidCrossEntropy
@@ -103,7 +101,6 @@ def get_new_mf_with_same_args(self):
       numFactors=self._numFactors,
       useGlobalIntercept=self._useGlobalIntercept,
       log=self._log,
-      flipFactorsForIdentification=self._flipFactorsForIdentification,
       model=None,
       featureCols=self._featureCols,
       labelCol=self._labelCol,
@@ -225,7 +222,9 @@ def _initialize_parameters(
         torch.ones(1, 1, dtype=torch.float32) * globalInterceptInit
       )

-  def _get_parameters_from_trained_model(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
+  def _get_parameters_from_trained_model(
+    self, flipFactorsForIdentification: bool = True
+  ) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     Returns:
       Tuple[pd.DataFrame, pd.DataFrame]: noteIdMap, raterIdMap
@@ -247,7 +246,7 @@ def _get_parameters_from_trained_model(self) -> Tuple[pd.DataFrame, pd.DataFrame
         :, i
       ]

-    if self._flipFactorsForIdentification:
+    if flipFactorsForIdentification:
       noteParams, raterParams = self._flip_factors_for_identification(noteParams, raterParams)

     return noteParams, raterParams
@@ -497,10 +496,12 @@ def run_mf(
     globalInterceptInit: Optional[float] = None,
     specificNoteId: Optional[int] = None,
     validatePercent: Optional[float] = None,
+    freezeNoteParameters: bool = False,
     freezeRaterParameters: bool = False,
     freezeGlobalParameters: bool = False,
     ratingPerNoteLossRatio: Optional[float] = None,
     ratingPerUserLossRatio: Optional[float] = None,
+    flipFactorsForIdentification: bool = True,
   ):
     """Train matrix factorization model.
@@ -546,13 +547,15 @@
       self.mf_model._freeze_parameters(set({"user"}))
     if freezeGlobalParameters:
       self.mf_model._freeze_parameters(set({"global"}))
+    if freezeNoteParameters:
+      self.mf_model._freeze_parameters(set({"note"}))
     if specificNoteId is not None:
       self.mf_model.freeze_rater_and_global_parameters()
     self.prepare_features_and_labels(specificNoteId)

     train_loss, loss, validate_loss = self._fit_model(validatePercent)
     if self._normalizedLossHyperparameters is not None:
-      _, raterParams = self._get_parameters_from_trained_model()
+      _, raterParams = self._get_parameters_from_trained_model(flipFactorsForIdentification)
       assert self.modelData is not None
       self._lossModule = NormalizedLoss(
         self.criterion,
@@ -575,7 +578,9 @@
     if self._log:
       logger.info(f"Global Intercept: {globalIntercept}")

-    fitNoteParams, fitRaterParams = self._get_parameters_from_trained_model()
+    fitNoteParams, fitRaterParams = self._get_parameters_from_trained_model(
+      flipFactorsForIdentification
+    )

     fitRaterParams.drop(Constants.raterIndexKey, axis=1, inplace=True)
     if validatePercent is None:
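Context for the change above: the flip is now requested when parameters are read out (an argument to _get_parameters_from_trained_model and run_mf) instead of being fixed at construction time. The flip exists because the factor signs in a one-factor matrix factorization are not identified: negating both the note and rater factor vectors leaves every prediction unchanged, so a sign convention has to be imposed after training. A minimal standalone illustration of that indeterminacy in plain numpy (not the repository's _flip_factors_for_identification implementation):

import numpy as np

rng = np.random.default_rng(0)
noteFactors = rng.normal(size=5)
raterFactors = rng.normal(size=7)
noteBias = rng.normal(size=5)
raterBias = rng.normal(size=7)
globalIntercept = 0.1

# Biased one-factor MF prediction: intercept + noteBias + raterBias + noteFactor * raterFactor.
pred = globalIntercept + noteBias[:, None] + raterBias[None, :] + np.outer(noteFactors, raterFactors)

# Negating both factor vectors produces exactly the same predictions, which is
# why a post-hoc sign convention ("flip for identification") is needed at all.
predFlipped = globalIntercept + noteBias[:, None] + raterBias[None, :] + np.outer(-noteFactors, -raterFactors)

assert np.allclose(pred, predFlipped)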
8 changes: 6 additions & 2 deletions sourcecode/scoring/matrix_factorization/pseudo_raters.py
@@ -87,7 +87,9 @@ def _check_note_parameters_same(self, newMatrixFactorizatio
     (
       noteParamsFromNewModel,
       raterParamsFromNewModel,
-    ) = newMatrixFactorization._get_parameters_from_trained_model()
+    ) = newMatrixFactorization._get_parameters_from_trained_model(
+      flipFactorsForIdentification=False
+    )
     assert (noteParamsFromNewModel == self.noteParams).all().all()

   def _make_extreme_raters(self, raterParams: pd.DataFrame, raterIdMap: pd.DataFrame):
@@ -206,7 +208,9 @@ def _fit_all_notes_with_raters_constant(self, ratingFeaturesAndLabelsWithExtreme
     if self._checkParamsSame:
       self._check_rater_parameters_same(newExtremeMF)

-    fitNoteParams, fitRaterParams = newExtremeMF._get_parameters_from_trained_model()
+    fitNoteParams, fitRaterParams = newExtremeMF._get_parameters_from_trained_model(
+      flipFactorsForIdentification=False
+    )
     return fitNoteParams

   def _create_extreme_ratings(self):
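The flipFactorsForIdentification=False calls matter here because _check_note_parameters_same compares the refit note parameters against the originals with exact equality; applying the sign convention to only one of the two fits would make two equivalent factorizations compare unequal. A toy sketch of that failure mode, using minimal hypothetical DataFrames rather than the scorer's full schema:

import pandas as pd

frozenNoteParams = pd.DataFrame({"noteId": [1, 2], "internalNoteFactor1": [0.4, -0.2]})
refitNoteParams = frozenNoteParams.copy()

# Same orientation: an element-wise equality check like the assert above passes.
assert (refitNoteParams == frozenNoteParams).all().all()

# An equivalent model whose factors happen to carry the opposite sign convention
# fails the same check, even though its predictions are identical.
flippedNoteParams = refitNoteParams.assign(
  internalNoteFactor1=-refitNoteParams["internalNoteFactor1"]
)
assert not (flippedNoteParams == frozenNoteParams).all().all()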
3 changes: 3 additions & 0 deletions sourcecode/scoring/mf_base_scorer.py
@@ -907,6 +907,7 @@ def _score_notes_and_users(
     prescoringNoteModelOutput: pd.DataFrame,
     prescoringRaterModelOutput: pd.DataFrame,
     prescoringMetaScorerOutput: c.PrescoringMetaScorerOutput,
+    flipFactorsForIdentification: bool = False,
   ) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Run the "final" matrix factorization scoring algorithm.
     Accepts prescoring's output as its input, as well as the new ratings and note status history.
@@ -983,6 +984,7 @@
         freezeGlobalParameters=True,
         ratingPerNoteLossRatio=prescoringMetaScorerOutput.finalRoundNumRatings
         / prescoringMetaScorerOutput.finalRoundNumNotes,
+        flipFactorsForIdentification=flipFactorsForIdentification,
       )

     if self._saveIntermediateState:
@@ -1139,6 +1141,7 @@ def score_final(self, scoringArgs: c.FinalScoringArgs) -> c.ModelResult:
         prescoringNoteModelOutput=prescoringNoteModelOutput,
         prescoringRaterModelOutput=prescoringRaterModelOutput,
         prescoringMetaScorerOutput=prescoringMetaScorerOutput,
+        flipFactorsForIdentification=False,
       )

     with self.time_block("Postprocess output"):
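A plausible reading of why the final scoring round passes flipFactorsForIdentification=False (an interpretation, not something stated in this diff): the final round freezes rater and global parameters at the orientation they were given during prescoring, and re-applying the sign convention when parameters are read out would flip the factors away from that orientation even though predictions are unchanged. A small numpy illustration with hypothetical factor values:

import numpy as np

prescoringRaterFactors = np.array([0.8, -0.3, 0.1])  # frozen at their prescoring orientation
finalRoundNoteFactors = np.array([0.5, -0.7])         # refit in the final round

# Flipping both sides leaves every prediction term unchanged ...
interaction = np.outer(finalRoundNoteFactors, prescoringRaterFactors)
interactionFlipped = np.outer(-finalRoundNoteFactors, -prescoringRaterFactors)
assert np.allclose(interaction, interactionFlipped)

# ... but the emitted rater factors would then disagree in sign with the
# prescoring output they were frozen from, so the flip is skipped here.
assert not np.allclose(-prescoringRaterFactors, prescoringRaterFactors)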
4 changes: 4 additions & 0 deletions sourcecode/scoring/note_status_history.py
@@ -157,6 +157,10 @@ def _update_single_note_status_history(mergedNote, currentTimeMillis, newScoredN
     mergedNote[c.lockedStatusKey] = mergedNote[c.finalRatingStatusKey]
     mergedNote[c.timestampMillisOfStatusLockKey] = currentTimeMillis

+  # Clear timestampMillisOfNmrDueToMinStableCrhTimeKey if the note is locked.
+  if pd.notna(mergedNote[c.lockedStatusKey]):
+    mergedNote[c.timestampMillisOfNmrDueToMinStableCrhTimeKey] = -1
+
   if pd.isna(mergedNote[c.createdAtMillisKey + newScoredNotesSuffix]):
     # note used to be scored but isn't now; just retain old info
     return mergedNote
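The added block in _update_single_note_status_history resets the NMR-due-to-min-stable-CRH-time timestamp to -1 once a note's status is locked. A toy pandas sketch of the same rule applied across a frame, with column names echoing the constants above (the exact schema here is an assumption):

import pandas as pd

notes = pd.DataFrame({
  "lockedStatus": ["CURRENTLY_RATED_HELPFUL", None],
  "timestampMillisOfNmrDueToMinStableCrhTime": [1726500000000, 1726500000000],
})

# Mirror the added lines: clear the stabilization timestamp only where a locked status exists.
locked = notes["lockedStatus"].notna()
notes.loc[locked, "timestampMillisOfNmrDueToMinStableCrhTime"] = -1

# The locked note is reset to -1; the unlocked note keeps its original timestamp.
print(notes)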
5 changes: 3 additions & 2 deletions sourcecode/scoring/pandas_utils.py
@@ -56,10 +56,11 @@ def get_df_info(
     .reset_index(drop=False)
     .rename(columns={"index": "column", 0: "RAM"})
   )
+  ramBytes = stats["RAM"].sum()
   if name is not None:
-    lines = [f"""{name} total RAM: {stats["RAM"].sum()}"""]
+    lines = [f"""{name} total RAM: {ramBytes} bytes ({ramBytes * 1e-9:.3f} GB)"""]
   else:
-    lines = [f"""total RAM: {stats["RAM"].sum()}"""]
+    lines = [f"""total RAM: {ramBytes} bytes ({ramBytes * 1e-9:.3f} GB)"""]
   lines.extend(str(stats).split("\n"))
   if counter:
     for col, dtype in zip(stats["column"], stats["dtype"]):
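For reference, the updated RAM line in get_df_info can be exercised on its own. A minimal sketch with a toy DataFrame (not the scorer's data); pandas' memory_usage(deep=True) reports bytes per column, which is what the RAM column built above sums:

import pandas as pd

df = pd.DataFrame({"noteId": range(100_000), "score": [0.5] * 100_000})

# Total bytes, then the same value in (decimal) gigabytes, formatted as in the new output.
ramBytes = df.memory_usage(deep=True).sum()
print(f"total RAM: {ramBytes} bytes ({ramBytes * 1e-9:.3f} GB)")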