Multiple scorer updates, mostly aimed at reducing note status flipping #144

Merged · 1 commit · Aug 28, 2023
18 changes: 13 additions & 5 deletions sourcecode/main.py
@@ -7,15 +7,15 @@
--notes data/notes-00000.tsv \
--ratings data/ratings-00000.tsv \
--status data/noteStatusHistory-00000.tsv \
--outdir data/scored_notes.tsv
--outdir data
"""

import argparse
import os

import scoring.constants as c
from scoring.enums import scorers_from_csv
from scoring.process_data import get_data, write_tsv_local
from scoring.process_data import LocalDataLoader, write_tsv_local
from scoring.run_scoring import run_scoring


@@ -82,6 +82,13 @@ def parse_args():
dest="strict_columns",
)
parser.set_defaults(strict_columns=True)
parser.add_argument(
"--parallel",
help="Disable parallel run of algorithm.",
action="store_true",
dest="parallel",
)
parser.set_defaults(parallel=False)

return parser.parse_args()

@@ -93,9 +100,8 @@ def main():
c.epochMillis = args.epoch_millis

# Load input dataframes.
_, ratings, statusHistory, userEnrollment = get_data(
args.notes, args.ratings, args.status, args.enrollment, args.headers
)
dataLoader = LocalDataLoader(args.notes, args.ratings, args.status, args.enrollment, args.headers)
_, ratings, statusHistory, userEnrollment = dataLoader.get_data()

# Invoke scoring and user contribution algorithms.
scoredNotes, helpfulnessScores, newStatus, auxNoteInfo = run_scoring(
@@ -106,6 +112,8 @@
pseudoraters=args.pseudoraters,
enabledScorers=args.scorers,
strictColumns=args.strict_columns,
runParallel=args.parallel,
dataLoader=dataLoader if args.parallel == True else None,
)

# Write outputs to local disk.
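For orientation, here is a minimal sketch of how main.py flows after this change: the LocalDataLoader is built once up front, and is only handed to run_scoring when --parallel is set, presumably so that parallel scorers can reload the input TSVs in worker processes. Anything not visible in the diff (the positional arguments, other keywords, and the output writing) is inferred or elided.

```python
# Sketch only: abridged main() after this PR. LocalDataLoader and run_scoring are
# the imports shown in the diff; argument names outside the diff are inferred.
from scoring.process_data import LocalDataLoader
from scoring.run_scoring import run_scoring

def main(args):
  # Build the loader once; get_data() yields notes, ratings, status history and
  # user enrollment (the notes frame is unused here, as in the diff).
  dataLoader = LocalDataLoader(
    args.notes, args.ratings, args.status, args.enrollment, args.headers
  )
  _, ratings, statusHistory, userEnrollment = dataLoader.get_data()

  scoredNotes, helpfulnessScores, newStatus, auxNoteInfo = run_scoring(
    ratings,
    statusHistory,
    userEnrollment,
    pseudoraters=args.pseudoraters,
    enabledScorers=args.scorers,
    strictColumns=args.strict_columns,
    runParallel=args.parallel,
    # The loader is only passed along for parallel runs; serial runs keep the old path.
    dataLoader=dataLoader if args.parallel else None,
  )
  # Outputs are then written to args.outdir via write_tsv_local (not shown).
```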
2 changes: 1 addition & 1 deletion sourcecode/scoring/constants.py
@@ -16,7 +16,7 @@

maxTrainError = 0.09

coreFlipPct = 0.175
coreFlipPct = 0.09
expansionFlipPct = 0.19
maxReruns = 5

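The constants.py change tightens coreFlipPct from 0.175 to 0.09. Together with maxReruns, these values appear to bound how large a fraction of core-scope notes may change status between reruns before the run is retried, so lowering the threshold makes the core scorer less tolerant of status flipping. The guard below is illustrative only, an assumption about how such a constant would be used rather than the repository's actual check.

```python
# Illustrative only: a flip-rate guard of the kind coreFlipPct/maxReruns suggest.
# This is not the repository's actual logic.
coreFlipPct = 0.09  # lowered from 0.175 in this PR; maxReruns (5) caps retries

def too_many_flips(prevStatus: dict, newStatus: dict, flipPct: float = coreFlipPct) -> bool:
  """Return True when the share of notes whose status changed exceeds flipPct."""
  common = prevStatus.keys() & newStatus.keys()
  if not common:
    return False
  flips = sum(prevStatus[noteId] != newStatus[noteId] for noteId in common)
  return flips / len(common) > flipPct
```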
13 changes: 9 additions & 4 deletions sourcecode/scoring/matrix_factorization/matrix_factorization.py
@@ -132,7 +132,8 @@ def _initialize_parameters(
"""
assert self.mf_model is not None
if noteInit is not None:
print("initializing notes")
if self._logging:
print("initializing notes")
noteInit = self.noteIdMap.merge(noteInit, on=c.noteIdKey, how="left")
self.mf_model.note_intercepts.weight.data = torch.tensor(
np.expand_dims(noteInit[c.internalNoteInterceptKey].astype(np.float32).values, axis=1)
@@ -144,7 +145,8 @@
)

if userInit is not None:
print("initializing users")
if self._logging:
print("initializing users")
userInit = self.raterIdMap.merge(userInit, on=c.raterParticipantIdKey, how="left")
self.mf_model.user_intercepts.weight.data = torch.tensor(
np.expand_dims(userInit[c.internalRaterInterceptKey].astype(np.float32).values, axis=1)
@@ -156,7 +158,8 @@
)

if globalInterceptInit is not None:
print("initialized global intercept")
if self._logging:
print("initialized global intercept")
self.mf_model.global_intercept = torch.nn.parameter.Parameter(
torch.ones(1, 1) * globalInterceptInit
)
@@ -214,7 +217,8 @@ def _create_mf_model(
) # smaller learning rate
else:
self.optimizer = torch.optim.Adam(self.mf_model.parameters(), lr=self._noInitLearningRate)
print(self.mf_model.device)
if self._logging:
print(self.mf_model.device)
self.mf_model.to(self.mf_model.device)

def _instantiate_biased_mf_model(self):
@@ -225,6 +229,7 @@ def _instantiate_biased_mf_model(self):
n_notes,
use_global_intercept=self._useGlobalIntercept,
n_factors=self._numFactors,
logging=self._logging,
)
if self._logging:
print("------------------")
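The matrix_factorization.py changes all follow one pattern: raw print calls are gated behind the ranker's existing _logging flag, and that flag is now forwarded when the biased MF model is constructed, keeping quiet (for example, parallel) runs from interleaving output across scorers. A compressed sketch of the pattern, with everything outside the diff elided:

```python
# Compressed sketch of the logging gate used throughout this file; the real
# methods also copy the supplied intercepts/factors into the torch model.
class MatrixFactorizationSketch:
  def __init__(self, logging: bool = True):
    self._logging = logging

  def _initialize_parameters(self, noteInit=None, userInit=None, globalInterceptInit=None):
    if noteInit is not None:
      if self._logging:
        print("initializing notes")
      # ... merge noteInit into the note id map and copy weights ...
    if userInit is not None:
      if self._logging:
        print("initializing users")
      # ... merge userInit into the rater id map and copy weights ...
    if globalInterceptInit is not None:
      if self._logging:
        print("initialized global intercept")
      # ... set the global intercept parameter ...
```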
6 changes: 5 additions & 1 deletion sourcecode/scoring/matrix_factorization/model.py
@@ -20,6 +20,7 @@ def __init__(
n_notes: int,
n_factors: int = 1,
use_global_intercept: bool = True,
logging: bool = True,
) -> None:
"""Initialize matrix factorization model using xavier_uniform for factors
and zeros for intercepts.
@@ -32,6 +33,8 @@
"""
super().__init__()

self._logging = logging

self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=False)
self.note_factors = torch.nn.Embedding(n_notes, n_factors, sparse=False)

@@ -78,5 +81,6 @@ def _freeze_parameters(self, words_to_freeze: set):
for name, param in self.named_parameters():
for word in words_to_freeze:
if word in name:
print("Freezing parameter: ", name)
if self._logging:
print("Freezing parameter: ", name)
param.requires_grad_(False)
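model.py gets the matching change: the torch module now takes a logging flag at construction (threaded through from _instantiate_biased_mf_model above) and uses it to silence the "Freezing parameter" messages. Below is a cut-down, runnable analogue assuming only what the diff shows; the real module also has intercept embeddings, a global intercept, and a forward pass.

```python
import torch

class TinyBiasedMF(torch.nn.Module):
  """Cut-down analogue of the model.py module: factor embeddings plus a logging flag."""

  def __init__(self, n_users: int, n_notes: int, n_factors: int = 1, logging: bool = True):
    super().__init__()
    self._logging = logging
    self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=False)
    self.note_factors = torch.nn.Embedding(n_notes, n_factors, sparse=False)

  def _freeze_parameters(self, words_to_freeze: set):
    # Freeze any parameter whose name contains one of the given words, printing
    # only when logging was enabled at construction time.
    for name, param in self.named_parameters():
      if any(word in name for word in words_to_freeze):
        if self._logging:
          print("Freezing parameter: ", name)
        param.requires_grad_(False)

model = TinyBiasedMF(n_users=10, n_notes=5, logging=False)
model._freeze_parameters({"note_factors"})  # silent because logging=False
```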
79 changes: 47 additions & 32 deletions sourcecode/scoring/matrix_factorization/pseudo_raters.py
@@ -28,9 +28,11 @@ def __init__(
globalBias: float,
mfRanker: MatrixFactorization,
logging=True,
checkParamsSame=True,
):
self._logging = logging
self._mfRanker = mfRanker
self._checkParamsSame = checkParamsSame
self.ratings = ratings
(
self.noteIdMap,
@@ -43,9 +45,12 @@

def compute_note_parameter_confidence_bounds_with_pseudo_raters(self):
self._make_extreme_raters(self.raterParams, self.raterIdMap)
self._add_extreme_raters_to_id_maps_and_params()
self._create_extreme_ratings()

notesWithConfidenceBounds = self._fit_note_params_for_each_dataset_with_extreme_ratings()
noteParamsList = self._fit_note_params_for_each_dataset_with_extreme_ratings()

notesWithConfidenceBounds = self._aggregate_note_params(noteParamsList)
return self.noteParams.merge(
notesWithConfidenceBounds.reset_index(), on=c.noteIdKey, how="left"
)
@@ -170,8 +175,9 @@ def _create_new_model_with_extreme_raters_from_original_params(
lr=newExtremeMF._initLearningRate,
)

self._check_note_parameters_same(newExtremeMF)
self._check_rater_parameters_same(newExtremeMF)
if self._checkParamsSame:
self._check_note_parameters_same(newExtremeMF)
self._check_rater_parameters_same(newExtremeMF)

return newExtremeMF

@@ -183,15 +189,15 @@ def _fit_all_notes_with_raters_constant(self, ratingFeaturesAndLabelsWithExtreme
newExtremeMF._fit_model()

# Double check that we kept rater parameters fixed during re-training of note parameters.
self._check_rater_parameters_same(newExtremeMF)
if self._checkParamsSame:
self._check_rater_parameters_same(newExtremeMF)

fitNoteParams, fitRaterParams = newExtremeMF._get_parameters_from_trained_model()
return fitNoteParams

def _fit_note_params_for_each_dataset_with_extreme_ratings(self, joinOrig=False):
self._add_extreme_raters_to_id_maps_and_params()
extremeRatingsToAddWithoutNotes = []
extremeRatingsToAddWithoutNotes.append(
def _create_extreme_ratings(self):
self.extremeRatingsToAddWithoutNotes = []
self.extremeRatingsToAddWithoutNotes.append(
{
c.internalRaterInterceptKey: None,
c.internalRaterFactor1Key: None,
@@ -203,37 +209,44 @@ def _fit_note_params_for_each_dataset_with_extreme_ratings(self, joinOrig=False)

for helpfulNum in (0.0, 1.0):
extremeRater[c.helpfulNumKey] = helpfulNum
extremeRatingsToAddWithoutNotes.append(extremeRater.copy())
self.extremeRatingsToAddWithoutNotes.append(extremeRater.copy())

def _create_dataset_with_extreme_rating_on_each_note(self, ratingToAddWithoutNoteId):
## for each rating (ided by raterParticipantId and raterIndex)
if ratingToAddWithoutNoteId[c.helpfulNumKey] is not None:
ratingsWithNoteIds = []
for i, noteRow in (
self.ratingFeaturesAndLabels[[c.noteIdKey, mf_c.noteIndexKey]].drop_duplicates().iterrows()
):
ratingToAdd = ratingToAddWithoutNoteId.copy()
ratingToAdd[c.noteIdKey] = noteRow[c.noteIdKey]
ratingToAdd[mf_c.noteIndexKey] = noteRow[mf_c.noteIndexKey]
ratingsWithNoteIds.append(ratingToAdd)
extremeRatingsToAdd = pd.DataFrame(ratingsWithNoteIds).drop(
[c.internalRaterInterceptKey, c.internalRaterFactor1Key], axis=1
)
ratingFeaturesAndLabelsWithExtremeRatings = pd.concat(
[self.ratingFeaturesAndLabels, extremeRatingsToAdd]
)
else:
ratingFeaturesAndLabelsWithExtremeRatings = self.ratingFeaturesAndLabels
return ratingFeaturesAndLabelsWithExtremeRatings

def _fit_note_params_for_each_dataset_with_extreme_ratings(self):
noteParamsList = []
for ratingToAddWithoutNoteId in extremeRatingsToAddWithoutNotes:
## for each rating (ided by raterParticipantId and raterIndex)
if ratingToAddWithoutNoteId[c.helpfulNumKey] is not None:
ratingsWithNoteIds = []
for i, noteRow in (
self.ratingFeaturesAndLabels[[c.noteIdKey, mf_c.noteIndexKey]]
.drop_duplicates()
.iterrows()
):
ratingToAdd = ratingToAddWithoutNoteId.copy()
ratingToAdd[c.noteIdKey] = noteRow[c.noteIdKey]
ratingToAdd[mf_c.noteIndexKey] = noteRow[mf_c.noteIndexKey]
ratingsWithNoteIds.append(ratingToAdd)
extremeRatingsToAdd = pd.DataFrame(ratingsWithNoteIds).drop(
[c.internalRaterInterceptKey, c.internalRaterFactor1Key], axis=1
)
ratingFeaturesAndLabelsWithExtremeRatings = pd.concat(
[self.ratingFeaturesAndLabels, extremeRatingsToAdd]
)
else:
ratingFeaturesAndLabelsWithExtremeRatings = self.ratingFeaturesAndLabels
for ratingToAddWithoutNoteId in self.extremeRatingsToAddWithoutNotes:
ratingFeaturesAndLabelsWithExtremeRatings = (
self._create_dataset_with_extreme_rating_on_each_note(ratingToAddWithoutNoteId)
)

if self._logging:
print("------------------")
print(f"Re-scoring all notes with extra rating added: {ratingToAddWithoutNoteId}")

fitNoteParams = self._fit_all_notes_with_raters_constant(
ratingFeaturesAndLabelsWithExtremeRatings
)

fitNoteParams[Constants.extraRaterInterceptKey] = ratingToAddWithoutNoteId[
c.internalRaterInterceptKey
]
@@ -242,7 +255,9 @@ def _fit_note_params_for_each_dataset_with_extreme_ratings(self, joinOrig=False)
]
fitNoteParams[Constants.extraRatingHelpfulNumKey] = ratingToAddWithoutNoteId[c.helpfulNumKey]
noteParamsList.append(fitNoteParams)
return noteParamsList

def _aggregate_note_params(self, noteParamsList, joinOrig=False):
rawRescoredNotesWithEachExtraRater = pd.concat(noteParamsList)
rawRescoredNotesWithEachExtraRater.drop(mf_c.noteIndexKey, axis=1, inplace=True)
rawRescoredNotesWithEachExtraRater = rawRescoredNotesWithEachExtraRater.sort_values(
@@ -278,9 +293,9 @@ def _fit_note_params_for_each_dataset_with_extreme_ratings(self, joinOrig=False)
raterFacs[Constants.allKey] = 1
raterFacs[Constants.negFacKey] = raterFacs[c.internalRaterFactor1Key] < 0
raterFacs[Constants.posFacKey] = raterFacs[c.internalRaterFactor1Key] > 0
r = raterFacs.groupby(c.noteIdKey).sum()[
r = raterFacs.groupby(c.noteIdKey)[
[Constants.allKey, Constants.negFacKey, Constants.posFacKey]
]
].sum()
r.columns = pd.MultiIndex.from_product([[c.ratingCountKey], r.columns])
notesWithConfidenceBounds = notesWithConfidenceBounds.join(r)

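Most of pseudo_raters.py is a mechanical refactor: the old monolithic _fit_note_params_for_each_dataset_with_extreme_ratings is split into _create_extreme_ratings, _create_dataset_with_extreme_rating_on_each_note, a slimmer fitting loop, and _aggregate_note_params, and the parameter-equality assertions can now be skipped via the new checkParamsSame flag. One behavioral detail worth noting is the groupby change near the end of the file, which selects the three counter columns before summing rather than summing every column and then selecting. A small self-contained pandas illustration (the column names stand in for the c.* and Constants.* keys):

```python
import pandas as pd

# Stand-in for raterFacs: one row per rating, with counter columns per note.
raterFacs = pd.DataFrame(
  {
    "noteId": [1, 1, 2],
    "internalRaterFactor1": [-0.4, 0.2, 0.1],
    "all": [1, 1, 1],
    "negFac": [True, False, False],
    "posFac": [False, True, True],
  }
)

# Old: raterFacs.groupby("noteId").sum()[["all", "negFac", "posFac"]]
# New: select the columns first, then sum, so only the counters are aggregated.
r = raterFacs.groupby("noteId")[["all", "negFac", "posFac"]].sum()
print(r)
```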
2 changes: 1 addition & 1 deletion sourcecode/scoring/mf_base_scorer.py
@@ -32,7 +32,7 @@ def __init__(
crnhThresholdNoteFactorMultiplier: float = -0.8,
crnhThresholdNMIntercept: float = -0.15,
crnhThresholdUCBIntercept: float = -0.04,
crhThresholdLCBIntercept: float = 0.32,
crhThresholdLCBIntercept: float = 0.35,
crhSuperThreshold: float = 0.5,
inertiaDelta: float = 0.01,
weightedTotalVotes: float = 1.0,
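Finally, mf_base_scorer.py raises crhThresholdLCBIntercept from 0.32 to 0.35. Assuming this threshold is compared against the lower confidence bound on a note's intercept from the pseudo-rater pass, raising it means borderline notes need a higher lower bound before earning CRH status, which again works against status flipping. The check below is illustrative only, not the scorer's actual rule.

```python
# Illustrative only: the kind of comparison the threshold name suggests.
crhThresholdLCBIntercept = 0.35  # raised from 0.32 in this PR

def passes_lcb_gate(noteInterceptLowerBound: float) -> bool:
  """True when the note's lower-bound intercept clears the CRH LCB threshold."""
  return noteInterceptLowerBound >= crhThresholdLCBIntercept
```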