Skip to content

Commit

Permalink
Allow using mean CV score as main score in the leaderboard (#590)
Browse files Browse the repository at this point in the history
  • Loading branch information
rth authored Mar 1, 2024
1 parent 52e5650 commit 97fbb52
Show file tree
Hide file tree
Showing 4 changed files with 178 additions and 99 deletions.
1 change: 1 addition & 0 deletions doc/whats_new/v0.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ Changelog

- AWS runner: better detection of training errors :pr:`591`
- Switch to fetching starting kit repos via HTTP rather than using git clone to avoid being blocked by GitHub :pr:`592`.
- Allow using mean CV score as main score in the leaderboard :pr:`590`.
8 changes: 8 additions & 0 deletions ramp-database/ramp_database/model/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,14 @@ def workflow(self):
event."""
return self.problem.workflow

@property
def leaderboard_score_kind(self):
"""Whether the main leaderboard score should be bagged or not
Valid values are "bag" or "mean" (of the CV folds)
"""
return getattr(self.problem.module, "leaderboard_score_kind", "bag")

@property
def official_score_type(self):
""":class:`ramp_database.model.EventScoreType`: The score type for the
Expand Down
19 changes: 14 additions & 5 deletions ramp-database/ramp_database/tools/leaderboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def _compute_leaderboard(
"""
record_score = []
event = session.query(Event).filter_by(name=event_name).one()
score_kind = event.leaderboard_score_kind
map_score_precision = {
score_type.name: score_type.precision for score_type in event.score_types
}
Expand Down Expand Up @@ -130,7 +131,14 @@ def _compute_leaderboard(
df["submitted at (UTC)"] = df["submitted at (UTC)"].astype("datetime64[s]")

# reordered the column
stats_order = ["bag", "mean", "std"] if leaderboard_type == "private" else ["bag"]
if score_kind == "bag":
stats_order = (
["bag", "mean", "std"] if leaderboard_type == "private" else ["bag"]
)
else:
stats_order = (
["mean", "std", "bag"] if leaderboard_type == "private" else ["mean"]
)
dataset_order = (
["public", "private"] if leaderboard_type == "private" else ["public"]
)
Expand Down Expand Up @@ -166,7 +174,7 @@ def _compute_leaderboard(
df = df.drop(columns=contrib_columns)

df = df.sort_values(
"bag {} {}".format(leaderboard_type, event.official_score_name),
"{} {} {}".format(score_kind, leaderboard_type, event.official_score_name),
ascending=event.get_official_score_type(session).is_lower_the_better,
)

Expand Down Expand Up @@ -202,6 +210,7 @@ def _compute_competition_leaderboard(
event = session.query(Event).filter_by(name=event_name).one()
score_type = event.get_official_score_type(session)
score_name = event.official_score_name
score_kind = event.leaderboard_score_kind

private_leaderboard = _compute_leaderboard(
session, submissions, "private", event_name, with_links=False
Expand All @@ -215,15 +224,15 @@ def _compute_competition_leaderboard(

col_selected_private = (
["team", "submission"]
+ ["bag private " + score_name, "bag public " + score_name]
+ [f"{score_kind} private " + score_name, f"{score_kind} public " + score_name]
+ time_list
+ ["submitted at (UTC)"]
)
leaderboard_df = private_leaderboard[col_selected_private]
leaderboard_df = leaderboard_df.rename(
columns={
"bag private " + score_name: "private " + score_name,
"bag public " + score_name: "public " + score_name,
f"{score_kind} private " + score_name: "private " + score_name,
f"{score_kind} public " + score_name: "public " + score_name,
}
)

Expand Down
249 changes: 155 additions & 94 deletions ramp-database/ramp_database/tools/tests/test_leaderboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,101 +56,10 @@ def session_toy_function(database_connection):
Model.metadata.drop_all(db)


def test_update_leaderboard_functions(session_toy_function):
event_name = "iris_test"
user_name = "test_user"
for leaderboard_type in [
"public",
"private",
"failed",
"public competition",
"private competition",
]:
leaderboard = get_leaderboard(
session_toy_function, leaderboard_type, event_name
)
assert leaderboard is None
leaderboard = get_leaderboard(session_toy_function, "new", event_name)
assert leaderboard

event = get_event(session_toy_function, event_name)
assert event.private_leaderboard_html is None
assert event.public_leaderboard_html_with_links is None
assert event.public_leaderboard_html_no_links is None
assert event.failed_leaderboard_html is None
assert event.public_competition_leaderboard_html is None
assert event.private_competition_leaderboard_html is None
assert event.new_leaderboard_html

event_team = get_event_team_by_name(session_toy_function, event_name, user_name)
assert event_team.leaderboard_html is None
assert event_team.failed_leaderboard_html is None
assert event_team.new_leaderboard_html

event_teams = session_toy_function.query(EventTeam).filter_by(event=event).all()
for et in event_teams:
assert et.leaderboard_html is None
assert et.failed_leaderboard_html is None
assert et.new_leaderboard_html

# run the dispatcher to process the different submissions
config = read_config(database_config_template())
event_config = read_config(ramp_config_template())
dispatcher = Dispatcher(config, event_config, n_workers=-1, hunger_policy="exit")
dispatcher.launch()
session_toy_function.commit()

update_leaderboards(session_toy_function, event_name)
event = get_event(session_toy_function, event_name)
assert event.private_leaderboard_html
assert event.public_leaderboard_html_with_links
assert event.public_leaderboard_html_no_links
assert event.failed_leaderboard_html
assert event.public_competition_leaderboard_html
assert event.private_competition_leaderboard_html
assert event.new_leaderboard_html is None

update_user_leaderboards(session_toy_function, event_name, user_name)
event_team = get_event_team_by_name(session_toy_function, event_name, user_name)
assert event_team.leaderboard_html
assert event_team.failed_leaderboard_html
assert event_team.new_leaderboard_html is None

update_all_user_leaderboards(session_toy_function, event_name)
event_teams = session_toy_function.query(EventTeam).filter_by(event=event).all()
for et in event_teams:
assert et.leaderboard_html
assert et.failed_leaderboard_html
assert et.new_leaderboard_html is None


@pytest.mark.parametrize(
"leaderboard_type, expected_html",
[
("new", not None),
("public", None),
("private", None),
("failed", None),
("public competition", None),
("private competition", None),
],
)
def test_get_leaderboard_only_new_submissions(
session_toy_db, leaderboard_type, expected_html
):
# only check that the submission should be shown as new when the
# dispatcher was not started.
if expected_html is not None:
assert get_leaderboard(session_toy_db, leaderboard_type, "iris_test")
else:
assert (
get_leaderboard(session_toy_db, leaderboard_type, "iris_test")
is expected_html
)


def test_get_leaderboard(session_toy_db):
def test_get_leaderboard(session_toy_function):
"""this test assumes that all the submissions in the database are 'new'"""
session_toy_db = session_toy_function

leaderboard_new = get_leaderboard(session_toy_db, "new", "iris_test")
assert leaderboard_new.count("<tr>") == 6
leaderboard_new = get_leaderboard(session_toy_db, "new", "iris_test", "test_user")
Expand Down Expand Up @@ -288,6 +197,158 @@ def test_get_leaderboard(session_toy_db):
)


def test_update_leaderboard_functions(session_toy_function):
    """Check that the ``update_*`` helpers populate the cached leaderboard HTML.

    Before the dispatcher has trained anything, only the "new" leaderboard
    has content; after training, the situation is reversed for the event,
    the named team, and every team of the event.
    """
    event_name = "iris_test"
    user_name = "test_user"
    # Nothing trained yet: every result leaderboard is empty ...
    for leaderboard_type in [
        "public",
        "private",
        "failed",
        "public competition",
        "private competition",
    ]:
        leaderboard = get_leaderboard(
            session_toy_function, leaderboard_type, event_name
        )
        assert leaderboard is None
    # ... and only the "new" (untrained submissions) leaderboard is filled.
    leaderboard = get_leaderboard(session_toy_function, "new", event_name)
    assert leaderboard

    # The HTML cached on the event mirrors the state above.
    event = get_event(session_toy_function, event_name)
    assert event.private_leaderboard_html is None
    assert event.public_leaderboard_html_with_links is None
    assert event.public_leaderboard_html_no_links is None
    assert event.failed_leaderboard_html is None
    assert event.public_competition_leaderboard_html is None
    assert event.private_competition_leaderboard_html is None
    assert event.new_leaderboard_html

    # Same for the HTML cached on the requesting user's team ...
    event_team = get_event_team_by_name(session_toy_function, event_name, user_name)
    assert event_team.leaderboard_html is None
    assert event_team.failed_leaderboard_html is None
    assert event_team.new_leaderboard_html

    # ... and on every team registered for the event.
    event_teams = session_toy_function.query(EventTeam).filter_by(event=event).all()
    for et in event_teams:
        assert et.leaderboard_html is None
        assert et.failed_leaderboard_html is None
        assert et.new_leaderboard_html

    # run the dispatcher to process the different submissions
    config = read_config(database_config_template())
    event_config = read_config(ramp_config_template())
    dispatcher = Dispatcher(config, event_config, n_workers=-1, hunger_policy="exit")
    dispatcher.launch()
    session_toy_function.commit()

    # After training, every result leaderboard exists and "new" is cleared.
    update_leaderboards(session_toy_function, event_name)
    event = get_event(session_toy_function, event_name)
    assert event.private_leaderboard_html
    assert event.public_leaderboard_html_with_links
    assert event.public_leaderboard_html_no_links
    assert event.failed_leaderboard_html
    assert event.public_competition_leaderboard_html
    assert event.private_competition_leaderboard_html
    assert event.new_leaderboard_html is None

    # Per-user refresh updates only the named user's team cache.
    update_user_leaderboards(session_toy_function, event_name, user_name)
    event_team = get_event_team_by_name(session_toy_function, event_name, user_name)
    assert event_team.leaderboard_html
    assert event_team.failed_leaderboard_html
    assert event_team.new_leaderboard_html is None

    # Bulk refresh updates every team of the event.
    update_all_user_leaderboards(session_toy_function, event_name)
    event_teams = session_toy_function.query(EventTeam).filter_by(event=event).all()
    for et in event_teams:
        assert et.leaderboard_html
        assert et.failed_leaderboard_html
        assert et.new_leaderboard_html is None


@pytest.mark.parametrize(
    "leaderboard_type, expected_html",
    [
        # `True` (was the confusing literal `not None`, which evaluates to
        # True anyway) marks the one leaderboard expected to have content.
        ("new", True),
        ("public", None),
        ("private", None),
        ("failed", None),
        ("public competition", None),
        ("private competition", None),
    ],
)
def test_get_leaderboard_only_new_submissions(
    session_toy_db, leaderboard_type, expected_html
):
    """Untrained submissions appear only in the "new" leaderboard.

    Since the dispatcher was never started, every submission is still
    'new': all other leaderboard types must return ``None``.
    """
    # only check that the submission should be shown as new when the
    # dispatcher was not started.
    if expected_html is not None:
        assert get_leaderboard(session_toy_db, leaderboard_type, "iris_test")
    else:
        assert (
            get_leaderboard(session_toy_db, leaderboard_type, "iris_test")
            is expected_html
        )


def test_get_leaderboard_non_bagged_scores(session_toy_db, monkeypatch):
    """Check that we can use non-bagged scores in the leaderboard
    by setting leaderboard_score_kind = "mean" in problem.py
    """
    from ramp_database.tools.leaderboard import Event

    # mock setting `leaderboard_score_kind = "mean"` in problem.py
    monkeypatch.setattr(Event, "leaderboard_score_kind", property(lambda self: "mean"))

    # run the dispatcher to process a submission
    config = read_config(database_config_template())
    event_config = read_config(ramp_config_template())
    dispatcher = Dispatcher(config, event_config, n_workers=-1, hunger_policy="exit")
    dispatcher.launch()
    session_toy_db.commit()

    # With score_kind == "mean" the private leaderboard is expected to list
    # mean/std columns before the bagged score.
    res = get_leaderboard(session_toy_db, "private", "iris_test")
    assert (
        (
            """<th>submission</th>
      <th>mean public acc</th>
      <th>std public acc</th>
      <th>bag public acc</th>"""
        )
        in res
    )

    # Public and competition leaderboards show a single unprefixed score
    # column regardless of the score kind.
    res = get_leaderboard(session_toy_db, "public", "iris_test")
    assert (
        (
            """<th>submission</th>
      <th>acc</th>
      <th>error</th>"""
        )
        in res
    )
    res = get_leaderboard(session_toy_db, "public competition", "iris_test")
    assert (
        (
            """<th>rank</th>
      <th>team</th>
      <th>submission</th>
      <th>acc</th>"""
        )
        in res
    )

    res = get_leaderboard(session_toy_db, "private competition", "iris_test")
    assert (
        (
            """<th>team</th>
      <th>submission</th>
      <th>acc</th>"""
        )
        in res
    )


@pytest.mark.parametrize(
"event_name, expected_size",
[("iris_test", 4), ("iris_aws_test", 0), ("boston_housing_test", 0)],
Expand Down

0 comments on commit 97fbb52

Please sign in to comment.