From c2884e2e0b7f1ae7612019d69d887c8fe6e174da Mon Sep 17 00:00:00 2001 From: Markus Schepke Date: Thu, 12 Dec 2024 22:45:56 +0200 Subject: [PATCH] Added baseline implementations of BaseGamesRecommender --- src/board_game_recommender/baseline.py | 329 +++++++++++++++++++++++++ 1 file changed, 329 insertions(+) create mode 100644 src/board_game_recommender/baseline.py diff --git a/src/board_game_recommender/baseline.py b/src/board_game_recommender/baseline.py new file mode 100644 index 0000000..d3481f3 --- /dev/null +++ b/src/board_game_recommender/baseline.py @@ -0,0 +1,329 @@ +"""Baseline recommender models.""" + +from __future__ import annotations + +from collections import defaultdict +from typing import TYPE_CHECKING, cast + +import numpy as np +import polars as pl + +from board_game_recommender.abc import BaseGamesRecommender + +if TYPE_CHECKING: + from collections.abc import Iterable + from pathlib import Path + from typing import Any, Self + + +def dataframe_from_scores( + *, + users: Iterable[str], + games: Iterable[Any], + scores: np.ndarray, # shape: (users, games) +) -> pl.DataFrame: + """Creates a Pandas DataFrame out of raw recommendation scores.""" + + users = tuple(users) + games = tuple(games) + + rank_cols = ( + pl.col(f"{column}_score") + .rank(method="min", descending=True) + .alias(f"{column}_rank") + for column in users + ) + + result = ( + pl.DataFrame( + data=scores.T, + schema=[f"{column}_score" for column in users], + ) + .lazy() + .with_columns( + pl.Series("index", games), + *rank_cols, + ) + ) + + if len(users) == 1: + result = result.sort(f"{users[0]}_rank") + + return result.collect() + + +class RandomGamesRecommender(BaseGamesRecommender): + """Random recommender.""" + + def __init__(self) -> None: + self.rng = np.random.default_rng() + + @property + def known_games(self) -> frozenset[int]: + return frozenset() + + @property + def rated_games(self) -> frozenset[int]: + return frozenset() + + @property + def known_users(self) -> frozenset[str]: + return frozenset() + + def _recommendation_scores(self, users: int, games: int) -> np.ndarray: + """Random scores.""" + return self.rng.random((users, games)) + + def recommend( # type: ignore[override] + self, + users: Iterable[str], + games: Iterable[int], + **kwargs, # noqa: ARG002 + ) -> pl.DataFrame: + """Random recommendations for certain users.""" + + users = list(users) + games = list(games) + scores = self._recommendation_scores(users=len(users), games=len(games)) + + return dataframe_from_scores(users=users, games=games, scores=scores) + + def recommend_as_numpy( + self, + users: Iterable[str], + games: Iterable[int], + ) -> np.ndarray: + """Random recommendations for certain users and games as a numpy array.""" + users = list(users) + games = list(games) + return self._recommendation_scores(users=len(users), games=len(games)) + + def recommend_group( # type: ignore[override] + self, + users: Iterable[str], # noqa: ARG002 + games: Iterable[int], + **kwargs, # noqa: ARG002 + ) -> pl.DataFrame: + """Random recommendations for a group of users.""" + + games = list(games) + scores = self._recommendation_scores(users=1, games=len(games)) + + return dataframe_from_scores(users=["_all"], games=games, scores=scores) + + def recommend_group_as_numpy( + self, + users: Iterable[str], # noqa: ARG002 + games: Iterable[int], + ) -> np.ndarray: + """Random recommendations for a group of users and games as a numpy array.""" + games = list(games) + return self._recommendation_scores(users=1, games=len(games)) + + def recommend_similar(self, games: Iterable[int], **kwargs) -> pl.DataFrame: + raise NotImplementedError + + def similar_games(self, games: Iterable[int], **kwargs) -> pl.DataFrame: + raise NotImplementedError + + +class PopularGamesRecommender(BaseGamesRecommender): + """Popular games recommender.""" + + id_field: str = "bgg_id" + user_id_field: str = "bgg_user_name" + rating_id_field: str = "bgg_user_rating" + + scores: dict[int, float] + raw_scores: np.ndarray + default_value: float + game_ids: tuple[int, ...] + + _known_games: frozenset[int] | None = None + + def __init__( + self, + game_ids: Iterable[int], + scores: np.ndarray, + default_value: float | None = None, + ) -> None: + self.default_value = ( + default_value if default_value is not None else scores.mean() + ) + self.raw_scores = scores + self.game_ids = tuple(game_ids) + self.scores = defaultdict( + self.default_factory, + zip(self.game_ids, self.raw_scores), + ) + + @classmethod + def train(cls, ratings: pl.DataFrame) -> Self: + """Train the recommender from ratings data.""" + raise NotImplementedError + + @classmethod + def train_from_csv(cls, ratings_file: Path | str) -> Self: + """Train the recommender from a ratings file in CSV format.""" + ratings = pl.read_csv(ratings_file) + return cls.train( + ratings.select(cls.id_field, cls.user_id_field, cls.rating_id_field), + ) + + @classmethod + def train_from_json_lines(cls, ratings_file: Path | str) -> Self: + """Train the recommender from a ratings file in JSON lines format.""" + ratings = pl.read_ndjson(ratings_file) + return cls.train( + ratings.select(cls.id_field, cls.user_id_field, cls.rating_id_field), + ) + + @property + def known_games(self) -> frozenset[int]: + if self._known_games is not None: + return self._known_games + self._known_games = frozenset(self.game_ids) + return self._known_games + + @property + def rated_games(self) -> frozenset[int]: + return self.known_games + + @property + def num_games(self) -> int: + return len(self.game_ids) + + @property + def known_users(self) -> frozenset[str]: + return frozenset() + + def default_factory(self) -> float: + """Default value for unknown games.""" + return self.default_value + + def _recommendation_scores( + self, + users: int, + games: list[int] | None = None, + ) -> np.ndarray: + """Popularity scores.""" + scores = ( + np.array([self.scores[game_id] for game_id in games]) + if games + else self.raw_scores + ) + return np.tile(scores, [users, 1]) + + def recommend( + self, + users: Iterable[str], + **kwargs, # noqa: ARG002 + ) -> pl.DataFrame: + """Popular recommendations for certain users.""" + users = list(users) + scores = self._recommendation_scores(users=len(users)) + return dataframe_from_scores(users=users, games=self.game_ids, scores=scores) + + def recommend_as_numpy( + self, + users: Iterable[str], + games: Iterable[int], + ) -> np.ndarray: + """Popular recommendations for certain users and games as a numpy array.""" + users = list(users) + games = list(games) + return self._recommendation_scores(users=len(users), games=games) + + def recommend_group( + self, + users: Iterable[str], # noqa: ARG002 + **kwargs, # noqa: ARG002 + ) -> pl.DataFrame: + """Popular recommendations for a group of users.""" + scores = self._recommendation_scores(users=1) + return dataframe_from_scores(users=["_all"], games=self.game_ids, scores=scores) + + def recommend_group_as_numpy( + self, + users: Iterable[str], # noqa: ARG002 + games: Iterable[int], + ) -> np.ndarray: + """Popular recommendations for a group of users and games as a numpy array.""" + games = list(games) + return self._recommendation_scores(users=1, games=games) + + def recommend_similar( + self, + games: Iterable[int], + **kwargs, + ) -> pl.DataFrame: + raise NotImplementedError + + def similar_games( + self, + games: Iterable[int], + **kwargs, + ) -> pl.DataFrame: + raise NotImplementedError + + +class PopularMeanGamesRecommender(PopularGamesRecommender): + """Recommend games by their mean rating score.""" + + @classmethod + def train(cls, ratings: pl.DataFrame) -> Self: + data = ratings.group_by(cls.id_field).agg( + mean=pl.col(cls.rating_id_field).mean(), + ) + return cls( + game_ids=data[cls.id_field], + scores=data["mean"].to_numpy(), + default_value=cast(float, ratings[cls.rating_id_field].mean()), + ) + + +class PopularBayesianGamesRecommender(PopularGamesRecommender): + """Recommend games by their Bayesian average rating score.""" + + ratings_per_dummy: float = 10_000 + dummy_rating: float | None = 5.5 + + @classmethod + def train(cls, ratings: pl.DataFrame) -> Self: + num_dummies = len(ratings) / cls.ratings_per_dummy + dummy_rating = cast( + float, + ratings[cls.rating_id_field].mean() + if cls.dummy_rating is None + else cls.dummy_rating, + ) + + stats = ( + ratings.group_by(cls.id_field) + .agg( + mean=pl.col(cls.rating_id_field).mean(), + size=pl.len(), + ) + .with_columns( + bayes=pl.col("mean") * pl.col("size") + + dummy_rating * num_dummies / (pl.col("size") + num_dummies), + ) + ) + + return cls( + game_ids=stats[cls.id_field], + scores=stats["bayes"].to_numpy(), + default_value=dummy_rating, + ) + + +class PopularNumRatingsGamesRecommender(PopularGamesRecommender): + """Recommend games by their number of ratings.""" + + @classmethod + def train(cls, ratings: pl.DataFrame) -> Self: + data = ratings.group_by(cls.id_field).agg(count=pl.len()) + return cls( + game_ids=data[cls.id_field], + scores=data["count"].to_numpy(), + )