diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 6f43c87..477b0b8 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -42,5 +42,5 @@ jobs:
run: treon missing_data/BorrowingCapacity.ipynb missing_data/MissingBorrowingActivity.ipynb missing_data/MissingBooks.ipynb missing_data/MissingMembers.ipynb missing_data/MissingMembershipActivities.ipynb missing_data/BookcatalogBooksEstimate.ipynb
- name: Run treon to check selected speculative_reading notebooks
- run: treon speculative_reading/HemingwayBorrowing.ipynb speculative_reading/PartialBorrowers.ipynb
+ run: treon speculative_reading/HemingwayBorrowing.ipynb speculative_reading/PartialBorrowers.ipynb speculative_reading/CombineRecommendations.ipynb speculative_reading/LenskitRecommendations.ipynb
diff --git a/requirements.lock b/requirements.lock
index 763538a..bbcb163 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -57,6 +57,7 @@ ipywidgets==8.1.2
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.2
+joblib==1.4.2
json5==0.9.24
jsonpointer==2.4
jsonschema==4.21.1
@@ -101,6 +102,7 @@ parso==0.8.4
pexpect==4.9.0
pillow==10.3.0
platformdirs==4.2.0
+plotly==5.22.0
pluggy==1.5.0
portpicker==1.5.2
powerlaw==1.5
@@ -132,6 +134,7 @@ requests==2.31.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.18.0
+scikit-learn==1.4.2
scipy==1.13.0
seaborn==0.11.0
seedbank==0.1.3
@@ -145,6 +148,7 @@ stack-data==0.6.3
stanio==0.5.0
tenacity==8.2.3
terminado==0.18.1
+threadpoolctl==3.5.0
tinycss2==1.2.1
tomli==2.0.1
toolz==0.12.0
diff --git a/requirements.txt b/requirements.txt
index 7feb142..3e373ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,6 @@ seaborn
great-tables
# we used an unreleased version of copia with label options for plots
git+https://github.com/mikekestemont/copia@3e57da4
-matplotlib==3.7
\ No newline at end of file
+matplotlib==3.7
+scikit-learn
+lenskit
\ No newline at end of file
diff --git a/speculative_reading/LenskitRecommendations.ipynb b/speculative_reading/LenskitRecommendations.ipynb
index 508b314..e7ce264 100644
--- a/speculative_reading/LenskitRecommendations.ipynb
+++ b/speculative_reading/LenskitRecommendations.ipynb
@@ -1,1495 +1,1452 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "I1x1AE35cOtj"
- },
- "source": [
- "# Generate and Evaluate Lenskit Model Stability and Select Scores for Predictions"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "EjZT_ntacTdr"
- },
- "source": [
- "## Load libraries and data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "id": "kAuugX9YGLpu"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Lenskit is already installed.\n"
- ]
- }
- ],
- "source": [
- "# This code checks if the lenskit library is installed and installs it if it is not.\n",
- "try:\n",
- " import lenskit\n",
- " print(\"Lenskit is already installed.\")\n",
- "except ImportError:\n",
- " print(\"Lenskit is not installed. Installing...\")\n",
- " import subprocess\n",
- " subprocess.check_call([\"pip\", \"install\", \"lenskit\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {
- "id": "Xo656ZZAGfei"
- },
- "outputs": [],
- "source": [
- "# Standard library imports\n",
- "import os\n",
- "import sys\n",
- "from typing import List\n",
- "\n",
- "# Third party imports\n",
- "import altair as alt # For data visualization\n",
- "import numpy as np # For numerical operations\n",
- "import pandas as pd # For data manipulation\n",
- "from scipy.stats import zscore # For statistical computations\n",
- "from sklearn.preprocessing import MinMaxScaler # For data preprocessing\n",
- "from tqdm.notebook import tqdm # For progress bars\n",
- "\n",
- "# LensKit imports\n",
- "from lenskit import Recommender, topn, util, batch, crossfold as xf # For recommendation systems\n",
- "from lenskit.algorithms import als, basic # For recommendation algorithms\n",
- "\n",
- "# Local application/library specific imports\n",
- "sys.path.append(\"..\")\n",
- "from utils.missing_data_processing import * # For handling missing data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "b5W2qUTyGSkv",
- "outputId": "6d8bf784-b7a5-4df2-a71a-db8da792081b"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " user | \n",
- " item | \n",
- " rating | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 2 | \n",
- " rhys | \n",
- " conrad-typhoon | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " lanux-eyre-de | \n",
- " woolf-night-day | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 12 | \n",
- " tery | \n",
- " james-joyce | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 13 | \n",
- " tery | \n",
- " freeman-portrait-george-moore | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 22 | \n",
- " macleish-ada | \n",
- " stern-tents-israel | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 38 | \n",
- " alvear | \n",
- " yeats-later-poems | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 46 | \n",
- " joyce-james | \n",
- " mantzius-history-theatrical-art | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 51 | \n",
- " joyce-james | \n",
- " scott-poems-walter-scott | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 52 | \n",
- " joyce-james | \n",
- " chekhov-horse-stealers-stories | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " 53 | \n",
- " joyce-james | \n",
- " stephens-crock-gold | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " user item rating\n",
- "2 rhys conrad-typhoon 1\n",
- "8 lanux-eyre-de woolf-night-day 1\n",
- "12 tery james-joyce 1\n",
- "13 tery freeman-portrait-george-moore 1\n",
- "22 macleish-ada stern-tents-israel 1\n",
- "38 alvear yeats-later-poems 1\n",
- "46 joyce-james mantzius-history-theatrical-art 1\n",
- "51 joyce-james scott-poems-walter-scott 1\n",
- "52 joyce-james chekhov-horse-stealers-stories 1\n",
- "53 joyce-james stephens-crock-gold 1"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Load the initial data into four DataFrames: events_df, members_df, books_df, and borrow_overrides_df.\n",
- "events_df, members_df, books_df, borrow_overrides_df = load_initial_data()\n",
- "\n",
- "# Process the events data to clean it and prepare it for analysis.\n",
- "events_df = preprocess_events_data(events_df)\n",
- "\n",
- "# Extract the item ID from the URI in the books DataFrame.\n",
- "# The item ID is the second to last part of the URI.\n",
- "books_df[\"item_id\"] = books_df.uri.apply(\n",
- " lambda x: x.split(\"/\")[-2] if pd.notna(x) else None\n",
- ")\n",
- "\n",
- "# Generate short IDs for the members in the members DataFrame.\n",
- "# The ID is the second to last part of the URI.\n",
- "members_df[\"id\"] = members_df.uri.apply(\n",
- " lambda x: x.split(\"/\")[-2]\n",
- ")\n",
- "\n",
- "# Get all member-book interactions from the events DataFrame.\n",
- "# Only include rows where the item URI is not null.\n",
- "interactions_df = events_df[events_df.item_uri.notna()].copy()\n",
- "\n",
- "# Restrict the interactions to borrow events only.\n",
- "interactions_df = interactions_df[interactions_df.event_type == 'Borrow'].copy()\n",
- "\n",
- "# Reduce the interactions DataFrame to the minimum user/item interaction fields and drop duplicate rows.\n",
- "unique_interactions_df = interactions_df[\n",
- " [\"member_id\", \"item_id\"]\n",
- "].drop_duplicates(subset=[\"member_id\", \"item_id\"])\n",
- "\n",
- "# Rename the columns to the names expected by LensKit.\n",
- "# The DataFrame is renamed to 'ratings' for use with the tutorial.\n",
- "ratings = unique_interactions_df.rename(columns={'member_id': 'user', 'item_id': 'item'})\n",
- "\n",
- "# The example assumes a rating. Use a 1/0 rating and set all to 1 to confirm interaction.\n",
- "ratings['rating'] = 1\n",
- "\n",
- "# Display the first 10 rows of the ratings DataFrame.\n",
- "ratings.head(10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " member_id | \n",
- " subscription_start | \n",
- " subscription_end | \n",
- " subscription_events | \n",
- " subscription_volumes | \n",
- " subscription_days | \n",
- " internal_gaps | \n",
- " known_borrows | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 23 | \n",
- " raphael-france | \n",
- " 1920-04-30 | \n",
- " 1921-11-17 | \n",
- " Subscription;Renewal;Renewal;Renewal | \n",
- " 1.0 | \n",
- " 566 | \n",
- " 0;0;0 | \n",
- " 1008 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " member_id subscription_start subscription_end \\\n",
- "23 raphael-france 1920-04-30 1921-11-17 \n",
- "\n",
- " subscription_events subscription_volumes \\\n",
- "23 Subscription;Renewal;Renewal;Renewal 1.0 \n",
- "\n",
- " subscription_days internal_gaps known_borrows \n",
- "23 566 0;0;0 1008 "
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# load previously computed partial borrowers list (sequential / near sequential subscriptions collapsed)\n",
- "partial_borrowers = pd.read_csv('../appendix/speculative_reading/data/partial_borrowers_collapsed.csv')\n",
- "partial_borrowers.sort_values('known_borrows', ascending=False, inplace=True)\n",
- "# parse subscription dates so we can use them to identify circulating books\n",
- "partial_borrowers['subscription_start'] = pd.to_datetime(partial_borrowers['subscription_start'])\n",
- "partial_borrowers['subscription_end'] = pd.to_datetime(partial_borrowers['subscription_end'])\n",
- "partial_borrowers.head(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "# generate subset of events dataset with dates, for use in identifying books \n",
- "# in circulation during and before these subscriptions\n",
- "\n",
- "dated_events_df = events_df.copy()\n",
- "dated_events_df['start_date_dt'] = pd.to_datetime(dated_events_df['start_date'], errors='coerce')\n",
- "dated_events_df['end_date_dt'] = pd.to_datetime(dated_events_df['end_date'], errors='coerce')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "2IwUVNfddJ4_"
- },
- "source": [
- "## Fit initial model"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "vYH5o_UqdV2H"
- },
- "source": [
- "### Run Model Comparisons"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "\n",
- "# Define constants\n",
- "N_RECOMMENDATIONS = 20\n",
- "\n",
- "def get_item_ids(user_id: str, bookless_sub: pd.Series, dated_events_df: pd.DataFrame, events_df: pd.DataFrame) -> List[str]:\n",
- "\t\"\"\"\n",
- "\tGet the item IDs for books that were in circulation during the subscription period\n",
- "\n",
- "\tParameters\n",
- "\t----------\n",
- "\tuser_id : str\n",
- "\t\tThe user ID for the member\n",
- "\tbookless_sub : pd.Series\n",
- "\t\tA Series with the subscription start and end dates\n",
- "\tdated_events_df : pd.DataFrame\n",
- "\t\tA DataFrame with the events data and dates\n",
- "\tevents_df : pd.DataFrame\n",
- "\t\tA DataFrame with the events data\n",
- "\t\n",
- "\tReturns\n",
- "\t-------\n",
- "\tList[str]\n",
- "\t\tA list of item IDs for books that were in circulation during the subscription period\n",
- "\t\n",
- "\t\"\"\"\n",
- "\tcirculating_book_events = dated_events_df[(dated_events_df.start_date_dt < bookless_sub.subscription_end) | (dated_events_df.end_date_dt < bookless_sub.subscription_end)]\n",
- "\titem_ids = circulating_book_events[circulating_book_events.item_id.notna()].item_id.unique()\n",
- "\tmember_book_ids = events_df[(events_df.item_id.notna()) & (events_df.member_id.str.contains(user_id))].item_id.unique()\n",
- "\tsubset_item_ids = list(set(item_ids) - set(member_book_ids))\n",
- "\treturn subset_item_ids\n",
- "\n",
- "def get_predictions(user_id: str, bookless_sub: pd.Series, rec: Recommender, subset_item_ids: List[str]) -> pd.DataFrame:\n",
- "\t\"\"\"\n",
- "\tGet the recommendations for a user\n",
- "\n",
- "\tParameters\n",
- "\t----------\n",
- "\tuser_id : str\n",
- "\t\tThe user ID for the member\n",
- "\tbookless_sub : pd.Series\n",
- "\t\tA Series with the subscription start and end dates\n",
- "\trec : Recommender\n",
- "\t\tThe recommender model\n",
- "\tsubset_item_ids : List[str]\n",
- "\t\tA list of item IDs for books that were in circulation during the subscription period\n",
- "\t\n",
- "\tReturns\n",
- "\t-------\n",
- "\tpd.DataFrame\n",
- "\t\tA DataFrame with the recommendations for the user\n",
- "\n",
- "\t\"\"\"\n",
- "\tpredictions = rec.recommend(user_id, candidates=subset_item_ids)\n",
- "\tpredictions['member_id'] = user_id\n",
- "\tpredictions['subscription_start'] = bookless_sub.subscription_start\n",
- "\tpredictions['subscription_end'] = bookless_sub.subscription_end\n",
- "\tpredictions.rename(columns={'item': 'item_id'}, inplace=True)\n",
- "\treturn predictions\n",
- "\n",
- "def run_model_comparisons(number_of_runs: List[int], return_scores: bool, output_path: str, members: List[str]) -> pd.DataFrame:\n",
- "\t\"\"\"\n",
- "\tRun model comparisons for a list of run lengths\n",
- "\n",
- "\tParameters\n",
- "\t----------\n",
- "\tnumber_of_runs : List[int]\n",
- "\t\tA list of run lengths\n",
- "\treturn_scores : bool\n",
- "\t\tA boolean indicating whether to return scores\n",
- "\toutput_path : str\n",
- "\t\tThe path to save the output\n",
- "\tmembers : List[str]\n",
- "\t\tA list of member IDs\n",
- "\t\n",
- "\tReturns\n",
- "\t-------\n",
- "\tpd.DataFrame\n",
- "\t\tA DataFrame with the model comparisons\n",
- "\t\"\"\"\n",
- "\tif os.path.exists(output_path):\n",
- "\t\tcompare_models = pd.read_csv(output_path)\n",
- "\telse: \n",
- "\t\tmodel_runs=[]\n",
- "\t\tfor run_length in number_of_runs:\n",
- "\t\t\tall_recs = []\n",
- "\t\t\tfor index in tqdm(range(run_length)):\n",
- "\t\t\t\trec = Recommender.adapt(als.ImplicitMF(50, use_ratings=False))\n",
- "\t\t\t\trec.fit(ratings)\n",
- "\t\t\t\tpopular = Recommender.adapt(basic.Popular())\n",
- "\t\t\t\tpopular.fit(ratings)\n",
- "\t\t\t\tfor bookless_sub in list(partial_borrowers.itertuples()):\n",
- "\t\t\t\t\tuser_id = bookless_sub.member_id\n",
- "\t\t\t\t\tif user_id in members:\n",
- "\t\t\t\t\t\tsubset_item_ids = get_item_ids(user_id, bookless_sub, dated_events_df, events_df)\n",
- "\t\t\t\t\t\tpredictions = get_predictions(user_id, bookless_sub, rec, subset_item_ids)\n",
- "\t\t\t\t\t\tpredictions['model_run'] = index\n",
- "\t\t\t\t\t\tall_recs.append(predictions)\n",
- "\t\t\tall_recs_df = pd.concat(all_recs)\n",
- "\t\t\tmetrics_df = all_recs_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score': [np.median, 'skew', 'std', 'var']}).reset_index()\n",
- "\t\t\tmetrics_df.columns = list(map(''.join, metrics_df.columns.values))\n",
- "\t\t\tmetrics_df.columns = [col if 'score' not in col else col.split('score')[1] for col in metrics_df.columns ]\n",
- "\t\t\tkurt_df = all_recs_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].apply(pd.DataFrame.kurt).reset_index(name='kurtosis')\n",
- "\t\t\tfinal_df = pd.merge(metrics_df, kurt_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])\n",
- "\t\t\tfinal_df['model_loops'] = run_length\n",
- "\t\t\tif return_scores:\n",
- "\t\t\t\tfinal_df = pd.merge(final_df, all_recs_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'], how='left')\n",
- "\t\t\tmodel_runs.append(final_df)\n",
- "\t\t\tcompare_models = pd.concat(model_runs)\n",
- "\t\t\tcompare_models.to_csv(output_path, index=False)\n",
- "\treturn compare_models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "id": "Cpeze52F_UpV"
- },
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "6fd97b547b6d461d881f3f9053142320",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/10 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Numba is using threading layer workqueue - consider TBB\n",
- "found 1 potential runtime problems - see https://boi.st/lkpy-perf\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "799b35639cb744e8b4f76ff1d7f0f788",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/20 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "513568fde99a4911bd6cbd8459f40dbd",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/50 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "1dee895f725842b9be0722d174087b72",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/100 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "54a2466bfd474b5ebf02b09e240b64c4",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/200 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# specify run size in number of runs\n",
- "number_of_runs = [10,20,50,100, 200]\n",
- "# specify members\n",
- "members = ['kittredge-eleanor-hayden', 'colens-fernand', 'raphael-france', 'hemingway-ernest']\n",
- "compare_models = run_model_comparisons(number_of_runs, False, './data/lenskit_comparison_model_runs.csv', members)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "COMVpXwHdZPp"
- },
- "source": [
- "### Visualize stability of model scores"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "id": "5ozr3kDtaM6U"
- },
- "outputs": [],
- "source": [
- "compare_models['member_period'] = compare_models.member_id + ': ' + compare_models.subscription_start.astype(str) + '/' + compare_models.subscription_end.astype(str)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {
- "id": "dVc1uhYTaSek"
- },
- "outputs": [],
- "source": [
- "def sample_scores(df: pd.DataFrame, get_top: bool, numb_of_books: int) -> pd.DataFrame:\n",
- " \"\"\"\n",
- " This function samples scores from a DataFrame for a given number of books and periods.\n",
- " \n",
- " Parameters:\n",
- " df (pd.DataFrame): The DataFrame containing the scores.\n",
- " get_top (bool): If True, the function will return the top scores. If False, it will return random scores.\n",
- " numb_of_books (int): The number of books to sample scores for.\n",
- " \n",
- " Returns:\n",
- " pd.DataFrame: A DataFrame containing the sampled scores.\n",
- " \"\"\"\n",
- " \n",
- " # Get the unique periods from the DataFrame.\n",
- " periods = df.member_period.unique().tolist()\n",
- " \n",
- " # Initialize an empty list to store the DataFrames for each period.\n",
- " visualize_df = []\n",
- " \n",
- " # For each period...\n",
- " for period in periods:\n",
- " # Initialize an empty list to store the books for this period.\n",
- " final_books = []\n",
- " \n",
- " # Get the rows from the DataFrame for this period.\n",
- " rows = df[df.member_period == period]\n",
- " \n",
- " # Get the unique loop numbers from the rows.\n",
- " loops = rows.model_loops.unique().tolist()\n",
- " \n",
- " # While the number of books is less than the specified number...\n",
- " while len(final_books) < numb_of_books:\n",
- " # For each loop...\n",
- " for loop in loops:\n",
- " # Get the rows for this loop.\n",
- " final_rows = rows[rows.model_loops == loop]\n",
- " \n",
- " # If get_top is True, sort the rows by median score in descending order and get the top books.\n",
- " # Otherwise, get a random sample of books.\n",
- " if get_top:\n",
- " final_rows = final_rows.sort_values(by='median', ascending=False)\n",
- " books = final_rows[0:numb_of_books].item_id.unique().tolist()\n",
- " else:\n",
- " books = rows.item_id.sample(n=numb_of_books).reset_index()\n",
- " books = books.item_id.unique().tolist()\n",
- " \n",
- " # If the number of books is less than the specified number, add more books until the number is reached.\n",
- " increment = numb_of_books\n",
- " while len(books) < numb_of_books:\n",
- " increment = increment + 1\n",
- " books = final_rows[0:increment].item_id.unique().tolist()\n",
- " \n",
- " # Add the books to the list of books for this period.\n",
- " final_books.extend(books)\n",
- " \n",
- " # Remove duplicate books from the list.\n",
- " final_books = list(set(final_books))\n",
- " \n",
- " # Add the rows for the books in the list to the list of DataFrames.\n",
- " visualize_df.append(rows[rows.item_id.isin(set(final_books))])\n",
- " \n",
- " # Concatenate the DataFrames in the list into a single DataFrame.\n",
- " final_df = pd.concat(visualize_df)\n",
- " \n",
- " return final_df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {
- "id": "a5hqOlTNbHgR"
- },
- "outputs": [],
- "source": [
- "def visualize_model_stability(df: pd.DataFrame, get_top: bool, numb_of_books: int) -> alt.Chart:\n",
- " \"\"\"\n",
- " This function visualizes the stability of a model by creating box plots and scatter plots of various score distribution metrics.\n",
- " \n",
- " Parameters:\n",
- " df (pd.DataFrame): The DataFrame containing the scores.\n",
- " get_top (bool): If True, the function will return the top scores. If False, it will return random scores.\n",
- " numb_of_books (int): The number of books to sample scores for.\n",
- " \n",
- " Returns:\n",
- " alt.Chart: A concatenated Altair chart containing the box plots and scatter plots.\n",
- " \"\"\"\n",
- " \n",
- " # Sample scores from the DataFrame.\n",
- " sample_df = sample_scores(df, get_top, numb_of_books)\n",
- " \n",
- " # Define the distribution metrics to be used.\n",
- " distribution_metrics = ['median', 'skew', 'std', 'var', 'kurtosis']\n",
- " \n",
- " # Normalize the distribution metrics in the sample DataFrame using MinMaxScaler.\n",
- " sample_df[distribution_metrics] = MinMaxScaler().fit_transform(sample_df[distribution_metrics])\n",
- " \n",
- " # Melt the sample DataFrame to a long format for visualization.\n",
- " melted_sample = pd.melt(sample_df, id_vars=['member_id', 'subscription_start', 'subscription_end', 'item_id', 'model_loops', \n",
- " 'member_period'], value_vars=['median', 'skew', 'std', 'var', 'kurtosis'])\n",
- "\n",
- " # Create a box plot of the distribution metrics.\n",
- " boxplot = alt.Chart(melted_sample).mark_boxplot().encode(\n",
- " x= alt.X('model_loops:O', axis=alt.Axis(title='')),\n",
- " y=alt.Y('value', axis=alt.Axis(title='')),\n",
- " column=alt.Column('variable', title=''),\n",
- " ).properties(title = \"Variability with Box and Whiskers\")\n",
- "\n",
- " # Create a scatter plot of the distribution metrics.\n",
- " points = alt.Chart(melted_sample).mark_circle().encode(\n",
- " x= alt.X('model_loops:O', axis=alt.Axis(title='')),\n",
- " y=alt.Y('value', axis=alt.Axis(title='')),\n",
- " color=alt.Color('variable', legend=alt.Legend(title=['Measure of', 'Score Variability'])), \n",
- " column=alt.Column('variable', title='')\n",
- " ).properties(title = \"Variability with Score Distributions\")\n",
- "\n",
- " # Concatenate the box plot and scatter plot horizontally and return the result.\n",
- " return alt.hconcat(boxplot, points).properties(title='Variability in Predicted Scores By Resampling Implicit Matrix Factorization Model ')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 428
- },
- "id": "HyS03DjrbWKs",
- "outputId": "ce98fa42-3e18-4f1b-8a69-a36cb9d9fc3f"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.HConcatChart(...)"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chart = visualize_model_stability(compare_models, True, 10)\n",
- "chart.configure_axisX(\n",
- " labelAngle=0\n",
- ").configure_title(anchor='middle')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "DwEwSsFdjD7r"
- },
- "source": [
- "### Select Optimal Model and Generate Item Scores"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {
- "id": "_Z3-aAZHdr4p"
- },
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "db7232cf2a7b4c2294eca12b03b7c5aa",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- " 0%| | 0/100 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "final_run = [100]\n",
- "members = ['hemingway-ernest']\n",
- "final_model = run_model_comparisons(final_run, True, f'./data/lenskit_model{str(final_run[0])}_scores.csv', members)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "id": "BOacDcQIjNb9"
- },
- "outputs": [],
- "source": [
- "member_subscriptions = final_model[['member_id', 'subscription_start', 'subscription_end']].drop_duplicates()"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "I1x1AE35cOtj"
+ },
+ "source": [
+ "# Generate and Evaluate Lenskit Model Stability and Select Scores for Predictions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "EjZT_ntacTdr"
+ },
+ "source": [
+ "## Load libraries and data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "Xo656ZZAGfei"
+ },
+ "outputs": [],
+ "source": [
+ "# Standard library imports\n",
+ "import os\n",
+ "import sys\n",
+ "from typing import List\n",
+ "\n",
+ "# Third party imports\n",
+ "import altair as alt # For data visualization\n",
+ "import numpy as np # For numerical operations\n",
+ "import pandas as pd # For data manipulation\n",
+ "from scipy.stats import zscore # For statistical computations\n",
+ "from sklearn.preprocessing import MinMaxScaler # For data preprocessing\n",
+ "from tqdm.notebook import tqdm # For progress bars\n",
+ "\n",
+ "# LensKit imports\n",
+ "from lenskit import Recommender, topn, util, batch, crossfold as xf # For recommendation systems\n",
+ "from lenskit.algorithms import als, basic # For recommendation algorithms\n",
+ "\n",
+ "# Local application/library specific imports\n",
+ "sys.path.append(\"..\")\n",
+ "from utils.missing_data_processing import get_preprocessed_data, DATA_DIR # For loading datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "b5W2qUTyGSkv",
+ "outputId": "6d8bf784-b7a5-4df2-a71a-db8da792081b"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "XVVmXsmvrNus",
- "outputId": "e0d5e8e3-d06e-4937-dc56-bd4ebcfec1a0"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "member_id subscription_start subscription_end\n",
- "hemingway-ernest 1924-03-28 1925-03-28 128500\n",
- " 1921-12-28 1922-11-08 65400\n",
- "dtype: int64"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " user | \n",
+ " item | \n",
+ " rating | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " rhys | \n",
+ " conrad-typhoon | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " lanux-eyre-de | \n",
+ " woolf-night-day | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " tery | \n",
+ " james-joyce | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " tery | \n",
+ " freeman-portrait-george-moore | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " macleish-ada | \n",
+ " stern-tents-israel | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " alvear | \n",
+ " yeats-later-poems | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " joyce-james | \n",
+ " mantzius-history-theatrical-art | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 51 | \n",
+ " joyce-james | \n",
+ " scott-poems-walter-scott | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 52 | \n",
+ " joyce-james | \n",
+ " chekhov-horse-stealers-stories | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 53 | \n",
+ " joyce-james | \n",
+ " stephens-crock-gold | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "final_model[['member_id', 'subscription_start', 'subscription_end']].value_counts()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {
- "id": "Uluk5V6yF9xR"
- },
- "outputs": [],
- "source": [
- "final_model['member_period'] = final_model.member_id + ': ' + final_model.subscription_start.astype(str) + '/' + final_model.subscription_end.astype(str)"
+ "text/plain": [
+ " user item rating\n",
+ "2 rhys conrad-typhoon 1\n",
+ "8 lanux-eyre-de woolf-night-day 1\n",
+ "12 tery james-joyce 1\n",
+ "13 tery freeman-portrait-george-moore 1\n",
+ "22 macleish-ada stern-tents-israel 1\n",
+ "38 alvear yeats-later-poems 1\n",
+ "46 joyce-james mantzius-history-theatrical-art 1\n",
+ "51 joyce-james scott-poems-walter-scott 1\n",
+ "52 joyce-james chekhov-horse-stealers-stories 1\n",
+ "53 joyce-james stephens-crock-gold 1"
]
- },
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the preprocessed datasets and pull out the events, members, and books DataFrames.\n",
+ "data = get_preprocessed_data()\n",
+ "events_df = data[\"events\"]\n",
+ "members_df = data[\"members\"]\n",
+ "books_df = data[\"books\"]\n",
+ "# borrow_overrides unused in this notebook\n",
+ "\n",
+ "# Get all member-book interactions from the events DataFrame.\n",
+ "# Only include rows where the item URI is not null.\n",
+ "interactions_df = events_df[events_df.item_uri.notna()].copy()\n",
+ "\n",
+ "# Restrict the interactions to borrow events only.\n",
+ "interactions_df = interactions_df[interactions_df.event_type == 'Borrow'].copy()\n",
+ "\n",
+ "# Reduce the interactions DataFrame to the minimum user/item interaction fields and drop duplicate rows.\n",
+ "unique_interactions_df = interactions_df[\n",
+ " [\"member_id\", \"item_id\"]\n",
+ "].drop_duplicates(subset=[\"member_id\", \"item_id\"])\n",
+ "\n",
+ "# Rename the columns to the names expected by LensKit.\n",
+ "# The DataFrame is renamed to 'ratings' for use with the tutorial.\n",
+ "ratings = unique_interactions_df.rename(columns={'member_id': 'user', 'item_id': 'item'})\n",
+ "\n",
+ "# The example assumes a rating. Use a 1/0 rating and set all to 1 to confirm interaction.\n",
+ "ratings['rating'] = 1\n",
+ "\n",
+ "# Display the first 10 rows of the ratings DataFrame.\n",
+ "ratings.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "juCy1eUbHsuc",
- "outputId": "bf9cdb0d-cbde-4430-a133-b832d6442fde"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "member_period\n",
- "hemingway-ernest: 1921-12-28/1922-11-08 654\n",
- "hemingway-ernest: 1924-03-28/1925-03-28 1285\n",
- "Name: item_id, dtype: int64"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " member_id | \n",
+ " subscription_start | \n",
+ " subscription_end | \n",
+ " subscription_events | \n",
+ " subscription_volumes | \n",
+ " subscription_days | \n",
+ " internal_gaps | \n",
+ " known_borrows | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 23 | \n",
+ " raphael-france | \n",
+ " 1920-04-30 | \n",
+ " 1921-11-17 | \n",
+ " Subscription;Renewal;Renewal;Renewal | \n",
+ " 1.0 | \n",
+ " 566 | \n",
+ " 0;0;0 | \n",
+ " 1008 | \n",
+ "
\n",
+ " \n",
+ " 86 | \n",
+ " kittredge-eleanor-hayden | \n",
+ " 1924-01-17 | \n",
+ " 1924-05-17 | \n",
+ " Subscription;Renewal | \n",
+ " 2.0 | \n",
+ " 121 | \n",
+ " 0 | \n",
+ " 583 | \n",
+ "
\n",
+ " \n",
+ " 89 | \n",
+ " kittredge-eleanor-hayden | \n",
+ " 1929-09-10 | \n",
+ " 1929-12-10 | \n",
+ " Subscription | \n",
+ " 2.0 | \n",
+ " 91 | \n",
+ " NaN | \n",
+ " 583 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "final_model[(final_model.member_id == 'hemingway-ernest') & (final_model.model_run ==0)].groupby('member_period')['item_id'].nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {
- "id": "RWdlg9pmLadd"
- },
- "outputs": [],
- "source": [
- "subset_model = final_model[final_model.member_id == 'hemingway-ernest']"
+ "text/plain": [
+ " member_id subscription_start subscription_end \\\n",
+ "23 raphael-france 1920-04-30 1921-11-17 \n",
+ "86 kittredge-eleanor-hayden 1924-01-17 1924-05-17 \n",
+ "89 kittredge-eleanor-hayden 1929-09-10 1929-12-10 \n",
+ "\n",
+ " subscription_events subscription_volumes \\\n",
+ "23 Subscription;Renewal;Renewal;Renewal 1.0 \n",
+ "86 Subscription;Renewal 2.0 \n",
+ "89 Subscription 2.0 \n",
+ "\n",
+ " subscription_days internal_gaps known_borrows \n",
+ "23 566 0;0;0 1008 \n",
+ "86 121 0 583 \n",
+ "89 91 NaN 583 "
]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# load previously computed partial borrowers list (sequential / near sequential subscriptions collapsed)\n",
+ "partial_borrowers = pd.read_csv(DATA_DIR / 'partial_borrowers_collapsed.csv')\n",
+ "partial_borrowers.sort_values('known_borrows', ascending=False, inplace=True)\n",
+ "# parse subscription dates so we can use them to identify circulating books\n",
+ "partial_borrowers['subscription_start'] = pd.to_datetime(partial_borrowers['subscription_start'])\n",
+ "partial_borrowers['subscription_end'] = pd.to_datetime(partial_borrowers['subscription_end'])\n",
+ "partial_borrowers.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# generate subset of events dataset with dates, for use in identifying books \n",
+ "# in circulation during and before these subscriptions\n",
+ "\n",
+ "dated_events_df = events_df.copy()\n",
+ "dated_events_df['start_date_dt'] = pd.to_datetime(dated_events_df['start_date'], errors='coerce')\n",
+ "dated_events_df['end_date_dt'] = pd.to_datetime(dated_events_df['end_date'], errors='coerce')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2IwUVNfddJ4_"
+ },
+ "source": [
+ "## Fit initial model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vYH5o_UqdV2H"
+ },
+ "source": [
+ "### Run Model Comparisons"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# Define constants\n",
+ "N_RECOMMENDATIONS = 20\n",
+ "\n",
+ "def get_item_ids(user_id: str, bookless_sub: pd.Series, dated_events_df: pd.DataFrame, events_df: pd.DataFrame) -> List[str]:\n",
+ "\t\"\"\"\n",
+ "\tGet the item IDs for books with an event dated before the end of the subscription period, excluding books already linked to the member\n",
+ "\n",
+ "\tParameters\n",
+ "\t----------\n",
+ "\tuser_id : str\n",
+ "\t\tThe user ID for the member\n",
+ "\tbookless_sub : pd.Series\n",
+ "\t\tA row exposing a subscription_end attribute (a Series, or an itertuples row as passed by run_model_comparisons)\n",
+ "\tdated_events_df : pd.DataFrame\n",
+ "\t\tA DataFrame with the events data and dates\n",
+ "\tevents_df : pd.DataFrame\n",
+ "\t\tA DataFrame with the events data\n",
+ "\t\n",
+ "\tReturns\n",
+ "\t-------\n",
+ "\tList[str]\n",
+ "\t\tA list of item IDs with an event dated before the subscription end, minus items already linked to the member\n",
+ "\t\n",
+ "\t\"\"\"\n",
+ "\tcirculating_book_events = dated_events_df[(dated_events_df.start_date_dt < bookless_sub.subscription_end) | (dated_events_df.end_date_dt < bookless_sub.subscription_end)]\n",
+ "\titem_ids = circulating_book_events[circulating_book_events.item_id.notna()].item_id.unique()\n",
+ "\tmember_book_ids = events_df[(events_df.item_id.notna()) & (events_df.member_id.str.contains(user_id))].item_id.unique()\n",
+ "\tsubset_item_ids = list(set(item_ids) - set(member_book_ids))\n",
+ "\treturn subset_item_ids\n",
+ "\n",
+ "def get_predictions(user_id: str, bookless_sub: pd.Series, rec: Recommender, subset_item_ids: List[str]) -> pd.DataFrame:\n",
+ "\t\"\"\"\n",
+ "\tGet the recommendations for a user\n",
+ "\n",
+ "\tParameters\n",
+ "\t----------\n",
+ "\tuser_id : str\n",
+ "\t\tThe user ID for the member\n",
+ "\tbookless_sub : pd.Series\n",
+ "\t\tA Series with the subscription start and end dates\n",
+ "\trec : Recommender\n",
+ "\t\tThe recommender model\n",
+ "\tsubset_item_ids : List[str]\n",
+ "\t\tA list of item IDs for books that were in circulation during the subscription period\n",
+ "\t\n",
+ "\tReturns\n",
+ "\t-------\n",
+ "\tpd.DataFrame\n",
+ "\t\tA DataFrame with the recommendations for the user\n",
+ "\n",
+ "\t\"\"\"\n",
+ "\tpredictions = rec.recommend(user_id, candidates=subset_item_ids)\n",
+ "\tpredictions['member_id'] = user_id\n",
+ "\tpredictions['subscription_start'] = bookless_sub.subscription_start\n",
+ "\tpredictions['subscription_end'] = bookless_sub.subscription_end\n",
+ "\tpredictions.rename(columns={'item': 'item_id'}, inplace=True)\n",
+ "\treturn predictions\n",
+ "\n",
+ "def run_model_comparisons(number_of_runs: List[int], return_scores: bool, output_path: str, members: List[str]) -> pd.DataFrame:\n",
+ "\t\"\"\"\n",
+ "\tRun model comparisons for a list of run lengths\n",
+ "\n",
+ "\tParameters\n",
+ "\t----------\n",
+ "\tnumber_of_runs : List[int]\n",
+ "\t\tA list of run lengths\n",
+ "\treturn_scores : bool\n",
+ "\t\tIf True, merge the individual per-run scores into the summary metrics that are returned\n",
+ "\toutput_path : str\n",
+ "\t\tPath to the CSV of results; if the file already exists it is loaded instead of rerunning the models\n",
+ "\tmembers : List[str]\n",
+ "\t\tA list of member IDs\n",
+ "\t\n",
+ "\tReturns\n",
+ "\t-------\n",
+ "\tpd.DataFrame\n",
+ "\t\tA DataFrame with the model comparisons\n",
+ "\t\"\"\"\n",
+ "\tif os.path.exists(output_path):\n",
+ "\t\tcompare_models = pd.read_csv(output_path)\n",
+ "\telse: \n",
+ "\t\tmodel_runs=[]\n",
+ "\t\tfor run_length in number_of_runs:\n",
+ "\t\t\tall_recs = []\n",
+ "\t\t\tfor index in tqdm(range(run_length)):\n",
+ "\t\t\t\trec = Recommender.adapt(als.ImplicitMF(50, use_ratings=False))\n",
+ "\t\t\t\trec.fit(ratings)\n",
+ "\t\t\t\tpopular = Recommender.adapt(basic.Popular())\n",
+ "\t\t\t\tpopular.fit(ratings)\n",
+ "\t\t\t\tfor bookless_sub in list(partial_borrowers.itertuples()):\n",
+ "\t\t\t\t\tuser_id = bookless_sub.member_id\n",
+ "\t\t\t\t\tif user_id in members:\n",
+ "\t\t\t\t\t\tsubset_item_ids = get_item_ids(user_id, bookless_sub, dated_events_df, events_df)\n",
+ "\t\t\t\t\t\tpredictions = get_predictions(user_id, bookless_sub, rec, subset_item_ids)\n",
+ "\t\t\t\t\t\tpredictions['model_run'] = index\n",
+ "\t\t\t\t\t\tall_recs.append(predictions)\n",
+ "\t\t\tall_recs_df = pd.concat(all_recs)\n",
+ "\t\t\tmetrics_df = all_recs_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score': [np.median, 'skew', 'std', 'var']}).reset_index()\n",
+ "\t\t\tmetrics_df.columns = list(map(''.join, metrics_df.columns.values))\n",
+ "\t\t\tmetrics_df.columns = [col if 'score' not in col else col.split('score')[1] for col in metrics_df.columns ]\n",
+ "\t\t\tkurt_df = all_recs_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].apply(pd.DataFrame.kurt).reset_index(name='kurtosis')\n",
+ "\t\t\tfinal_df = pd.merge(metrics_df, kurt_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])\n",
+ "\t\t\tfinal_df['model_loops'] = run_length\n",
+ "\t\t\tif return_scores:\n",
+ "\t\t\t\tfinal_df = pd.merge(final_df, all_recs_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'], how='left')\n",
+ "\t\t\tmodel_runs.append(final_df)\n",
+ "\t\t\tcompare_models = pd.concat(model_runs)\n",
+ "\t\t\tcompare_models.to_csv(output_path, index=False)\n",
+ "\treturn compare_models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "Cpeze52F_UpV"
+ },
+ "outputs": [],
+ "source": [
+ "# specify run size in number of runs\n",
+ "number_of_runs = [10,20,50,100, 200]\n",
+ "# specify members\n",
+ "members = ['kittredge-eleanor-hayden', 'colens-fernand', 'raphael-france', 'hemingway-ernest']\n",
+ "compare_models = run_model_comparisons(number_of_runs, False, './data/lenskit_comparison_model_runs.csv', members)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "COMVpXwHdZPp"
+ },
+ "source": [
+ "### Visualize stability of model scores"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "5ozr3kDtaM6U"
+ },
+ "outputs": [],
+ "source": [
+ "compare_models['member_period'] = compare_models.member_id + ': ' + compare_models.subscription_start.astype(str) + '/' + compare_models.subscription_end.astype(str)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "dVc1uhYTaSek"
+ },
+ "outputs": [],
+ "source": [
+ "def sample_scores(df: pd.DataFrame, get_top: bool, numb_of_books: int) -> pd.DataFrame:\n",
+ " \"\"\"\n",
+ " This function samples scores from a DataFrame for a given number of books and periods.\n",
+ " \n",
+ " Parameters:\n",
+ " df (pd.DataFrame): The DataFrame containing the scores.\n",
+ " get_top (bool): If True, the function will return the top scores. If False, it will return random scores.\n",
+ " numb_of_books (int): The number of books to sample scores for.\n",
+ " \n",
+ " Returns:\n",
+ " pd.DataFrame: A DataFrame containing the sampled scores.\n",
+ " \"\"\"\n",
+ " \n",
+ " # Get the unique periods from the DataFrame.\n",
+ " periods = df.member_period.unique().tolist()\n",
+ " \n",
+ " # Initialize an empty list to store the DataFrames for each period.\n",
+ " visualize_df = []\n",
+ " \n",
+ " # For each period...\n",
+ " for period in periods:\n",
+ " # Initialize an empty list to store the books for this period.\n",
+ " final_books = []\n",
+ " \n",
+ " # Get the rows from the DataFrame for this period.\n",
+ " rows = df[df.member_period == period]\n",
+ " \n",
+ " # Get the unique loop numbers from the rows.\n",
+ " loops = rows.model_loops.unique().tolist()\n",
+ " \n",
+ " # While the number of books is less than the specified number...\n",
+ " while len(final_books) < numb_of_books:\n",
+ " # For each loop...\n",
+ " for loop in loops:\n",
+ " # Get the rows for this loop.\n",
+ " final_rows = rows[rows.model_loops == loop]\n",
+ " \n",
+ " # If get_top is True, sort the rows by median score in descending order and get the top books.\n",
+ " # Otherwise, get a random sample of books.\n",
+ " if get_top:\n",
+ " final_rows = final_rows.sort_values(by='median', ascending=False)\n",
+ " books = final_rows[0:numb_of_books].item_id.unique().tolist()\n",
+ " else:\n",
+ " books = rows.item_id.sample(n=numb_of_books).reset_index()\n",
+ " books = books.item_id.unique().tolist()\n",
+ " \n",
+ " # If the number of books is less than the specified number, add more books until the number is reached.\n",
+ " increment = numb_of_books\n",
+ " while len(books) < numb_of_books:\n",
+ " increment = increment + 1\n",
+ " books = final_rows[0:increment].item_id.unique().tolist()\n",
+ " \n",
+ " # Add the books to the list of books for this period.\n",
+ " final_books.extend(books)\n",
+ " \n",
+ " # Remove duplicate books from the list.\n",
+ " final_books = list(set(final_books))\n",
+ " \n",
+ " # Add the rows for the books in the list to the list of DataFrames.\n",
+ " visualize_df.append(rows[rows.item_id.isin(set(final_books))])\n",
+ " \n",
+ " # Concatenate the DataFrames in the list into a single DataFrame.\n",
+ " final_df = pd.concat(visualize_df)\n",
+ " \n",
+ " return final_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "a5hqOlTNbHgR"
+ },
+ "outputs": [],
+ "source": [
+ "def visualize_model_stability(df: pd.DataFrame, get_top: bool, numb_of_books: int) -> alt.Chart:\n",
+ " \"\"\"\n",
+ " This function visualizes the stability of a model by creating box plots and scatter plots of various score distribution metrics.\n",
+ " \n",
+ " Parameters:\n",
+ " df (pd.DataFrame): The DataFrame containing the scores.\n",
+ " get_top (bool): If True, the function will return the top scores. If False, it will return random scores.\n",
+ " numb_of_books (int): The number of books to sample scores for.\n",
+ " \n",
+ " Returns:\n",
+ " alt.Chart: A concatenated Altair chart containing the box plots and scatter plots.\n",
+ " \"\"\"\n",
+ " \n",
+ " # Sample scores from the DataFrame.\n",
+ " sample_df = sample_scores(df, get_top, numb_of_books)\n",
+ " \n",
+ " # Define the distribution metrics to be used.\n",
+ " distribution_metrics = ['median', 'skew', 'std', 'var', 'kurtosis']\n",
+ " \n",
+ " # Normalize the distribution metrics in the sample DataFrame using MinMaxScaler.\n",
+ " sample_df[distribution_metrics] = MinMaxScaler().fit_transform(sample_df[distribution_metrics])\n",
+ " \n",
+ " # Melt the sample DataFrame to a long format for visualization.\n",
+ " melted_sample = pd.melt(sample_df, id_vars=['member_id', 'subscription_start', 'subscription_end', 'item_id', 'model_loops', \n",
+ " 'member_period'], value_vars=['median', 'skew', 'std', 'var', 'kurtosis'])\n",
+ "\n",
+ " # Create a box plot of the distribution metrics.\n",
+ " boxplot = alt.Chart(melted_sample).mark_boxplot().encode(\n",
+ " x= alt.X('model_loops:O', axis=alt.Axis(title='')),\n",
+ " y=alt.Y('value', axis=alt.Axis(title='')),\n",
+ " column=alt.Column('variable', title=''),\n",
+ " ).properties(title = \"Variability with Box and Whiskers\")\n",
+ "\n",
+ " # Create a scatter plot of the distribution metrics.\n",
+ " points = alt.Chart(melted_sample).mark_circle().encode(\n",
+ " x= alt.X('model_loops:O', axis=alt.Axis(title='')),\n",
+ " y=alt.Y('value', axis=alt.Axis(title='')),\n",
+ " color=alt.Color('variable', legend=alt.Legend(title=['Measure of', 'Score Variability'])), \n",
+ " column=alt.Column('variable', title='')\n",
+ " ).properties(title = \"Variability with Score Distributions\")\n",
+ "\n",
+ " # Concatenate the box plot and scatter plot horizontally and return the result.\n",
+ " return alt.hconcat(boxplot, points).properties(title='Variability in Predicted Scores By Resampling Implicit Matrix Factorization Model ')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 428
},
+ "id": "HyS03DjrbWKs",
+ "outputId": "ce98fa42-3e18-4f1b-8a69-a36cb9d9fc3f"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 179
- },
- "id": "DIh_AnSMGGbt",
- "outputId": "d670a3a7-83e0-4774-86a1-846619e5425d"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " member_id | \n",
- " subscription_start | \n",
- " subscription_end | \n",
- " item_id | \n",
- " median | \n",
- " skew | \n",
- " std | \n",
- " var | \n",
- " kurtosis | \n",
- " model_loops | \n",
- " score | \n",
- " model_run | \n",
- " member_period | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 7200 | \n",
- " hemingway-ernest | \n",
- " 1921-12-28 | \n",
- " 1922-11-08 | \n",
- " burney-evelina-history-young | \n",
- " 0.320107 | \n",
- " -0.048081 | \n",
- " 0.18916 | \n",
- " 0.035781 | \n",
- " 0.023806 | \n",
- " 100 | \n",
- " 0.809843 | \n",
- " 0 | \n",
- " hemingway-ernest: 1921-12-28/1922-11-08 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " member_id subscription_start subscription_end \\\n",
- "7200 hemingway-ernest 1921-12-28 1922-11-08 \n",
- "\n",
- " item_id median skew std var \\\n",
- "7200 burney-evelina-history-young 0.320107 -0.048081 0.18916 0.035781 \n",
- "\n",
- " kurtosis model_loops score model_run \\\n",
- "7200 0.023806 100 0.809843 0 \n",
- "\n",
- " member_period \n",
- "7200 hemingway-ernest: 1921-12-28/1922-11-08 "
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "top_results = sample_scores(subset_model, True, 36)\n",
- "top_results[0:1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "id": "D7ACefq438yi"
- },
- "outputs": [],
- "source": [
- "def get_formatted_titles(row):\n",
- "\n",
- " item = books_df[books_df.item_id == row.item_id]\n",
- " if item.author.isna().any() == False:\n",
- " author = ' '.join(item.author.str.split(',').values[0][::-1])\n",
- " author = ' by' + author\n",
- " else: \n",
- " author = '(Periodical)'\n",
- " title = item.title.values[0]\n",
- " return title + author"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {
- "id": "CGqXVIA64og0"
- },
- "outputs": [],
- "source": [
- "top_results['formatted_title'] = top_results.apply(get_formatted_titles, axis=1)"
+ "text/plain": [
+ "alt.HConcatChart(...)"
]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chart = visualize_model_stability(compare_models, True, 10)\n",
+ "chart.configure_axisX(\n",
+ " labelAngle=0\n",
+ ").configure_title(anchor='middle')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DwEwSsFdjD7r"
+ },
+ "source": [
+ "### Select Optimal Model and Generate Item Scores"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "id": "_Z3-aAZHdr4p"
+ },
+ "outputs": [],
+ "source": [
+ "final_run = [100]\n",
+ "members = ['hemingway-ernest']\n",
+ "final_model = run_model_comparisons(final_run, True, f'./data/lenskit_model{str(final_run[0])}_scores.csv', members)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "id": "BOacDcQIjNb9"
+ },
+ "outputs": [],
+ "source": [
+ "member_subscriptions = final_model[['member_id', 'subscription_start', 'subscription_end']].drop_duplicates()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "XVVmXsmvrNus",
+ "outputId": "e0d5e8e3-d06e-4937-dc56-bd4ebcfec1a0"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {
- "id": "nqiDWqem6Xw1"
- },
- "outputs": [],
- "source": [
- "top_results['period'] = top_results.member_period.str.split(':').str[1]"
+ "data": {
+ "text/plain": [
+ "member_id subscription_start subscription_end\n",
+ "hemingway-ernest 1924-03-28 1925-03-28 128500\n",
+ " 1921-12-28 1922-11-08 65400\n",
+ "Name: count, dtype: int64"
]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "final_model[['member_id', 'subscription_start', 'subscription_end']].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "id": "Uluk5V6yF9xR"
+ },
+ "outputs": [],
+ "source": [
+ "final_model['member_period'] = final_model.member_id + ': ' + final_model.subscription_start.astype(str) + '/' + final_model.subscription_end.astype(str)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "juCy1eUbHsuc",
+ "outputId": "bf9cdb0d-cbde-4430-a133-b832d6442fde"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "zbG_i17NCNpn",
- "outputId": "11ee943b-20db-47fc-f627-2e01ab4ab4da"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(59, 52)"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(top_results.item_id.unique()), len(top_results[top_results.formatted_title.str.contains('Periodical') == False].item_id.unique())"
+ "data": {
+ "text/plain": [
+ "member_period\n",
+ "hemingway-ernest: 1921-12-28/1922-11-08 654\n",
+ "hemingway-ernest: 1924-03-28/1925-03-28 1285\n",
+ "Name: item_id, dtype: int64"
]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "final_model[(final_model.member_id == 'hemingway-ernest') & (final_model.model_run ==0)].groupby('member_period')['item_id'].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "id": "RWdlg9pmLadd"
+ },
+ "outputs": [],
+ "source": [
+ "subset_model = final_model[final_model.member_id == 'hemingway-ernest']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 179
},
+ "id": "DIh_AnSMGGbt",
+ "outputId": "d670a3a7-83e0-4774-86a1-846619e5425d"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "jGrhrzCdLQ7X",
- "outputId": "8fe0f047-a8ba-4e1f-b13b-189c55e7533e"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "period\n",
- " 1921-12-28/1922-11-08 36\n",
- " 1924-03-28/1925-03-28 36\n",
- "Name: item_id, dtype: int64"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " member_id | \n",
+ " subscription_start | \n",
+ " subscription_end | \n",
+ " item_id | \n",
+ " median | \n",
+ " skew | \n",
+ " std | \n",
+ " var | \n",
+ " kurtosis | \n",
+ " model_loops | \n",
+ " score | \n",
+ " model_run | \n",
+ " member_period | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 7200 | \n",
+ " hemingway-ernest | \n",
+ " 1921-12-28 | \n",
+ " 1922-11-08 | \n",
+ " burney-evelina-history-young | \n",
+ " 0.320107 | \n",
+ " -0.048081 | \n",
+ " 0.18916 | \n",
+ " 0.035781 | \n",
+ " 0.023806 | \n",
+ " 100 | \n",
+ " 0.809843 | \n",
+ " 0 | \n",
+ " hemingway-ernest: 1921-12-28/1922-11-08 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "source": [
- "top_results.groupby('period').item_id.nunique()"
+ "text/plain": [
+ " member_id subscription_start subscription_end \\\n",
+ "7200 hemingway-ernest 1921-12-28 1922-11-08 \n",
+ "\n",
+ " item_id median skew std var \\\n",
+ "7200 burney-evelina-history-young 0.320107 -0.048081 0.18916 0.035781 \n",
+ "\n",
+ " kurtosis model_loops score model_run \\\n",
+ "7200 0.023806 100 0.809843 0 \n",
+ "\n",
+ " member_period \n",
+ "7200 hemingway-ernest: 1921-12-28/1922-11-08 "
]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "top_results = sample_scores(subset_model, True, 36)\n",
+ "top_results[0:1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "id": "D7ACefq438yi"
+ },
+ "outputs": [],
+ "source": [
+ "def get_formatted_titles(row):\n",
+ "\n",
+ " item = books_df[books_df.id == row.item_id]\n",
+ " if item.author.isna().any() == False:\n",
+ " author = ' '.join(item.author.str.split(',').values[0][::-1])\n",
+ " author = ' by' + author\n",
+ " else: \n",
+ " author = '(Periodical)'\n",
+ " title = item.title.values[0]\n",
+ " return title + author"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "id": "CGqXVIA64og0"
+ },
+ "outputs": [],
+ "source": [
+ "top_results['formatted_title'] = top_results.apply(get_formatted_titles, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "id": "nqiDWqem6Xw1"
+ },
+ "outputs": [],
+ "source": [
+ "top_results['period'] = top_results.member_period.str.split(':').str[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "zbG_i17NCNpn",
+ "outputId": "11ee943b-20db-47fc-f627-2e01ab4ab4da"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "05283Ys2LQiM",
- "outputId": "cc89e25b-f9da-4090-ad54-443fa723bee5"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "period\n",
- " 1921-12-28/1922-11-08 34\n",
- " 1924-03-28/1925-03-28 29\n",
- "Name: item_id, dtype: int64"
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "top_results[top_results.formatted_title.str.contains('Periodical') == False].groupby('period').item_id.nunique()"
+ "data": {
+ "text/plain": [
+ "(59, 52)"
]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(top_results.item_id.unique()), len(top_results[top_results.formatted_title.str.contains('Periodical') == False].item_id.unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "jGrhrzCdLQ7X",
+ "outputId": "8fe0f047-a8ba-4e1f-b13b-189c55e7533e"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {
- "id": "3J07CDIr0Xx1"
- },
- "outputs": [],
- "source": [
- "items = top_results[top_results.member_id == 'hemingway-ernest'].groupby(['period','item_id'])['score'].mean().reset_index(name='avg').sort_values(by='avg', ascending=False)"
+ "data": {
+ "text/plain": [
+ "period\n",
+ "1921-12-28/1922-11-08 36\n",
+ "1924-03-28/1925-03-28 36\n",
+ "Name: item_id, dtype: int64"
]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "top_results.groupby('period').item_id.nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "05283Ys2LQiM",
+ "outputId": "cc89e25b-f9da-4090-ad54-443fa723bee5"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "TCIh8hqRDsu3",
- "outputId": "930d1695-15b7-4bcc-c61f-9efe77d6d108"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "period\n",
- " 1921-12-28/1922-11-08 36\n",
- " 1924-03-28/1925-03-28 36\n",
- "dtype: int64"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "items.groupby(['period']).size()"
+ "data": {
+ "text/plain": [
+ "period\n",
+ "1921-12-28/1922-11-08 34\n",
+ "1924-03-28/1925-03-28 29\n",
+ "Name: item_id, dtype: int64"
]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "top_results[top_results.formatted_title.str.contains('Periodical') == False].groupby('period').item_id.nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "id": "3J07CDIr0Xx1"
+ },
+ "outputs": [],
+ "source": [
+ "items = top_results[top_results.member_id == 'hemingway-ernest'].groupby(['period','item_id'])['score'].mean().reset_index(name='avg').sort_values(by='avg', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "TCIh8hqRDsu3",
+ "outputId": "930d1695-15b7-4bcc-c61f-9efe77d6d108"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {
- "id": "x6an0dLz2mcR"
- },
- "outputs": [],
- "source": [
- "members = top_results.member_id.unique().tolist()\n",
- "charts = []\n",
- "members=['hemingway-ernest']\n",
- "for member in members:\n",
- " full_name = members_df[members_df.id == member].name.values[0]\n",
- " tickplot = alt.Chart(top_results[(top_results.member_id == member)]).mark_tick(opacity=0.7).encode(\n",
- " y=alt.Y('formatted_title', sort='-x', axis=alt.Axis(title=\"Predicted Book\")),\n",
- " x='score',\n",
- " color=alt.Color('period:N', legend=alt.Legend(title=\"Missing Borrowing Records Period\")),\n",
- " # facet='member_period:N'\n",
- " ).properties(\n",
- " title=f'Top Predictions by Implicit Matrix Factorization Model',\n",
- " width=300\n",
- " )\n",
- " charts.append(tickplot)\n"
+ "data": {
+ "text/plain": [
+ "period\n",
+ "1921-12-28/1922-11-08 36\n",
+ "1924-03-28/1925-03-28 36\n",
+ "dtype: int64"
]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "items.groupby(['period']).size()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "id": "x6an0dLz2mcR"
+ },
+ "outputs": [],
+ "source": [
+ "members = top_results.member_id.unique().tolist()\n",
+ "charts = []\n",
+ "members=['hemingway-ernest']\n",
+ "for member in members:\n",
+ " full_name = members_df[members_df.id == member].name.values[0]\n",
+ " tickplot = alt.Chart(top_results[(top_results.member_id == member)]).mark_tick(opacity=0.7).encode(\n",
+ " y=alt.Y('formatted_title', sort='-x', axis=alt.Axis(title=\"Predicted Book\")),\n",
+ " x='score',\n",
+ " color=alt.Color('period:N', legend=alt.Legend(title=\"Missing Borrowing Records Period\")),\n",
+ " # facet='member_period:N'\n",
+ " ).properties(\n",
+ " title=f'Top Predictions by Implicit Matrix Factorization Model',\n",
+ " width=300\n",
+ " )\n",
+ " charts.append(tickplot)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
},
+ "id": "1uLzE3Cs6IeL",
+ "outputId": "18ffe8c3-162d-4c10-dcbe-40904443e55d"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000
- },
- "id": "1uLzE3Cs6IeL",
- "outputId": "18ffe8c3-162d-4c10-dcbe-40904443e55d"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- ""
- ],
- "text/plain": [
- "alt.Chart(...)"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ ""
],
- "source": [
- "charts[0].configure_axisY(\n",
- " titleAngle=0,\n",
- " titleAlign=\"left\",\n",
- " titleY=-10,\n",
- " titleX=-100,\n",
- " labelLimit=1000\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000
- },
- "id": "iFNwpGKN3eqB",
- "outputId": "c2c1175c-8dcc-4c0b-bb20-afa24c87565c"
- },
- "outputs": [],
- "source": [
- "# alt.hconcat(*charts).configure_axisY(\n",
- "# titleAngle=0,\n",
- "# titleAlign=\"left\",\n",
- "# titleY=-10,\n",
- "# titleX=-10,\n",
- "# labelLimit=1000\n",
- "# )"
+ "text/plain": [
+ "alt.Chart(...)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "mQdXDLaMGLq_"
- },
- "outputs": [],
- "source": [
- "# # [top_results.member_period == 'colens-fernand: 1920-04-01/1920-07-07']\n",
- "# tickplot = alt.Chart(top_results).mark_tick(opacity=0.7).encode(\n",
- "# y=alt.Y('item_id', sort='-x'),\n",
- "# x='score',\n",
- "# color='member_period:N',\n",
- "# # facet='member_period:N'\n",
- "# )\n",
- "# tickplot"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {},
- "outputs": [],
- "source": [
- "top_results[top_results.formatted_title.str.contains('Periodical') == False].to_csv('./data/top_scores_lenskit_model100.csv', index=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {
- "id": "GY36uXxDrkhb"
- },
- "outputs": [],
- "source": [
- "final_model['zscore'] = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].transform(lambda x : zscore(x,ddof=1))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {
- "id": "GyXSOx3FsXwA"
- },
- "outputs": [],
- "source": [
- "top_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score':'max'})[['score']].reset_index()\n",
- "top_scores = pd.merge(top_scores, final_model, on=top_scores.columns.tolist(), how='inner')\n",
- "\n",
- "top_scores = top_scores.rename(columns={'score': 'top_score', 'zscore' : 'top_zscore'})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {
- "id": "bUKA8e8evqdZ"
- },
- "outputs": [],
- "source": [
- "avg_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].mean().reset_index(name='avg_score')\n",
- "scores_df = pd.merge(top_scores, avg_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {
- "id": "TcnFmCfsb3vm"
- },
- "outputs": [],
- "source": [
- "std_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score':'std'}).reset_index()\n",
- "std_scores = std_scores.rename(columns={'score': 'std_score'})\n",
- "scores_df = pd.merge(scores_df, std_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {
- "id": "OuXdNkW2-Qje"
- },
- "outputs": [],
- "source": [
- "median_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].median().reset_index(name='median_score')\n",
- "scores_df = pd.merge(scores_df, median_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {
- "id": "8JYQlLyL1Exy"
- },
- "outputs": [],
- "source": [
- "import scipy\n",
- "mode_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].agg(lambda x: scipy.stats.mode(x)[0][0]).reset_index()\n",
- "mode_scores = pd.merge(mode_scores, final_model, on=mode_scores.columns.tolist(), how='inner')\n",
- "mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore' : 'mode_zscore'})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {
- "id": "VUA6CLNT70pn"
- },
- "outputs": [],
- "source": [
- "final_scores = pd.merge(mode_scores[['member_id', 'subscription_start', 'subscription_end', 'item_id', 'mode_score', 'mode_zscore']], scores_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "qfaWBviL9E75"
- },
- "outputs": [],
- "source": [
- "# for index, group in member_subscriptions.iterrows():\n",
- "# print(group.to_dict())\n",
- "# rows = final_scores[(final_scores.member_id == group.member_id) & (final_scores.subscription_start == group.subscription_start) & (final_scores.subscription_end == group.subscription_end)]\n",
- "# print('top_scores:', rows.sort_values(by=['top_score'], ascending=False)[0:5][['item_id', 'top_score']].to_dict())\n",
- "# print('avg_scores:', rows.sort_values(by=['avg_score'], ascending=False)[0:5][['item_id', 'avg_score']].to_dict())\n",
- "# print('mode_scores:', rows.sort_values(by=['mode_score'], ascending=False)[0:5][['item_id', 'mode_score']].to_dict())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {
- "id": "k_bFoYYh-ZR7"
- },
- "outputs": [],
- "source": [
- "final_scores.to_csv(f'./data/collapsed_lenskit_model{str(final_run[0])}_scores.csv', index=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ],
- "metadata": {
+ ],
+ "source": [
+ "charts[0].configure_axisY(\n",
+ " titleAngle=0,\n",
+ " titleAlign=\"left\",\n",
+ " titleY=-10,\n",
+ " titleX=-100,\n",
+ " labelLimit=1000\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {
"colab": {
- "collapsed_sections": [],
- "name": "lenskit_model_scores_stability.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
},
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.18"
+ "id": "iFNwpGKN3eqB",
+ "outputId": "c2c1175c-8dcc-4c0b-bb20-afa24c87565c"
+ },
+ "outputs": [],
+ "source": [
+ "# alt.hconcat(*charts).configure_axisY(\n",
+ "# titleAngle=0,\n",
+ "# titleAlign=\"left\",\n",
+ "# titleY=-10,\n",
+ "# titleX=-10,\n",
+ "# labelLimit=1000\n",
+ "# )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "id": "mQdXDLaMGLq_"
+ },
+ "outputs": [],
+ "source": [
+ "# # [top_results.member_period == 'colens-fernand: 1920-04-01/1920-07-07']\n",
+ "# tickplot = alt.Chart(top_results).mark_tick(opacity=0.7).encode(\n",
+ "# y=alt.Y('item_id', sort='-x'),\n",
+ "# x='score',\n",
+ "# color='member_period:N',\n",
+ "# # facet='member_period:N'\n",
+ "# )\n",
+ "# tickplot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "top_results[top_results.formatted_title.str.contains('Periodical') == False].to_csv('./data/top_scores_lenskit_model100.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "id": "GY36uXxDrkhb"
+ },
+ "outputs": [],
+ "source": [
+ "final_model['zscore'] = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].transform(lambda x : zscore(x,ddof=1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "id": "GyXSOx3FsXwA"
+ },
+ "outputs": [],
+ "source": [
+ "top_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score':'max'})[['score']].reset_index()\n",
+ "top_scores = pd.merge(top_scores, final_model, on=top_scores.columns.tolist(), how='inner')\n",
+ "\n",
+ "top_scores = top_scores.rename(columns={'score': 'top_score', 'zscore' : 'top_zscore'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "id": "bUKA8e8evqdZ"
+ },
+ "outputs": [],
+ "source": [
+ "avg_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].mean().reset_index(name='avg_score')\n",
+ "scores_df = pd.merge(top_scores, avg_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {
+ "id": "TcnFmCfsb3vm"
+ },
+ "outputs": [],
+ "source": [
+ "std_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score':'std'}).reset_index()\n",
+ "std_scores = std_scores.rename(columns={'score': 'std_score'})\n",
+ "scores_df = pd.merge(scores_df, std_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {
+ "id": "OuXdNkW2-Qje"
+ },
+ "outputs": [],
+ "source": [
+ "median_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].median().reset_index(name='median_score')\n",
+ "scores_df = pd.merge(scores_df, median_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {
+ "id": "8JYQlLyL1Exy"
+ },
+ "outputs": [],
+ "source": [
+ "import scipy\n",
+ "mode_scores = final_model.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].agg(lambda x: scipy.stats.mode(x)[0]).reset_index()\n",
+ "mode_scores = pd.merge(mode_scores, final_model, on=mode_scores.columns.tolist(), how='inner')\n",
+ "mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore' : 'mode_zscore'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {
+ "id": "VUA6CLNT70pn"
+ },
+ "outputs": [],
+ "source": [
+ "final_scores = pd.merge(mode_scores[['member_id', 'subscription_start', 'subscription_end', 'item_id', 'mode_score', 'mode_zscore']], scores_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {
+ "id": "qfaWBviL9E75"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Predictions for hemingway-ernest, subscription 1921-12-28 to 1922-11-08\n",
+ "\n",
+ "Top scores:\n",
+ " item_id top_score\n",
+ " joyce-exiles 1.326223\n",
+ "mansfield-bliss-short-stories 1.183781\n",
+ " joyce-portrait-artist-young 1.162345\n",
+ " oneill-beyond-horizon 1.095982\n",
+ " cather-antonia 1.093625\n",
+ "\n",
+ "Average scores:\n",
+ " item_id avg_score\n",
+ " oneill-beyond-horizon 0.546617\n",
+ " conrad-shadow-line-confession 0.510644\n",
+ "twain-adventures-huckleberry-finn 0.421426\n",
+ " morris-specimens-early-english 0.402074\n",
+ " eastman-enjoyment-poetry 0.373157\n",
+ "\n",
+ "Mode scores:\n",
+ " item_id mode_score\n",
+ "morris-specimens-early-english 0.073113\n",
+ " conrad-shadow-line-confession 0.069347\n",
+ " schreiner-story-african-farm 0.064250\n",
+ " frank-rahab 0.058457\n",
+ " saltus-paliser-case 0.021517\n",
+ "\n",
+ "Predictions for hemingway-ernest, subscription 1924-03-28 to 1925-03-28\n",
+ "\n",
+ "Top scores:\n",
+ " item_id top_score\n",
+ " joyce-exiles 1.326223\n",
+ "mansfield-bliss-short-stories 1.183781\n",
+ " joyce-portrait-artist-young 1.162345\n",
+ " forster-howards-end 1.141532\n",
+ " melville-moby-dick-whale 1.139511\n",
+ "\n",
+ "Average scores:\n",
+ " item_id avg_score\n",
+ " oneill-hairy-ape 0.590482\n",
+ " criterion 0.587726\n",
+ " forster-howards-end 0.572118\n",
+ " oneill-beyond-horizon 0.546617\n",
+ "conrad-shadow-line-confession 0.510644\n",
+ "\n",
+ "Mode scores:\n",
+ " item_id mode_score\n",
+ " ponsonby-english-diaries 0.243522\n",
+ " leskov-sentry-stories 0.242126\n",
+ " meredith-amazing-marriage 0.183322\n",
+ " machen-house-souls 0.093522\n",
+ "saintsbury-collected-essays-papers 0.077510\n"
+ ]
}
+ ],
+ "source": [
+ "for index, group in member_subscriptions.iterrows():\n",
+ " print(\"\\nPredictions for %(member_id)s, subscription %(subscription_start)s to %(subscription_end)s\" %\n",
+ " group.to_dict())\n",
+ " rows = final_scores[(final_scores.member_id == group.member_id) & (final_scores.subscription_start == group.subscription_start) & (final_scores.subscription_end == group.subscription_end)]\n",
+ " top_scores = rows.sort_values(by=['top_score'], ascending=False)[0:5][['item_id', 'top_score']]\n",
+ " print(\"\\nTop scores:\")\n",
+ " print(top_scores.to_string(index=False))\n",
+ " avg_scores = rows.sort_values(by=['avg_score'], ascending=False)[0:5][['item_id', 'avg_score']]\n",
+ " print(\"\\nAverage scores:\")\n",
+ " print(avg_scores.to_string(index=False))\n",
+ " mode_scores = rows.sort_values(by=['mode_score'], ascending=False)[0:5][['item_id', 'mode_score']]\n",
+ " print(\"\\nMode scores:\")\n",
+ " print(mode_scores.to_string(index=False))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {
+ "id": "k_bFoYYh-ZR7"
+ },
+ "outputs": [],
+ "source": [
+ "final_scores.to_csv(f'./data/collapsed_lenskit_model{str(final_run[0])}_scores.csv', index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "name": "lenskit_model_scores_stability.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 0
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}
diff --git a/tests/test_utils/test_data.py b/tests/test_utils/test_data.py
index 643d465..a840ef8 100644
--- a/tests/test_utils/test_data.py
+++ b/tests/test_utils/test_data.py
@@ -35,18 +35,18 @@ def test_load_initial_data():
@patch("utils.missing_data_processing.pd")
@patch("utils.missing_data_processing.preprocess_events_data")
-@patch("utils.missing_data_processing.preprocess_books_data")
-def test_get_preprocessed_data(mock_preprocess_books, mock_preprocess_events, mock_pd):
+@patch("utils.missing_data_processing.preprocess_shxco_data")
+def test_get_preprocessed_data(mock_preprocess_shxco, mock_preprocess_events, mock_pd):
# no datasets specified: should return all
data = missing_data_processing.get_preprocessed_data()
for dataset in missing_data_processing.CSV_PATHS.keys():
assert dataset in data
assert mock_pd.read_csv.call_count == 4
mock_preprocess_events.assert_called()
- mock_preprocess_books.assert_called()
+ mock_preprocess_shxco.assert_called()
# reset mocks
- for m in [mock_preprocess_books, mock_preprocess_events, mock_pd]:
+ for m in [mock_preprocess_shxco, mock_preprocess_events, mock_pd]:
m.reset_mock()
# test loading selected datasets
@@ -55,7 +55,7 @@ def test_get_preprocessed_data(mock_preprocess_books, mock_preprocess_events, mo
assert "books" in data
assert "borrow_overrides" in data
mock_preprocess_events.assert_not_called()
- mock_preprocess_books.assert_called()
+ mock_preprocess_shxco.assert_called()
# test unknown dataset
with pytest.raises(ValueError):
diff --git a/utils/missing_data_processing.py b/utils/missing_data_processing.py
index 036ecb4..586f271 100644
--- a/utils/missing_data_processing.py
+++ b/utils/missing_data_processing.py
@@ -91,7 +91,9 @@ def get_preprocessed_data(*datasets) -> Dict[str, pd.DataFrame]:
if "events" in datasets:
data["events"] = preprocess_events_data(data["events"])
if "books" in datasets:
- data["books"] = preprocess_books_data(data["books"])
+ data["books"] = preprocess_shxco_data(data["books"])
+ if "members" in datasets:
+ data["members"] = preprocess_shxco_data(data["members"])
return data
@@ -131,24 +133,24 @@ def preprocess_events_data(events_df: pd.DataFrame) -> pd.DataFrame:
return events_df
-def preprocess_books_data(books_df: pd.DataFrame) -> pd.DataFrame:
+def preprocess_shxco_data(df: pd.DataFrame) -> pd.DataFrame:
"""
- Pre-processing for book data.
+ Pre-processing for book or member data.
- This function processes the 'books' data by generating short-form IDs
- from the longer project URIs.
+ This function processes the 'books' or 'members' data by generating
+ short-form IDs from the longer project URIs.
Args:
- books_df (pd.DataFrame): The initial 'books' DataFrame.
+ df (pd.DataFrame): The initial 'books' or 'members' DataFrame.
Returns:
- pd.DataFrame: The processed 'books' DataFrame.
+ pd.DataFrame: The processed 'books' or 'members' DataFrame.
"""
# Generate short IDs from item URIs
- books_df["id"] = books_df.uri.apply(short_id)
+ df["id"] = df.uri.apply(short_id)
- # Return the processed 'books' DataFrame.
- return books_df
+ # Return the processed DataFrame.
+ return df
def get_logbook_events(events_df: pd.DataFrame) -> pd.DataFrame: