# get_recs.py

################
### IMPORTS ####

import datetime
import json
import os
import sys
from collections import defaultdict

import pandas as pd
from surprise import Dataset, NMF, Reader

def export_preds(n_preds, orig_fname):
    """
    Format & export the predictions to a JSON file in the current directory.

    The output file is named '<timestamp>_RECS_FOR-<base name of orig_fname>'.

    Args:
        n_preds - dict; JSON-serializable predictions to write.
        orig_fname - string; name (or path) of the input file the predictions
                     were generated from. Only its final path component is used
                     in the output file name.
    """
    print("Writing results to file...")

    now = datetime.datetime.now()
    # Zero-pad hour and minute so that e.g. 1:15 and 11:05 do not both render as "115".
    stamp = f"{now.year}_{now.month}_{now.day}_{now.hour:02d}{now.minute:02d}"

    # Drop any directory part: splicing a path such as 'data/in.json' into the
    # output name would make open() target a directory that does not exist.
    base_name = os.path.basename(orig_fname)

    with open(f"{stamp}_RECS_FOR-{base_name}", "w") as outfile:
        json.dump(n_preds, outfile, indent=4)

    print("Done!")

def get_top_n(these_preds, n):
    """
    Return the top-N recommendations for each patient from a set of predictions.
    Adapted from the surprise library FAQ.

    Args:
        these_preds - iterable of 5-item predictions, each unpacking as
                      (user id, item id, true rating, estimate, details).
        n - integer; the number of recommendations to output for each user.
            Values of zero or below yield an empty list for every user.

    Returns:
        top_n - defaultdict(list); keys are user ids & values are lists of
                (item id, predicted value) 2-tuples, highest estimate first.
    """
    # A negative n would otherwise slice from the end of each list and return
    # "everything except the last |n|" instead of no recommendations.
    limit = max(n, 0)

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in these_preds:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and keep the n highest ones.
    # list.sort is stable, so equal estimates keep their original order.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda pair: pair[1], reverse=True)
        top_n[uid] = user_ratings[:limit]

    return top_n

def train_algo(this_data):
    """
    Fit a Non-negative Matrix Factorization algo to the data, then generate
    predictions for the anti-testset built from the same training set.

    Args:
        this_data - surprise.dataset; the loaded json data.

    Returns:
        predictions - surprise library object; all predictions generated by algo.
    """
    print("Running algo...")
    trainset = this_data.build_full_trainset()

    nmf_algo = NMF(biased=False, n_epochs=50, n_factors=35)
    nmf_algo.fit(trainset)

    # Announce the prediction step before it runs, so the progress output
    # reflects what is actually happening during the wait.
    print("Getting predictions...")

    # Per the surprise docs, the anti-testset holds the (user, item) pairs that
    # are absent from the training set, i.e. the candidates to recommend.
    testset = trainset.build_anti_testset()
    predictions = nmf_algo.test(testset)

    return predictions

def load_json(fname):
    """
    Unpack a JSON data file into 'Patient','Procedure','Count' rows and wrap
    them in a surprise dataset.

    Each column of the parsed file is treated as one patient, each cell in that
    column as an iterable of procedures. Repeated (patient, procedure) pairs are
    summed into a single 'Count' value.

    Args:
        fname - string; the file to load & unpack.

    Returns:
        surprise_data - surprise library object; the data from 'fname' converted to a dataset object
                        from the surprise library.
    """
    print("Loading data...")

    this_file = pd.read_json(fname)

    # One (patient, procedure, 1) row per occurrence; duplicates are summed below.
    temp_list = []

    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for patient, procedure_list in this_file.items():
        for entry in procedure_list:
            for procedure in entry:
                temp_list.append((patient, procedure, 1))

    temp_df = (
        pd.DataFrame(data=temp_list, columns=['Patient', 'Procedure', 'Count'])
        .groupby(['Patient', 'Procedure'])
        .sum()
        .reset_index()
    )

    # NOTE(review): Reader() is left at its default rating scale -- confirm that
    # scale is appropriate for procedure counts.
    reader = Reader()
    surprise_data = Dataset.load_from_df(temp_df, reader)

    return surprise_data

if __name__ == "__main__":

    USAGE = "\nPlease check your input and try again.\nExpected 'get_recs.py <file name> <integer>\n"

    # Only missing or non-integer arguments are usage errors. A bare `except`
    # around the whole pipeline would also hide real bugs and Ctrl-C behind
    # the same message.
    try:
        FNAME = sys.argv[1]
        NUM_PROCS = int(sys.argv[2])
    except (IndexError, ValueError):
        print(USAGE)
        sys.exit(1)

    # An unreadable or malformed input file is still an input problem: report
    # the cause, then show the usage hint.
    try:
        DATA = load_json(FNAME)
    except (OSError, ValueError) as err:
        print(f"Could not load '{FNAME}': {err}", file=sys.stderr)
        print(USAGE)
        sys.exit(1)

    # Failures past this point are programming/runtime errors and are left to
    # propagate with a full traceback.
    export_preds(
        n_preds=get_top_n(
            these_preds=train_algo(this_data=DATA),
            n=NUM_PROCS,
        ),
        orig_fname=FNAME,
    )