diff --git a/pypi_scout/api/main.py b/pypi_scout/api/main.py index fe7ba75..c4f2cc9 100644 --- a/pypi_scout/api/main.py +++ b/pypi_scout/api/main.py @@ -55,7 +55,7 @@ async def search(query: QueryModel, request: Request): raise HTTPException(status_code=400, detail="top_k cannot be larger than 100.") logging.info(f"Searching for similar projects. Query: '{query.query}'") - df_matches = vector_database.find_similar(query.query, top_k=query.top_k * 2) + df_matches = vector_database.find_similar(query.query, top_k=int(query.top_k * 3)) df_matches = df_matches.join(df_packages, how="left", on="name") logging.info( f"Fetched the {len(df_matches)} most similar projects. Calculating the weighted scores and filtering..." diff --git a/pypi_scout/utils/score_calculator.py b/pypi_scout/utils/score_calculator.py index f8cf338..ab2f9b5 100644 --- a/pypi_scout/utils/score_calculator.py +++ b/pypi_scout/utils/score_calculator.py @@ -1,3 +1,4 @@ +import numpy as np import polars as pl @@ -7,30 +8,33 @@ def calculate_score( """ Calculate a combined score for packages based on similarity and weekly downloads. - This function ranks the entries according to the 'similarity' and 'weekly_downloads' columns, normalizes these - ranks to a [0, 1] scale, and computes a combined score using the provided weights for similarity and weekly downloads. - The combined score helps in recommending packages that are both popular and relevant based on similarity. + This function normalizes the 'similarity' and 'weekly_downloads' columns to a [0, 1] scale, + and computes a combined score using the provided weights for similarity and weekly downloads. + The combined score helps in recommending packages that are both popular and relevant based on similarity. + Args: + df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns. + weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5. + weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5. - Args: - df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns. - weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5. - weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5. - + Returns: + pl.DataFrame: DataFrame with the combined score and sorted by this score in descending order. """ + print(df.sort("weekly_downloads", descending=True).head(10)["name"]) df = df.with_columns( - rank_similarity=pl.col("similarity").rank("dense", descending=False), - rank_weekly_downloads=pl.col("weekly_downloads").rank("dense", descending=False), + log_weekly_downloads=pl.col("weekly_downloads").apply(lambda x: np.log1p(x)) # log1p is log(1 + x) ) df = df.with_columns( - normalized_similarity=(pl.col("rank_similarity") - 1) / (df["rank_similarity"].max() - 1), - normalized_weekly_downloads=(pl.col("rank_weekly_downloads") - 1) / (df["rank_weekly_downloads"].max() - 1), + normalized_similarity=(pl.col("similarity") - pl.col("similarity").min()) + / (pl.col("similarity").max() - pl.col("similarity").min()), + normalized_log_weekly_downloads=(pl.col("log_weekly_downloads") - pl.col("log_weekly_downloads").min()) + / (pl.col("log_weekly_downloads").max() - pl.col("log_weekly_downloads").min()), ) df = df.with_columns( score=weight_similarity * pl.col("normalized_similarity") - + weight_weekly_downloads * pl.col("normalized_weekly_downloads") + + weight_weekly_downloads * pl.col("normalized_log_weekly_downloads") ) df = df.sort("score", descending=True)