improve search algorithm (#14)

fpgmaas · Jul 3, 2024 · e01fbe5 · e01fbe5
1 parent acc9e9b
commit e01fbe5
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 14 deletions.
diff --git a/pypi_scout/api/main.py b/pypi_scout/api/main.py
@@ -55,7 +55,7 @@ async def search(query: QueryModel, request: Request):
         raise HTTPException(status_code=400, detail="top_k cannot be larger than 100.")
 
     logging.info(f"Searching for similar projects. Query: '{query.query}'")
-    df_matches = vector_database.find_similar(query.query, top_k=query.top_k * 2)
+    df_matches = vector_database.find_similar(query.query, top_k=int(query.top_k * 3))
     df_matches = df_matches.join(df_packages, how="left", on="name")
     logging.info(
         f"Fetched the {len(df_matches)} most similar projects. Calculating the weighted scores and filtering..."

diff --git a/pypi_scout/utils/score_calculator.py b/pypi_scout/utils/score_calculator.py
@@ -1,3 +1,4 @@
+import numpy as np
 import polars as pl
 
 
@@ -7,30 +8,33 @@ def calculate_score(
     """
     Calculate a combined score for packages based on similarity and weekly downloads.
 
-     This function ranks the entries according to the 'similarity' and 'weekly_downloads' columns, normalizes these
-     ranks to a [0, 1] scale, and computes a combined score using the provided weights for similarity and weekly downloads.
-     The combined score helps in recommending packages that are both popular and relevant based on similarity.
+    This function normalizes the 'similarity' and 'weekly_downloads' columns to a [0, 1] scale,
+    and computes a combined score using the provided weights for similarity and weekly downloads.
+    The combined score helps in recommending packages that are both popular and relevant based on similarity.
 
+    Args:
+        df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns.
+        weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5.
+        weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5.
 
-     Args:
-         df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns.
-         weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5.
-         weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5.
-
+    Returns:
+        pl.DataFrame: DataFrame with the combined score and sorted by this score in descending order.
     """
+    print(df.sort("weekly_downloads", descending=True).head(10)["name"])
     df = df.with_columns(
-        rank_similarity=pl.col("similarity").rank("dense", descending=False),
-        rank_weekly_downloads=pl.col("weekly_downloads").rank("dense", descending=False),
+        log_weekly_downloads=pl.col("weekly_downloads").apply(lambda x: np.log1p(x))  # log1p is log(1 + x)
     )
 
     df = df.with_columns(
-        normalized_similarity=(pl.col("rank_similarity") - 1) / (df["rank_similarity"].max() - 1),
-        normalized_weekly_downloads=(pl.col("rank_weekly_downloads") - 1) / (df["rank_weekly_downloads"].max() - 1),
+        normalized_similarity=(pl.col("similarity") - pl.col("similarity").min())
+        / (pl.col("similarity").max() - pl.col("similarity").min()),
+        normalized_log_weekly_downloads=(pl.col("log_weekly_downloads") - pl.col("log_weekly_downloads").min())
+        / (pl.col("log_weekly_downloads").max() - pl.col("log_weekly_downloads").min()),
     )
 
     df = df.with_columns(
         score=weight_similarity * pl.col("normalized_similarity")
-        + weight_weekly_downloads * pl.col("normalized_weekly_downloads")
+        + weight_weekly_downloads * pl.col("normalized_log_weekly_downloads")
     )
 
     df = df.sort("score", descending=True)