Skip to content

Commit

Permalink
improve search algorithm (#14)
Browse files Browse the repository at this point in the history
  • Loading branch information
fpgmaas authored Jul 3, 2024
1 parent acc9e9b commit e01fbe5
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 14 deletions.
2 changes: 1 addition & 1 deletion pypi_scout/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ async def search(query: QueryModel, request: Request):
raise HTTPException(status_code=400, detail="top_k cannot be larger than 100.")

logging.info(f"Searching for similar projects. Query: '{query.query}'")
- df_matches = vector_database.find_similar(query.query, top_k=query.top_k * 2)
+ df_matches = vector_database.find_similar(query.query, top_k=int(query.top_k * 3))
df_matches = df_matches.join(df_packages, how="left", on="name")
logging.info(
f"Fetched the {len(df_matches)} most similar projects. Calculating the weighted scores and filtering..."
Expand Down
30 changes: 17 additions & 13 deletions pypi_scout/utils/score_calculator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import polars as pl


Expand All @@ -7,30 +8,33 @@ def calculate_score(
"""
Calculate a combined score for packages based on similarity and weekly downloads.
This function ranks the entries according to the 'similarity' and 'weekly_downloads' columns, normalizes these
ranks to a [0, 1] scale, and computes a combined score using the provided weights for similarity and weekly downloads.
The combined score helps in recommending packages that are both popular and relevant based on similarity.
This function normalizes the 'similarity' and 'weekly_downloads' columns to a [0, 1] scale,
and computes a combined score using the provided weights for similarity and weekly downloads.
The combined score helps in recommending packages that are both popular and relevant based on similarity.
Args:
df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns.
weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5.
weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5.
Args:
df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns.
weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5.
weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5.
Returns:
pl.DataFrame: DataFrame with the combined score and sorted by this score in descending order.
"""
print(df.sort("weekly_downloads", descending=True).head(10)["name"])
df = df.with_columns(
- rank_similarity=pl.col("similarity").rank("dense", descending=False),
- rank_weekly_downloads=pl.col("weekly_downloads").rank("dense", descending=False),
+ log_weekly_downloads=pl.col("weekly_downloads").apply(lambda x: np.log1p(x)) # log1p is log(1 + x)
)

df = df.with_columns(
- normalized_similarity=(pl.col("rank_similarity") - 1) / (df["rank_similarity"].max() - 1),
- normalized_weekly_downloads=(pl.col("rank_weekly_downloads") - 1) / (df["rank_weekly_downloads"].max() - 1),
+ normalized_similarity=(pl.col("similarity") - pl.col("similarity").min())
+ / (pl.col("similarity").max() - pl.col("similarity").min()),
+ normalized_log_weekly_downloads=(pl.col("log_weekly_downloads") - pl.col("log_weekly_downloads").min())
+ / (pl.col("log_weekly_downloads").max() - pl.col("log_weekly_downloads").min()),
)

df = df.with_columns(
score=weight_similarity * pl.col("normalized_similarity")
-     + weight_weekly_downloads * pl.col("normalized_weekly_downloads")
+     + weight_weekly_downloads * pl.col("normalized_log_weekly_downloads")
)

df = df.sort("score", descending=True)
Expand Down

0 comments on commit e01fbe5

Please sign in to comment.