From 16dff29660114ac56cb0d386b8da1df558689833 Mon Sep 17 00:00:00 2001
From: florian
Date: Sun, 16 Jun 2024 07:43:59 +0200
Subject: [PATCH] Improve README, better logging

---
 .env.template                           |  2 +-
 README.md                               | 23 ++++++++++++++++++-----
 frontend/README.md                      | 12 ------------
 pypi_scout/api/main.py                  |  8 +++++---
 pypi_scout/api/utils.py                 |  2 +-
 pypi_scout/utils/score_calculator.py    | 17 +++++++++--------
 pypi_scout/vector_database/interface.py | 19 +++++++++++++------
 pyproject.toml                          |  2 +-
 8 files changed, 48 insertions(+), 37 deletions(-)

diff --git a/.env.template b/.env.template
index 2c3c5a2..c7b1fc6 100644
--- a/.env.template
+++ b/.env.template
@@ -1 +1 @@
-PINECONE_TOKEN=
+PINECONE_TOKEN=your-api-token
diff --git a/README.md b/README.md
index 22b004c..9f2042b 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,20 @@ PyPI Scout helps you find PyPI packages through natural language prompts with th
 ![Demo](./static/demo.gif)

+## Overview
+
 The project works by collecting project summaries and descriptions for all packages on PyPI with more than 50 weekly downloads. These are then converted into vector representations using [Sentence Transformers](https://www.sbert.net/). When the user enters a query, it is converted into a vector representation, and the most similar package descriptions are fetched from the vector database. Additional weight is given to the amount of weekly downloads before presenting the results to the user in a dashboard.

-## Architecture Overview
+## Table of Contents
+
+1. [Overview](#overview)
+2. [Architecture](#architecture)
+3. [Getting Started](#getting-started)
+   - [Prerequisites](#prerequisites)
+   - [Build and Setup](#build-and-setup)
+4. [Data](#data)
+
+## Architecture

 The project uses the following technologies:
@@ -23,7 +34,11 @@ The project uses the following technologies:

 ### Prerequisites

-1. **Create a `.env` File**
+1. **Set Up Pinecone**
+
+   Since PyPI Scout uses [Pinecone](https://www.pinecone.io/) as the vector database, register for a free account on their website. Obtain your API key using the instructions [here](https://docs.pinecone.io/guides/get-started/quickstart).
+
+2. **Create a `.env` File**

    Copy the `.env.template` to create a new `.env` file:

   ```bash
   cp .env.template .env
   ```

-2. **Set Up Pinecone**
-
-   Since PyPI Scout uses [Pinecone](https://www.pinecone.io/) as the vector database, register for a free account on their website. Obtain your API key using the instructions [here](https://docs.pinecone.io/guides/get-started/quickstart) and add it to your `.env` file.
+   Then add your Pinecone API key from step 1 to this file.

 ### Build and Setup
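The Overview section added above describes the retrieval flow: embed the query with a Sentence Transformers model, fetch the nearest package vectors from Pinecone, then re-weight by weekly downloads. A minimal sketch of that flow for reference; the model name and index name here are illustrative assumptions, not the project's actual configuration:

```python
# Sketch of the query flow described in the README Overview.
# "all-MiniLM-L6-v2" and "pypi-packages" are placeholder choices,
# not necessarily what PyPI Scout itself uses.
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
pc = Pinecone(api_key="your-api-token")
index = pc.Index("pypi-packages")

# Embed the natural-language query and fetch the most similar package vectors.
query_vector = model.encode("dataframe library with lazy evaluation")
matches = index.query(vector=query_vector.tolist(), top_k=25, include_values=False)

for match in matches["matches"]:
    print(match["id"], match["score"])
```

The downloads-based re-weighting happens afterwards, in `calculate_score` (see the `score_calculator.py` hunk further down).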
diff --git a/frontend/README.md b/frontend/README.md
index c403366..c8febeb 100644
--- a/frontend/README.md
+++ b/frontend/README.md
@@ -6,12 +6,6 @@ First, run the development server:

 ```bash
 npm run dev
-# or
-yarn dev
-# or
-pnpm dev
-# or
-bun dev
 ```

 Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
@@ -28,9 +22,3 @@ To learn more about Next.js, take a look at the following resources:
 - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.

 You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js/) - your feedback and contributions are welcome!
-
-## Deploy on Vercel
-
-The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
-
-Check out our [Next.js deployment documentation](https://nextjs.org/docs/deployment) for more details.
diff --git a/pypi_scout/api/main.py b/pypi_scout/api/main.py
index 3b9dc45..b0f72db 100644
--- a/pypi_scout/api/main.py
+++ b/pypi_scout/api/main.py
@@ -13,6 +13,7 @@ from pypi_scout.vector_database import VectorDatabaseInterface

 setup_logging()
+logging.info("Initializing backend...")

 app = FastAPI()

@@ -71,12 +72,13 @@ async def search(query: QueryModel):
     df_matches = vector_database_interface.find_similar(query.query, top_k=query.top_k * 2)
     df_matches = df_matches.join(df, how="left", on="name")

-    logging.info("Found similar projects. Calculating the weighted scores and filtering...")
+    logging.info(
+        f"Fetched the {len(df_matches)} most similar projects. Calculating the weighted scores and filtering..."
+    )
     df_matches = calculate_score(
         df_matches, weight_similarity=config.WEIGHT_SIMILARITY, weight_weekly_downloads=config.WEIGHT_WEEKLY_DOWNLOADS
     )
     df_matches = df_matches.sort("score", descending=True)
     df_matches = df_matches.head(query.top_k)
-
-    logging.info("Returning the results...")
+    logging.info(f"Returning the {len(df_matches)} best matches.")
     return SearchResponse(matches=df_matches.to_dicts())
diff --git a/pypi_scout/api/utils.py b/pypi_scout/api/utils.py
index 413516a..cab3e58 100644
--- a/pypi_scout/api/utils.py
+++ b/pypi_scout/api/utils.py
@@ -7,7 +7,7 @@ def load_dataset(path_to_dataset: Path):
     logging.info("Loading the processed dataset...")
     df = pl.read_csv(path_to_dataset)
-    logging.info(f"Loaded the processed dataset. Number of rows: {len(df):,}")
+    logging.info(f"Finished loading the processed dataset. Number of rows: {len(df):,}")
     logging.info(f"The highest weekly downloads in the dataset: {df['weekly_downloads'].max():,}")
     logging.info(f"The lowest weekly downloads in the dataset: {df['weekly_downloads'].min():,}")
     return df
diff --git a/pypi_scout/utils/score_calculator.py b/pypi_scout/utils/score_calculator.py
index 6918192..f8cf338 100644
--- a/pypi_scout/utils/score_calculator.py
+++ b/pypi_scout/utils/score_calculator.py
@@ -5,16 +5,17 @@ def calculate_score(
     df: pl.DataFrame, weight_similarity: float = 0.5, weight_weekly_downloads: float = 0.5
 ) -> pl.DataFrame:
     """
-    Calculate a combined score based on similarity and weekly downloads.
+    Calculate a combined score for packages based on similarity and weekly downloads.

-    The function ranks the similarity and weekly downloads, normalizes these ranks to a [0, 1] scale,
-    and then computes a combined score based on the provided weights for similarity and weekly downloads.
-    The DataFrame is sorted by the combined score in descending order.
+    This function ranks the entries according to the 'similarity' and 'weekly_downloads' columns, normalizes these
+    ranks to a [0, 1] scale, and computes a combined score using the provided weights for similarity and weekly downloads.
+    The combined score helps in recommending packages that are both popular and relevant based on similarity.

-    Args:
-        df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns.
-        weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5.
-        weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5.
+
+    Args:
+        df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns.
+        weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5.
+        weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5.
     """

     df = df.with_columns(
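The new docstring spells out the scoring scheme: rank the `similarity` and `weekly_downloads` columns, normalize the ranks to a [0, 1] scale, and blend them with the two weights. A self-contained Polars sketch of that scheme on toy data; the exact rank method and normalization inside `calculate_score` may differ:

```python
# Illustrative rank-normalize-blend scoring; assumes an "average" rank
# method, which may not match calculate_score exactly.
import polars as pl

df = pl.DataFrame(
    {
        "name": ["pkg-a", "pkg-b", "pkg-c"],
        "similarity": [0.91, 0.95, 0.83],
        "weekly_downloads": [40_000_000, 1_200, 9_000_000],
    }
)

weight_similarity = 0.5
weight_weekly_downloads = 0.5

df = (
    df.with_columns(
        rank_similarity=pl.col("similarity").rank("average"),
        rank_weekly_downloads=pl.col("weekly_downloads").rank("average"),
    )
    .with_columns(
        # Map ranks from [1, n] onto [0, 1].
        norm_similarity=(pl.col("rank_similarity") - 1) / (pl.len() - 1),
        norm_weekly_downloads=(pl.col("rank_weekly_downloads") - 1) / (pl.len() - 1),
    )
    .with_columns(
        score=weight_similarity * pl.col("norm_similarity")
        + weight_weekly_downloads * pl.col("norm_weekly_downloads")
    )
    .sort("score", descending=True)
)

print(df.select("name", "score"))
```

Here `pkg-b` is the closest semantic match but has almost no downloads, so the very popular and still relevant `pkg-a` ends up with the top combined score, which is the recommendation behavior the docstring describes.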
diff --git a/pypi_scout/vector_database/interface.py b/pypi_scout/vector_database/interface.py
index 31ea294..ea8586d 100644
--- a/pypi_scout/vector_database/interface.py
+++ b/pypi_scout/vector_database/interface.py
@@ -1,3 +1,5 @@
+import logging
+
 import polars as pl
 from pinecone import Pinecone
 from sentence_transformers import SentenceTransformer
@@ -26,6 +28,7 @@ def __init__(
     ):
         self.batch_size = batch_size
         self.model = embeddings_model
+        logging.info("Connecting to Pinecone...")
         pc = Pinecone(api_key=pinecone_token)
         self.index = pc.Index(pinecone_index_name)
         self.pinecone_namespace = pinecone_namespace
@@ -39,7 +42,7 @@ def upsert_polars(self, df: pl.DataFrame, key_column: str, text_column: str):
             key_column (str): The name of the column in the DataFrame containing the unique keys.
             text_column (str): The name of the column in the DataFrame containing the text data.
         """
-        df_chunks = self._split_dataframe_in_batches(df)
+        df_chunks = self._split_dataframe_in_batches(df, batch_size=self.batch_size)
         for chunk in tqdm(df_chunks, desc="Upserting batches", unit="batch"):
             self._upsert_chunk(chunk, key_column, text_column)
@@ -54,19 +57,23 @@ def find_similar(self, query: str, top_k: int = 25) -> pl.DataFrame:
         Returns:
             pl.DataFrame: A Polars DataFrame containing the similar vectors and their similarity scores.
         """
-        embeddings = self.model.encode(query)
+        embeddings = self.model.encode(query, show_progress_bar=False)
         matches = self.index.query(
             namespace=self.pinecone_namespace, vector=embeddings.tolist(), top_k=top_k, include_values=False
         )
         return pl.from_dicts([{"name": x["id"], "similarity": x["score"]} for x in matches["matches"]])

-    def _upsert_chunk(self, chunk: pl.DataFrame, key_column: str, text_column: str):
+    def _upsert_chunk(self, chunk: pl.DataFrame, key_column: str, text_column: str) -> None:
         embeddings = self.model.encode(list(chunk[text_column]), show_progress_bar=False)
         vectors = [
             {"id": project_name, "values": embedding} for project_name, embedding in zip(chunk[key_column], embeddings)
         ]
         self.index.upsert(vectors=vectors, namespace=self.pinecone_namespace, show_progress=False)

-    def _split_dataframe_in_batches(self, df):
-        n_chunks = (df.height + self.batch_size - 1) // self.batch_size
-        return [df.slice(i * self.batch_size, self.batch_size) for i in range(n_chunks)]
+    @staticmethod
+    def _split_dataframe_in_batches(df: pl.DataFrame, batch_size: int) -> list[pl.DataFrame]:
+        """
+        Splits a Polars DataFrame into batches.
+        """
+        n_chunks = (df.height + batch_size - 1) // batch_size
+        return [df.slice(i * batch_size, batch_size) for i in range(n_chunks)]
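One detail worth noting in the new `_split_dataframe_in_batches`: the ceiling division keeps the final partial batch instead of dropping it, and `slice` clamps at the end of the frame. A standalone check of the same arithmetic, with the helper copied out of the class for illustration:

```python
# 10 rows with batch_size=4 should produce chunks of 4, 4, and 2 rows.
import polars as pl


def split_dataframe_in_batches(df: pl.DataFrame, batch_size: int) -> list[pl.DataFrame]:
    # Ceiling division: (height + batch_size - 1) // batch_size == ceil(height / batch_size).
    n_chunks = (df.height + batch_size - 1) // batch_size
    return [df.slice(i * batch_size, batch_size) for i in range(n_chunks)]


df = pl.DataFrame({"name": [f"pkg-{i}" for i in range(10)]})
print([chunk.height for chunk in split_dataframe_in_batches(df, batch_size=4)])  # [4, 4, 2]
```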
authors = ["Florian Maas "] repository = "https://github.com/fpgmaas/pypi-scout" documentation = "https://fpgmaas.github.io/pypi-scout/"