From 01203d19df3a0fd5555ce58bfc112015d55f524a Mon Sep 17 00:00:00 2001
From: florian
Date: Sat, 15 Jun 2024 20:06:08 +0200
Subject: [PATCH] add better logging

---
 docker-compose.yml                    |  2 +-
 pypi_scout/api/main.py                |  9 +++--
 pypi_scout/api/utils.py               | 13 +++++++
 pypi_scout/config.py                  |  5 +++
 pypi_scout/scripts/process_dataset.py | 54 +++++++++++++++++++--------
 pypi_scout/scripts/upsert_data.py     | 11 +----
 6 files changed, 64 insertions(+), 30 deletions(-)
 create mode 100644 pypi_scout/api/utils.py

diff --git a/docker-compose.yml b/docker-compose.yml
index 0c4d860..f106db6 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,7 +6,7 @@ services:
       context: .
       dockerfile: Dockerfile
     working_dir: /
-    command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000 --reload
+    command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000
     ports:
       - "8000:8000"
     volumes:
diff --git a/pypi_scout/api/main.py b/pypi_scout/api/main.py
index d4db375..cf90b0d 100644
--- a/pypi_scout/api/main.py
+++ b/pypi_scout/api/main.py
@@ -1,12 +1,12 @@
 import logging
 
-import polars as pl
 from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from sentence_transformers import SentenceTransformer
 
+from pypi_scout.api.utils import load_dataset
 from pypi_scout.config import Config
 from pypi_scout.utils.logging import setup_logging
 from pypi_scout.utils.score_calculator import calculate_score
@@ -31,7 +31,8 @@
     allow_headers=["*"],
 )
 
-df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+df = load_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+
 model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)
 
 vector_database_interface = VectorDatabaseInterface(
@@ -71,7 +72,9 @@ async def search(query: QueryModel):
     df_matches = df_matches.join(df, how="left", on="name")
 
     logging.info("Found similar projects. Calculating the weighted scores and filtering...")
-    df_matches = calculate_score(df_matches)
+    df_matches = calculate_score(
+        df_matches, weight_similarity=config.WEIGHT_SIMILARITY, weight_weekly_downloads=config.WEIGHT_WEEKLY_DOWNLOADS
+    )
     df_matches = df_matches.sort("score", descending=True)
     df_matches = df_matches.head(query.top_k)
 
diff --git a/pypi_scout/api/utils.py b/pypi_scout/api/utils.py
new file mode 100644
index 0000000..413516a
--- /dev/null
+++ b/pypi_scout/api/utils.py
@@ -0,0 +1,13 @@
+import logging
+from pathlib import Path
+
+import polars as pl
+
+
+def load_dataset(path_to_dataset: Path):
+    logging.info("Loading the processed dataset...")
+    df = pl.read_csv(path_to_dataset)
+    logging.info(f"Loaded the processed dataset. Number of rows: {len(df):,}")
+    logging.info(f"The highest weekly downloads in the dataset: {df['weekly_downloads'].max():,}")
+    logging.info(f"The lowest weekly downloads in the dataset: {df['weekly_downloads'].min():,}")
+    return df
diff --git a/pypi_scout/config.py b/pypi_scout/config.py
index 6aec97d..d813a2f 100644
--- a/pypi_scout/config.py
+++ b/pypi_scout/config.py
@@ -21,6 +21,11 @@ class Config:
     # Defaults to 0.1, change this to 1.0 to include the entire dataset.
     FRAC_DATA_TO_INCLUDE = 0.1
 
+    # Weights for the similarity calculation. Higher values for WEIGHT_WEEKLY_DOWNLOADS
+    # will prioritize displaying packages with higher weekly downloads.
+    WEIGHT_SIMILARITY = 0.8
+    WEIGHT_WEEKLY_DOWNLOADS = 0.2
+
     def __post_init__(self) -> None:
         if not self.PINECONE_TOKEN:
             raise OSError("PINECONE_TOKEN not found in environment variables")  # noqa: TRY003
diff --git a/pypi_scout/scripts/process_dataset.py b/pypi_scout/scripts/process_dataset.py
index 6e91af1..e7dc265 100644
--- a/pypi_scout/scripts/process_dataset.py
+++ b/pypi_scout/scripts/process_dataset.py
@@ -9,33 +9,55 @@
 from pypi_scout.utils.logging import setup_logging
 
 
-def process_dataset():
-    """
-    This script processes a dataset by cleaning the description column and saving the processed dataset as a CSV file.
-    """
-
-    load_dotenv()
-    config = Config()
-
-    processed_dataset_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME
-
-    if processed_dataset_path.exists():
-        logging.info("Processed dataset already exists. Skipping the cleaning process.")
-        return
-
+def read_raw_dataset(path_to_raw_dataset):
     logging.info("Reading the raw dataset...")
-    df = DataReader(config.DATA_DIR / config.RAW_DATASET_CSV_NAME).read()
-
+    df = DataReader(path_to_raw_dataset).read()
+    logging.info("Number of rows in the raw dataset: %s", len(df))
+    logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}")
+    logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}")
+    return df
+
+
+def filter_top_packages(df, frac_data_to_include):
+    logging.info(
+        f"Using only the packages with weekly_downloads in the top {frac_data_to_include * 100}% of the dataset because config.FRAC_DATA_TO_INCLUDE is set to {frac_data_to_include}!"
+    )
+    logging.info(
+        "This means packages with low download counts are excluded from the results in the dashboard. To include the entire dataset, set config.FRAC_DATA_TO_INCLUDE to 1.0."
+    )
+    df = df.sort("weekly_downloads", descending=True)
+    df = df.head(round(frac_data_to_include * len(df)))
+
+    logging.info(f"Number of rows after filtering: {len(df):,}")
+    logging.info(f"The highest weekly downloads in the filtered dataset: {df['weekly_downloads'].max():,}")
+    logging.info(f"The lowest weekly downloads in the filtered dataset: {df['weekly_downloads'].min():,}")
+    return df
+
+
+def clean_descriptions(df):
     logging.info("Cleaning the descriptions...")
     df = DescriptionCleaner().clean(df, "description", "description_cleaned")
     df = df.filter(~pl.col("description_cleaned").is_null())
     df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED)
+    return df
+
 
+def store_processed_dataset(df, processed_dataset_path):
     logging.info("Storing the processed dataset...")
     df.write_csv(processed_dataset_path)
     logging.info("Done!")
 
 
+def process_dataset():
+    load_dotenv()
+    config = Config()
+    df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
+    if config.FRAC_DATA_TO_INCLUDE < 1.0:
+        df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
+    df = clean_descriptions(df)
+    store_processed_dataset(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+
+
 if __name__ == "__main__":
     setup_logging()
     process_dataset()
diff --git a/pypi_scout/scripts/upsert_data.py b/pypi_scout/scripts/upsert_data.py
index 9be5e1d..b34780e 100644
--- a/pypi_scout/scripts/upsert_data.py
+++ b/pypi_scout/scripts/upsert_data.py
@@ -18,16 +18,7 @@ def upsert_data():
 
     logging.info("Reading the processed dataset...")
     df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
-
-    if config.FRAC_DATA_TO_INCLUDE < 1.0:
-        logging.info(
-            f"Using only the packages with weekly_downloads in the top {config.FRAC_DATA_TO_INCLUDE * 100}% of the dataset because config.FRAC_DATA_TO_INCLUDE is set to {config.FRAC_DATA_TO_INCLUDE}!"
-        )
-        logging.info(
-            "This can be useful for testing purposes and to quickly get started. To include the entire dataset, set config.FRAC_DATA_TO_INCLUDE to 1.0."
-        )
-        df = df.sort("weekly_downloads", descending=True)
-        df = df.head(round(config.FRAC_DATA_TO_INCLUDE * len(df)))
+    logging.info("Number of rows in the dataset: %s", len(df))
 
     logging.info("Connecting to the vector database..")
     vector_database_interface = VectorDatabaseInterface(
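Reviewer note (not part of the patch): a quick way to exercise the new logging locally is to call the added load_dataset helper directly and inspect the new config weights that the API now forwards to calculate_score. The snippet below is a minimal sketch, assuming the processed CSV already exists at config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME, that PINECONE_TOKEN is available via .env (Config raises otherwise), and that setup_logging() configures INFO-level output.

    # Minimal sketch for trying the new logging helpers locally (assumptions noted above).
    from dotenv import load_dotenv

    from pypi_scout.api.utils import load_dataset
    from pypi_scout.config import Config
    from pypi_scout.utils.logging import setup_logging

    load_dotenv()       # assumes a .env file providing PINECONE_TOKEN
    setup_logging()
    config = Config()

    # Logs row count plus highest/lowest weekly downloads of the processed dataset.
    df = load_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)

    # The new weights that the /search endpoint now passes to calculate_score().
    print(config.WEIGHT_SIMILARITY, config.WEIGHT_WEEKLY_DOWNLOADS)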