Commit
add better logging
florian committed Jun 15, 2024
1 parent 5dfe166 commit 01203d1
Showing 6 changed files with 64 additions and 30 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -6,7 +6,7 @@ services:
context: .
dockerfile: Dockerfile
working_dir: /
-      command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000 --reload
+      command: uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000
ports:
- "8000:8000"
volumes:
9 changes: 6 additions & 3 deletions pypi_scout/api/main.py
@@ -1,12 +1,12 @@
import logging

import polars as pl
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer

from pypi_scout.api.utils import load_dataset
from pypi_scout.config import Config
from pypi_scout.utils.logging import setup_logging
from pypi_scout.utils.score_calculator import calculate_score
@@ -31,7 +31,8 @@
allow_headers=["*"],
)

- df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+ df = load_dataset(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)

model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)

vector_database_interface = VectorDatabaseInterface(
@@ -71,7 +72,9 @@ async def search(query: QueryModel):
df_matches = df_matches.join(df, how="left", on="name")

logging.info("Found similar projects. Calculating the weighted scores and filtering...")
- df_matches = calculate_score(df_matches)
+ df_matches = calculate_score(
+     df_matches, weight_similarity=config.WEIGHT_SIMILARITY, weight_weekly_downloads=config.WEIGHT_WEEKLY_DOWNLOADS
+ )
df_matches = df_matches.sort("score", descending=True)
df_matches = df_matches.head(query.top_k)

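Note on the new weight arguments: the actual calculate_score implementation lives in pypi_scout.utils.score_calculator and is not part of this diff. Purely as an illustration of how two such weights could combine a similarity signal with weekly downloads, a minimal sketch might look like the following; the "similarity" column name and the rank-based normalization are assumptions, not the project's code:

import polars as pl


def calculate_score_sketch(df: pl.DataFrame, weight_similarity: float, weight_weekly_downloads: float) -> pl.DataFrame:
    # Rank-normalize both signals to [0, 1] so the two weights operate on a comparable scale.
    df = df.with_columns(
        (pl.col("similarity").rank() / pl.len()).alias("similarity_norm"),
        (pl.col("weekly_downloads").rank() / pl.len()).alias("downloads_norm"),
    )
    # The final score is a weighted sum; with the defaults (0.8 / 0.2) similarity dominates.
    return df.with_columns(
        (
            weight_similarity * pl.col("similarity_norm")
            + weight_weekly_downloads * pl.col("downloads_norm")
        ).alias("score")
    )
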
13 changes: 13 additions & 0 deletions pypi_scout/api/utils.py
@@ -0,0 +1,13 @@
import logging
from pathlib import Path

import polars as pl


def load_dataset(path_to_dataset: Path):
logging.info("Loading the processed dataset...")
df = pl.read_csv(path_to_dataset)
logging.info(f"Loaded the processed dataset. Number of rows: {len(df):,}")
logging.info(f"The highest weekly downloads in the dataset: {df['weekly_downloads'].max():,}")
logging.info(f"The lowest weekly downloads in the dataset: {df['weekly_downloads'].min():,}")
return df
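
A quick usage sketch for the new helper; the CSV path below is only an example, and setup_logging is just there to make the logging.info lines visible:

from pathlib import Path

from pypi_scout.api.utils import load_dataset
from pypi_scout.utils.logging import setup_logging

setup_logging()  # configure logging so the info messages are printed
df = load_dataset(Path("data/processed_dataset.csv"))  # example path
print(df.columns)
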
5 changes: 5 additions & 0 deletions pypi_scout/config.py
@@ -21,6 +21,11 @@ class Config:
# Defaults to 0.1, change this to 1.0 to include the entire dataset.
FRAC_DATA_TO_INCLUDE = 0.1

# Weights for the similarity calculation. Higher values for WEIGHT_WEEKLY_DOWNLOADS
# will prioritize displaying packages with higher weekly downloads.
WEIGHT_SIMILARITY = 0.8
WEIGHT_WEEKLY_DOWNLOADS = 0.2

def __post_init__(self) -> None:
if not self.PINECONE_TOKEN:
raise OSError("PINECONE_TOKEN not found in environment variables") # noqa: TRY003
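
For reference, the scripts in this commit construct the config via python-dotenv, so the new weights can be read like any other setting; a minimal sketch, assuming a .env file that provides the PINECONE_TOKEN required by __post_init__:

from dotenv import load_dotenv

from pypi_scout.config import Config

load_dotenv()  # loads PINECONE_TOKEN (and friends) from .env
config = Config()
print(config.WEIGHT_SIMILARITY, config.WEIGHT_WEEKLY_DOWNLOADS)  # 0.8 0.2
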
54 changes: 38 additions & 16 deletions pypi_scout/scripts/process_dataset.py
@@ -9,33 +9,55 @@
from pypi_scout.utils.logging import setup_logging


def process_dataset():
"""
This script processes a dataset by cleaning the description column and saving the processed dataset as a CSV file.
"""

load_dotenv()
config = Config()

processed_dataset_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME

if processed_dataset_path.exists():
logging.info("Processed dataset already exists. Skipping the cleaning process.")
return

def read_raw_dataset(path_to_raw_dataset):
logging.info("Reading the raw dataset...")
df = DataReader(config.DATA_DIR / config.RAW_DATASET_CSV_NAME).read()

df = DataReader(path_to_raw_dataset).read()
logging.info("Number of rows in the raw dataset: %s", len(df))
logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}")
logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}")
return df


def filter_top_packages(df, frac_data_to_include):
logging.info(
f"Using only the packages with weekly_downloads in the top {frac_data_to_include * 100}% of the dataset because config.FRAC_DATA_TO_INCLUDE is set to {frac_data_to_include}!"
)
logging.info(
"This means packages with low download counts are excluded from the results in the dashboard. To include the entire dataset, set config.FRAC_DATA_TO_INCLUDE to 1.0."
)
df = df.sort("weekly_downloads", descending=True)
df = df.head(round(frac_data_to_include * len(df)))

logging.info(f"Number of rows after filtering: {len(df):,}")
logging.info(f"The highest weekly downloads in the filtered dataset: {df['weekly_downloads'].max():,}")
logging.info(f"The lowest weekly downloads in the filtered dataset: {df['weekly_downloads'].min():,}")
return df


def clean_descriptions(df):
logging.info("Cleaning the descriptions...")
df = DescriptionCleaner().clean(df, "description", "description_cleaned")
df = df.filter(~pl.col("description_cleaned").is_null())
df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED)
return df


def store_processed_dataset(df, processed_dataset_path):
logging.info("Storing the processed dataset...")
df.write_csv(processed_dataset_path)
logging.info("Done!")


def process_dataset():
load_dotenv()
config = Config()
df = read_raw_dataset(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
if config.FRAC_DATA_TO_INCLUDE < 1.0:
df = filter_top_packages(df, config.FRAC_DATA_TO_INCLUDE)
df = clean_descriptions(df)
store_processed_dataset(df, config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)


if __name__ == "__main__":
setup_logging()
process_dataset()
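
To make the new filtering helper concrete, here is a small self-contained example; the package names and download counts are made up, and only the weekly_downloads column matters to the function:

import polars as pl

from pypi_scout.scripts.process_dataset import filter_top_packages
from pypi_scout.utils.logging import setup_logging

setup_logging()
df = pl.DataFrame(
    {
        "name": ["pkg-a", "pkg-b", "pkg-c", "pkg-d"],
        "weekly_downloads": [1_000_000, 50_000, 2_000, 10],
    }
)
# Keep the top 50% by weekly downloads -> pkg-a and pkg-b remain.
top_half = filter_top_packages(df, frac_data_to_include=0.5)
print(top_half["name"].to_list())  # ["pkg-a", "pkg-b"]
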
11 changes: 1 addition & 10 deletions pypi_scout/scripts/upsert_data.py
@@ -18,16 +18,7 @@ def upsert_data():

logging.info("Reading the processed dataset...")
df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)

if config.FRAC_DATA_TO_INCLUDE < 1.0:
logging.info(
f"Using only the packages with weekly_downloads in the top {config.FRAC_DATA_TO_INCLUDE * 100}% of the dataset because config.FRAC_DATA_TO_INCLUDE is set to {config.FRAC_DATA_TO_INCLUDE}!"
)
logging.info(
"This can be useful for testing purposes and to quickly get started. To include the entire dataset, set config.FRAC_DATA_TO_INCLUDE to 1.0."
)
df = df.sort("weekly_downloads", descending=True)
df = df.head(round(config.FRAC_DATA_TO_INCLUDE * len(df)))
logging.info("Number of rows in the dataset: %s", len(df))

logging.info("Connecting to the vector database..")
vector_database_interface = VectorDatabaseInterface(