
Commit

add some emoji to the log messages
florian committed Jun 16, 2024
1 parent c25b977 commit 30b728f
Showing 6 changed files with 19 additions and 17 deletions.
2 changes: 1 addition & 1 deletion pypi_scout/config.py
@@ -22,7 +22,7 @@ class Config:

     # The fraction of the dataset to include in the vector database. Defaults to 0.1,
     # change this to 1.0 to include the entire dataset.
-    FRAC_DATA_TO_INCLUDE = 0.1
+    FRAC_DATA_TO_INCLUDE = 0.25

     # Weights for the similarity calculation. Higher values for WEIGHT_WEEKLY_DOWNLOADS
     # will prioritize displaying packages with higher weekly downloads.
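For context on what this setting controls: the fraction is consumed by filter_top_packages in process_dataset.py (also touched by this commit), which sorts packages by weekly downloads and keeps only the top fraction. A minimal sketch of that behaviour with Polars; the toy DataFrame and package names below are invented for illustration:

import polars as pl

def filter_top_packages(df: pl.DataFrame, frac_data_to_include: float) -> pl.DataFrame:
    # Keep only the most-downloaded fraction of packages, mirroring process_dataset.py.
    df = df.sort("weekly_downloads", descending=True)
    return df.head(round(frac_data_to_include * len(df)))

packages = pl.DataFrame({
    "name": ["popular-pkg", "common-pkg", "niche-pkg", "toy-pkg"],
    "weekly_downloads": [1_000_000, 10_000, 500, 20],
})
print(filter_top_packages(packages, 0.25))  # with FRAC_DATA_TO_INCLUDE = 0.25, keeps only the top row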
6 changes: 3 additions & 3 deletions pypi_scout/scripts/download_dataset.py
@@ -16,13 +16,13 @@ def download_dataset():

     target_path = config.DATA_DIR / config.RAW_DATASET_CSV_NAME
     if target_path.exists():
-        logging.info(f"Raw dataset {target_path} from Google Drive already exists! Skipping download.")
+        logging.info(f"✔️ Raw dataset {target_path} from Google Drive already exists! Skipping download.")
         return

-    logging.info(f"Downloading raw dataset from Google Drive to {target_path}...")
+    logging.info(f"⬇️ Downloading raw dataset from Google Drive to {target_path}...")
     url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
     gdown.download(url, target_path, quiet=False)
-    logging.info("Done!")
+    logging.info("Done!")


 if __name__ == "__main__":
10 changes: 5 additions & 5 deletions pypi_scout/scripts/process_dataset.py
@@ -10,9 +10,9 @@


 def read_raw_dataset(path_to_raw_dataset):
-    logging.info("Reading the raw dataset...")
+    logging.info("📂 Reading the raw dataset...")
     df = DataReader(path_to_raw_dataset).read()
-    logging.info("Number of rows in the raw dataset: %s", len(df))
+    logging.info("📊 Number of rows in the raw dataset: %s", len(df))
     logging.info(f"The highest weekly downloads in the raw dataset: {df['weekly_downloads'].max():,}")
     logging.info(f"The lowest weekly downloads in the raw dataset: {df['weekly_downloads'].min():,}")
     return df
@@ -28,14 +28,14 @@ def filter_top_packages(df, frac_data_to_include):
     df = df.sort("weekly_downloads", descending=True)
     df = df.head(round(frac_data_to_include * len(df)))

-    logging.info(f"Number of rows after filtering: {len(df):,}")
+    logging.info(f"📊 Number of rows after filtering: {len(df):,}")
     logging.info(f"The highest weekly downloads in the filtered dataset: {df['weekly_downloads'].max():,}")
     logging.info(f"The lowest weekly downloads in the filtered dataset: {df['weekly_downloads'].min():,}")
     return df


 def clean_descriptions(df):
-    logging.info("Cleaning the descriptions...")
+    logging.info("🧹 Cleaning the descriptions...")
     df = DescriptionCleaner().clean(df, "description", "description_cleaned")
     df = df.filter(~pl.col("description_cleaned").is_null())
     df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED)
@@ -45,7 +45,7 @@ def store_processed_dataset(df, processed_dataset_path):
 def store_processed_dataset(df, processed_dataset_path):
     logging.info("Storing the processed dataset...")
     df.write_csv(processed_dataset_path)
-    logging.info("Done!")
+    logging.info("Done!")


 def process_dataset():
8 changes: 4 additions & 4 deletions pypi_scout/scripts/setup_pinecone.py
@@ -19,7 +19,7 @@ def setup_pinecone():
     load_dotenv()
     config = Config()

-    logging.info("Connecting to Pinecone..")
+    logging.info("🔗 Connecting to Pinecone..")
     pc = Pinecone(api_key=config.PINECONE_TOKEN)

     try:
@@ -30,12 +30,12 @@
             metric="dotproduct",
             spec=ServerlessSpec(cloud="aws", region="us-east-1"),
         )
-        logging.info("Pinecone index created successfully.")
+        logging.info("Pinecone index created successfully.")
     except PineconeApiException as e:
         if e.status == 409:
-            logging.warning(f"Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
+            logging.warning(f"✔️ Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
         else:
-            logging.exception("An error occurred while creating the Pinecone index.")
+            logging.exception("An error occurred while creating the Pinecone index.")


 if __name__ == "__main__":
8 changes: 5 additions & 3 deletions pypi_scout/scripts/upsert_data.py
@@ -20,20 +20,22 @@ def upsert_data():
     df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
     logging.info("Number of rows in the dataset: %s", len(df))

-    logging.info("Connecting to the vector database..")
+    logging.info("🔗 Connecting to the vector database..")
     vector_database_interface = VectorDatabaseInterface(
         pinecone_token=config.PINECONE_TOKEN,
         pinecone_index_name=config.PINECONE_INDEX_NAME,
         embeddings_model=SentenceTransformer(config.EMBEDDINGS_MODEL_NAME),
         pinecone_namespace=config.PINECONE_NAMESPACE,
     )

-    logging.info("Upserting data into the vector database..")
+    logging.info("⬆️ Upserting data into the vector database..")
+    logging.info("This can take a while...")
+    logging.info("If this really takes too long, consider lowering the value of `FRAC_DATA_TO_INCLUDE` in config.py.")
     df = df.with_columns(
         summary_and_description_cleaned=pl.concat_str(pl.col("summary"), pl.lit(" - "), pl.col("description_cleaned"))
     )
     vector_database_interface.upsert_polars(df, key_column="name", text_column="summary_and_description_cleaned")
-    logging.info("Done!")
+    logging.info("Done!")


 if __name__ == "__main__":
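As a side note on the unchanged concat_str line above: it joins each package's summary and cleaned description with a literal " - " separator before the combined text is embedded and upserted. A tiny standalone Polars example; the column values are invented for illustration:

import polars as pl

df = pl.DataFrame({
    "summary": ["Fast HTTP client"],
    "description_cleaned": ["An asynchronous HTTP client for Python."],
})
df = df.with_columns(
    summary_and_description_cleaned=pl.concat_str(
        pl.col("summary"), pl.lit(" - "), pl.col("description_cleaned")
    )
)
print(df["summary_and_description_cleaned"][0])
# -> "Fast HTTP client - An asynchronous HTTP client for Python."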
2 changes: 1 addition & 1 deletion pypi_scout/vector_database/interface.py
@@ -71,7 +71,7 @@ def _upsert_chunk(self, chunk: pl.DataFrame, key_column: str, text_column: str)
         self.index.upsert(vectors=vectors, namespace=self.pinecone_namespace, show_progress=False)

     @staticmethod
-    def _split_dataframe_in_batches(self, df: pl.DataFrame, batch_size: int) -> pl.DataFrame:
+    def _split_dataframe_in_batches(df: pl.DataFrame, batch_size: int) -> pl.DataFrame:
         """
         Splits a Polars DataFrame into batches.
         """
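Beyond the emoji theme, this last hunk fixes a real signature bug: a method decorated with @staticmethod receives no implicit instance, so the stray self would shift every argument by one when the helper is called as self._split_dataframe_in_batches(df, batch_size). The helper's body is not part of this diff; the sketch below is an assumed implementation using Polars' slice, shown only to illustrate the corrected signature (it returns a list of frames, whereas the diff keeps a -> pl.DataFrame annotation):

import polars as pl

class VectorDatabaseInterface:  # trimmed to the helper relevant here
    @staticmethod
    def _split_dataframe_in_batches(df: pl.DataFrame, batch_size: int) -> list[pl.DataFrame]:
        # Split the DataFrame into consecutive chunks of at most batch_size rows.
        return [df.slice(offset, batch_size) for offset in range(0, len(df), batch_size)]

batches = VectorDatabaseInterface._split_dataframe_in_batches(
    pl.DataFrame({"name": ["a", "b", "c"]}), batch_size=2
)
print([len(batch) for batch in batches])  # [2, 1]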
