From 7e525c194187f5ee0f5a3d5f9d23021ec5322e1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?=
Date: Tue, 29 Oct 2024 15:51:20 +0100
Subject: [PATCH] feat: schedule Hugging Face Parquet dataset push every day

---
 .github/workflows/container-deploy.yml |  4 ++++
 docker-compose.yml                     |  1 +
 robotoff/cli/main.py                   |  2 +-
 robotoff/products.py                   |  6 +++---
 robotoff/scheduler/__init__.py         | 29 ++++++++++++++++++++++++----
 robotoff/settings.py                   |  1 +
 6 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/container-deploy.yml b/.github/workflows/container-deploy.yml
index f031e52461..afc4f105e7 100644
--- a/.github/workflows/container-deploy.yml
+++ b/.github/workflows/container-deploy.yml
@@ -37,6 +37,7 @@ jobs:
           echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.net,static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.net,images.openfoodfacts.org" >> $GITHUB_ENV
           echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=8GB" >> $GITHUB_ENV
           echo "ROBOTOFF_POSTGRES_WORK_MEM=1GB" >> $GITHUB_ENV
+          echo "ENABLE_HF_PUSH=0" >> $GITHUB_ENV
       - name: Set various variable for production deployment
         if: matrix.env == 'robotoff-org'
         run: |
@@ -58,6 +59,7 @@ jobs:
           echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.org" >> $GITHUB_ENV
           echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=16GB" >> $GITHUB_ENV
           echo "ROBOTOFF_POSTGRES_WORK_MEM=2GB" >> $GITHUB_ENV
+          echo "ENABLE_HF_PUSH=1" >> $GITHUB_ENV
       - name: Wait for container build workflow
         uses: tomchv/wait-my-workflow@v1.1.0
         id: wait-build
@@ -182,6 +184,8 @@ jobs:
 
             # Secret key to secure batch job import
             echo "BATCH_JOB_KEY=${{ secrets.BATCH_JOB_KEY }}" >> .env
+            # Enable or disable dataset push to Hugging Face
+            echo "ENABLE_HF_PUSH=${{ env.ENABLE_HF_PUSH }}" >> .env
 
       - name: Create Docker volumes
         uses: appleboy/ssh-action@master
diff --git a/docker-compose.yml b/docker-compose.yml
index 14ece4fd1f..4a6aa657bc 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -59,6 +59,7 @@ x-robotoff-base-env:
   GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
   BATCH_JOB_KEY: # Secure Batch job import with a token key
   HF_TOKEN: # Hugging Face token
+  ENABLE_HF_PUSH: # Enable Hugging Face dataset push (0 or 1, disabled by default)
 
 x-robotoff-worker-base:
   &robotoff-worker
diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py
index 7cabfb4893..05b74906d2 100644
--- a/robotoff/cli/main.py
+++ b/robotoff/cli/main.py
@@ -1205,7 +1205,7 @@ def launch_normalize_barcode_job(
 def push_jsonl_to_hf(
     repo_id: str = "openfoodfacts/product-database",
     revision: str = "main",
-    commit_message: str = "Database updated.",
+    commit_message: str = "Database updated",
     output_path: Optional[str] = None,
 ):
     """Clean and convert the JSONL database before pushing to HF.
diff --git a/robotoff/products.py b/robotoff/products.py
index cc5d6727ce..284e4d8445 100644
--- a/robotoff/products.py
+++ b/robotoff/products.py
@@ -601,9 +601,9 @@ def convert_jsonl_to_parquet(
 
 def push_data_to_hf(
     data_path: str,
-    repo_id: str,
-    revision: str,
-    commit_message: str,
+    repo_id: str = "openfoodfacts/product-database",
+    revision: str = "main",
+    commit_message: str = "Database updated",
 ) -> None:
     logger.info(f"Start pushing data to Hugging Face at {repo_id}")
     if not os.path.exists(data_path):
diff --git a/robotoff/scheduler/__init__.py b/robotoff/scheduler/__init__.py
index 2d66481e15..e007313bd7 100644
--- a/robotoff/scheduler/__init__.py
+++ b/robotoff/scheduler/__init__.py
@@ -1,5 +1,6 @@
 import datetime
 import os
+import tempfile
 import uuid
 from typing import Iterable
 
@@ -23,9 +24,11 @@ from robotoff.models import Prediction, ProductInsight, db
 from robotoff.products import (
     Product,
+    convert_jsonl_to_parquet,
     fetch_dataset,
     get_min_product_store,
     has_dataset_changed,
+    push_data_to_hf,
 )
 from robotoff.types import InsightType, ServerType
 from robotoff.utils import get_logger
 
@@ -290,14 +293,34 @@ def update_insight_attributes(product: Product, insight: ProductInsight) -> bool
 
 
 # this job does no use database
-def _update_data():
-    """Refreshes the PO product dump data."""
+def _update_data() -> None:
+    """Download the latest version of the Product Opener product JSONL dump,
+    convert it to Parquet format and push it to the Hugging Face Hub.
+
+    Conversion to Parquet is only performed if the envvar ENABLE_HF_PUSH is
+    set to 1.
+    """
     logger.info("Downloading new version of product dataset")
+    ds_changed = False
     try:
-        if has_dataset_changed():
+        if ds_changed := has_dataset_changed():
             fetch_dataset()
     except requests.exceptions.RequestException:
         logger.exception("Exception during product dataset refresh")
+        return
+
+    if not settings.ENABLE_HF_PUSH:
+        logger.info("HF push is disabled, skipping Parquet conversion")
+        return
+
+    if ds_changed:
+        logger.info("Starting conversion of JSONL to Parquet (to HF)")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            file_path = os.path.join(tmp_dir, "converted_data.parquet")
+            convert_jsonl_to_parquet(output_file_path=file_path)
+            push_data_to_hf(data_path=file_path)
+    else:
+        logger.info("No changes in product dataset, skipping Parquet conversion")
 
 
 def transform_insight_iter(insights_iter: Iterable[dict]):
diff --git a/robotoff/settings.py b/robotoff/settings.py
index fe38fee1dd..9840f5321e 100644
--- a/robotoff/settings.py
+++ b/robotoff/settings.py
@@ -363,3 +363,4 @@ def get_package_version() -> str:
 
 # SQL queries paths
 JSONL_TO_PARQUET_SQL_QUERY = PROJECT_DIR / "robotoff/utils/sql/jsonl_to_parquet.sql"
+ENABLE_HF_PUSH = bool(int(os.environ.get("ENABLE_HF_PUSH", 0)))
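
Note on the new setting: bool(int(os.environ.get("ENABLE_HF_PUSH", 0))) only
accepts integer strings, so a value such as "true" would raise ValueError when
settings.py is imported; this is why container-deploy.yml writes a literal 0
(staging) or 1 (production) into each .env file. A minimal sketch of the
parsing behaviour, with parse_enable_hf_push as an illustrative helper that is
not part of the patch:

    from typing import Optional

    def parse_enable_hf_push(raw: Optional[str]) -> bool:
        # Mirrors the settings.py one-liner: an unset variable defaults to 0
        # (disabled); int() raises ValueError on non-integer strings.
        return bool(int(raw if raw is not None else 0))

    assert parse_enable_hf_push(None) is False  # envvar unset
    assert parse_enable_hf_push("0") is False   # staging (robotoff-net)
    assert parse_enable_hf_push("1") is True    # production (robotoff-org)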
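
For review purposes, the new _update_data flow can be reproduced standalone as
below. The four helper stubs are hypothetical placeholders for the real
functions in robotoff/products.py, and the RequestException guard around the
download is elided:

    import os
    import tempfile

    ENABLE_HF_PUSH = bool(int(os.environ.get("ENABLE_HF_PUSH", 0)))

    # Hypothetical stubs; the real implementations live in robotoff/products.py.
    def has_dataset_changed() -> bool:
        return True

    def fetch_dataset() -> None:
        print("downloading the Product Opener JSONL dump")

    def convert_jsonl_to_parquet(output_file_path: str) -> None:
        print(f"writing {output_file_path}")

    def push_data_to_hf(data_path: str) -> None:
        print(f"pushing {data_path} to openfoodfacts/product-database")

    def update_data() -> None:
        # Refresh the local dump only when Product Opener published a new one.
        if ds_changed := has_dataset_changed():
            fetch_dataset()
        # The Parquet push is opt-in per deployment: staging exports
        # ENABLE_HF_PUSH=0, production exports ENABLE_HF_PUSH=1.
        if not ENABLE_HF_PUSH:
            return
        if ds_changed:
            # The Parquet file is only needed for the duration of the upload,
            # hence the throwaway temporary directory.
            with tempfile.TemporaryDirectory() as tmp_dir:
                file_path = os.path.join(tmp_dir, "converted_data.parquet")
                convert_jsonl_to_parquet(output_file_path=file_path)
                push_data_to_hf(data_path=file_path)

    update_data()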