feat: add nutrient extractor (#1437)

* feat: add nutrition extractor
* chore: add ML integration tests
* chore: add command to download nutrition extractor model
* fix: fix toml-check issue with pyproject.toml
* fix: fix isort issue
openfoodfacts · Oct 24, 2024 · 9ae5ff7 · 9ae5ff7
1 parent f9a60dc
commit 9ae5ff7
Show file tree

Hide file tree

Showing 22 changed files with 1,667 additions and 184 deletions.
diff --git a/Makefile b/Makefile
@@ -3,7 +3,7 @@
 # nice way to have our .env in environment for use in makefile
 # see https://lithic.tech/blog/2020-05/makefile-dot-env
 # Note: this will mask environment variable as opposed to docker-compose priority
-# yet most developper should'nt bump into this
+# yet most developers shouldn't bump into this
 ifneq (,$(wildcard ./.env))
     -include .env
     -include .envrc
@@ -161,6 +161,13 @@ dl-image-clf-models:
 			wget -cO - https://huggingface.co/openfoodfacts/$${asset_name}/resolve/main/weights/best.onnx > $${dir}/model.onnx; \
 	done;
 
+
+dl-nutrition-extractor-model:
+	@echo "⏬ Downloading nutrition extractor model files …"
+	${DOCKER_COMPOSE} run --rm --no-deps api huggingface-cli download openfoodfacts/nutrition-extractor --include 'onnx/*' --local-dir models/triton/nutrition_extractor/1/; \
+	cd models/triton/nutrition_extractor/1/; \
+	mv onnx model.onnx;
+
 init-elasticsearch:
 	@echo "Initializing elasticsearch indices"
 	${DOCKER_COMPOSE} up -d elasticsearch 2>&1
@@ -237,6 +244,14 @@ integration-tests:
 	${DOCKER_COMPOSE_TEST} run --rm worker_1 poetry run pytest -vv --cov-report xml --cov=robotoff --cov-append tests/integration
 	( ${DOCKER_COMPOSE_TEST} down -v || true )
 
+ml-tests: 
+	@echo "🥫 Running ML tests …"
+	${DOCKER_COMPOSE_TEST} up -d triton
+	@echo "Sleeping for 30s, waiting for triton to be ready..."
+	@sleep 30
+	${DOCKER_COMPOSE_TEST} run --rm worker_1 poetry run pytest -vv tests/ml ${args}
+	( ${DOCKER_COMPOSE_TEST} down -v || true )
+
 # interactive testings
 # usage: make pytest args='test/unit/my-test.py --pdb'
 pytest: guard-args

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -83,6 +83,7 @@ duckdb = "~1.0.0"
 google-cloud-storage = "~2.14.0"
 pandas = "~2.2.2"
 pyarrow = "~17.0.0"
+rich = "~13.9.2"  # Used for CLI pretty print
 
 [tool.poetry.dependencies.sentry-sdk]
 version = ">=1.14,<2.9"

diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py
@@ -93,12 +93,19 @@ def create_redis_update(
 
     get_logger()
     client = get_redis_client()
+    flavor_to_product_type = {
+        "off": "food",
+        "obf": "beauty",
+        "opff": "petfood",
+        "opf": "product",
+    }
     event = {
         "code": barcode,
         "flavor": flavor,
         "user_id": user_id,
         "action": action,
         "comment": comment,
+        "product_type": flavor_to_product_type[flavor],
     }
 
     diffs: JSONType
@@ -602,6 +609,45 @@ def run_object_detection_model(
             )
 
 
+@app.command()
+def run_nutrition_extraction(
+    image_url: str = typer.Argument(
+        ..., help="URL of the image to run nutrition extraction on"
+    ),
+    triton_uri: Optional[str] = typer.Option(
+        None,
+        help="URI of the Triton Inference Server to use. If not provided, the default value from settings is used.",
+    ),
+) -> None:
+    """Run nutrition extraction on a product image.
+
+    The image URL should be an Open Food Facts image URL, e.g.
+    https://images.openfoodfacts.org/images/products/327/408/000/5003/3.jpg
+
+    The OCR JSON is expected to be available at the same URL with a `.json`
+    extension, e.g.
+    https://images.openfoodfacts.org/images/products/327/408/000/5003/3.json
+
+    Prediction is printed to stdout.
+    """
+    from typing import cast
+
+    from openfoodfacts.ocr import OCRResult
+    from PIL import Image
+    from rich import print as pprint
+
+    from robotoff.images import get_image_from_url
+    from robotoff.prediction.nutrition_extraction import predict
+
+    image = cast(Image.Image, get_image_from_url(image_url))
+    ocr_result = cast(OCRResult, OCRResult.from_url(image_url.replace(".jpg", ".json")))
+    prediction = predict(image, ocr_result, triton_uri=triton_uri)
+    if prediction is not None:
+        pprint(prediction)
+    else:
+        pprint("No prediction")
+
+
 @app.command()
 def init_elasticsearch() -> None:
     """This command is used for index creation."""

diff --git a/robotoff/insights/annotate.py b/robotoff/insights/annotate.py
@@ -660,24 +660,6 @@ def process_annotation(
         return UPDATED_ANNOTATION_RESULT
 
 
-class NutritionTableStructureAnnotator(InsightAnnotator):
-    @classmethod
-    def process_annotation(
-        cls,
-        insight: ProductInsight,
-        data: Optional[dict] = None,
-        auth: Optional[OFFAuthentication] = None,
-        is_vote: bool = False,
-    ) -> AnnotationResult:
-        insight.data["annotation"] = data
-        insight.save()
-        return SAVED_ANNOTATION_RESULT
-
-    @classmethod
-    def is_data_required(cls) -> bool:
-        return True
-
-
 class IngredientSpellcheckAnnotator(InsightAnnotator):
     @classmethod
     def process_annotation(
@@ -687,7 +669,8 @@ def process_annotation(
         auth: Optional[OFFAuthentication] = None,
         is_vote: bool = False,
     ) -> AnnotationResult:
-        # Possibility for the annotator to change the spellcheck correction if data is provided
+        # Possibility for the annotator to change the spellcheck correction if data is
+        # provided
         if data is not None:
             annotation = data.get("annotation")
             if not annotation or len(data) > 1:
@@ -720,7 +703,6 @@ def process_annotation(
     InsightType.store.name: StoreAnnotator,
     InsightType.packaging.name: PackagingAnnotator,
     InsightType.nutrition_image.name: NutritionImageAnnotator,
-    InsightType.nutrition_table_structure.name: NutritionTableStructureAnnotator,
     InsightType.is_upc_image.name: UPCImageAnnotator,
     InsightType.ingredient_spellcheck.name: IngredientSpellcheckAnnotator,
 }

diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py
@@ -1524,6 +1524,37 @@ def _keep_prediction(
         )
 
 
+class NutrientExtractionImporter(InsightImporter):
+    @staticmethod
+    def get_type() -> InsightType:
+        return InsightType.nutrient_extraction
+
+    @classmethod
+    def get_required_prediction_types(cls) -> set[PredictionType]:
+        return {PredictionType.nutrient_extraction}
+
+    @classmethod
+    def generate_candidates(
+        cls,
+        product: Optional[Product],
+        predictions: list[Prediction],
+        product_id: ProductIdentifier,
+    ) -> Iterator[ProductInsight]:
+        if product is not None and product.nutriments:
+            # Don't generate candidates if the product already has nutrients
+            return
+
+        for prediction in predictions:
+            yield ProductInsight(**prediction.to_dict())
+
+    @classmethod
+    def is_conflicting_insight(
+        cls, candidate: ProductInsight, reference: ProductInsight
+    ) -> bool:
+        # Only one insight per product
+        return True
+
+
 class PackagingElementTaxonomyException(Exception):
     pass
 
@@ -1860,6 +1891,7 @@ def import_product_predictions(
     UPCImageImporter,
     NutritionImageImporter,
     IngredientSpellcheckImporter,
+    NutrientExtractionImporter,
 ]
 
 

diff --git a/robotoff/off.py b/robotoff/off.py
@@ -68,8 +68,16 @@ def get_username(self) -> Optional[str]:
         return None
 
 
-def get_source_from_url(ocr_url: str) -> str:
-    url_path = urlparse(ocr_url).path
+def get_source_from_url(url: str) -> str:
+    """Get the `source_image` field from an image or OCR URL.
+
+    It's the path of the image or OCR JSON file, but without the `/images/products`
+    prefix. It always ends with `.jpg`, whether it's an image or an OCR JSON file.
+
+    :param url: the URL of the image or OCR JSON file
+    :return: the source image path
+    """
+    url_path = urlparse(url).path
 
     if url_path.startswith("/images/products"):
         url_path = url_path[len("/images/products") :]