Skip to content

Commit

Permalink
feat: add nutrient extractor (#1437)
Browse files Browse the repository at this point in the history
* feat: add nutrition extractor

* chore: add ML integration tests

* chore: add command to download nutrition extractor model

* fix: fix toml-check issue with pyproject.toml

* fix: fix isort issue
  • Loading branch information
raphael0202 authored Oct 24, 2024
1 parent f9a60dc commit 9ae5ff7
Show file tree
Hide file tree
Showing 22 changed files with 1,667 additions and 184 deletions.
17 changes: 16 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# nice way to have our .env in environment for use in makefile
# see https://lithic.tech/blog/2020-05/makefile-dot-env
# Note: this will mask environment variable as opposed to docker-compose priority
# yet most developper should'nt bump into this
# yet most developers shouldn't bump into this
ifneq (,$(wildcard ./.env))
-include .env
-include .envrc
Expand Down Expand Up @@ -161,6 +161,13 @@ dl-image-clf-models:
wget -cO - https://huggingface.co/openfoodfacts/$${asset_name}/resolve/main/weights/best.onnx > $${dir}/model.onnx; \
done;


# Download the ONNX files of the nutrition extractor model from Hugging Face
# into the Triton model repository, then rename the downloaded `onnx`
# directory to `model.onnx`.
# The steps are chained with `&&` (not `;`) so that a failed download or a
# failed `cd` stops the recipe instead of running `mv` in the wrong directory.
dl-nutrition-extractor-model:
	@echo "⏬ Downloading nutrition extractor model files …"
	${DOCKER_COMPOSE} run --rm --no-deps api huggingface-cli download openfoodfacts/nutrition-extractor --include 'onnx/*' --local-dir models/triton/nutrition_extractor/1/ && \
	cd models/triton/nutrition_extractor/1/ && \
	mv onnx model.onnx

init-elasticsearch:
@echo "Initializing elasticsearch indices"
${DOCKER_COMPOSE} up -d elasticsearch 2>&1
Expand Down Expand Up @@ -237,6 +244,14 @@ integration-tests:
${DOCKER_COMPOSE_TEST} run --rm worker_1 poetry run pytest -vv --cov-report xml --cov=robotoff --cov-append tests/integration
( ${DOCKER_COMPOSE_TEST} down -v || true )

# Run the ML tests (tests/ml) against a Triton server.
# Starts the `triton` service, waits a fixed 30 seconds for it to come up,
# runs pytest in the `worker_1` container (extra pytest arguments can be
# passed through `args`), then tears the test stack down; a failure of the
# teardown itself is ignored (`|| true`).
ml-tests:
@echo "🥫 Running ML tests …"
${DOCKER_COMPOSE_TEST} up -d triton
@echo "Sleeping for 30s, waiting for triton to be ready..."
@sleep 30
${DOCKER_COMPOSE_TEST} run --rm worker_1 poetry run pytest -vv tests/ml ${args}
( ${DOCKER_COMPOSE_TEST} down -v || true )

# interactive testings
# usage: make pytest args='test/unit/my-test.py --pdb'
pytest: guard-args
Expand Down
387 changes: 235 additions & 152 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ duckdb = "~1.0.0"
google-cloud-storage = "~2.14.0"
pandas = "~2.2.2"
pyarrow = "~17.0.0"
rich = "~13.9.2" # Used for CLI pretty print

[tool.poetry.dependencies.sentry-sdk]
version = ">=1.14,<2.9"
Expand Down
46 changes: 46 additions & 0 deletions robotoff/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,19 @@ def create_redis_update(

get_logger()
client = get_redis_client()
flavor_to_product_type = {
"off": "food",
"obf": "beauty",
"opff": "petfood",
"opf": "product",
}
event = {
"code": barcode,
"flavor": flavor,
"user_id": user_id,
"action": action,
"comment": comment,
"product_type": flavor_to_product_type[flavor],
}

diffs: JSONType
Expand Down Expand Up @@ -602,6 +609,45 @@ def run_object_detection_model(
)


@app.command()
def run_nutrition_extraction(
    image_url: str = typer.Argument(
        ..., help="URL of the image to run nutrition extraction on"
    ),
    triton_uri: Optional[str] = typer.Option(
        None,
        help="URI of the Triton Inference Server to use. If not provided, the default value from settings is used.",
    ),
) -> None:
    """Run nutrition extraction on a product image.

    The image URL should be an Open Food Facts image URL, e.g.
    https://images.openfoodfacts.org/images/products/327/408/000/5003/3.jpg

    The OCR JSON is expected to be available at the same URL with a `.json`
    extension, e.g.
    https://images.openfoodfacts.org/images/products/327/408/000/5003/3.json

    Prediction is printed to stdout.
    """
    from urllib.parse import urlsplit

    from openfoodfacts.ocr import OCRResult
    from rich import print as pprint

    from robotoff.images import get_image_from_url
    from robotoff.prediction.nutrition_extraction import predict

    # Build the OCR URL by swapping only the extension at the end of the URL
    # *path*. A blind `str.replace(".jpg", ".json")` would also rewrite any
    # other ".jpg" occurrence (host name, directory name) and would silently
    # hand the image URL to the OCR loader when no ".jpg" suffix is present.
    image_extension = ".jpg"
    url_parts = urlsplit(image_url)
    if not url_parts.path.endswith(image_extension):
        raise typer.BadParameter(
            f"image URL path must end with '{image_extension}': {image_url}"
        )
    ocr_path = url_parts.path[: -len(image_extension)] + ".json"
    ocr_url = url_parts._replace(path=ocr_path).geturl()

    # NOTE(review): both helpers are presumably typed as returning an optional
    # value (the previous code used `cast` on their results) - confirm. A None
    # result is reported explicitly instead of being passed on to `predict`.
    image = get_image_from_url(image_url)
    if image is None:
        pprint(f"Could not fetch image: {image_url}")
        raise typer.Exit(code=1)

    ocr_result = OCRResult.from_url(ocr_url)
    if ocr_result is None:
        pprint(f"Could not fetch OCR result: {ocr_url}")
        raise typer.Exit(code=1)

    prediction = predict(image, ocr_result, triton_uri=triton_uri)
    if prediction is not None:
        pprint(prediction)
    else:
        pprint("No prediction")


@app.command()
def init_elasticsearch() -> None:
"""This command is used for index creation."""
Expand Down
22 changes: 2 additions & 20 deletions robotoff/insights/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,24 +660,6 @@ def process_annotation(
return UPDATED_ANNOTATION_RESULT


class NutritionTableStructureAnnotator(InsightAnnotator):
    """Annotator that stores the submitted annotation data on the insight."""

    @classmethod
    def is_data_required(cls) -> bool:
        """Tell callers that annotation data must accompany the annotation."""
        return True

    @classmethod
    def process_annotation(
        cls,
        insight: ProductInsight,
        data: Optional[dict] = None,
        auth: Optional[OFFAuthentication] = None,
        is_vote: bool = False,
    ) -> AnnotationResult:
        """Save `data` under the `annotation` key of the insight data.

        :param insight: the insight being annotated
        :param data: the annotation payload, stored as-is
        :param auth: unused by this annotator
        :param is_vote: unused by this annotator
        :return: the "saved" annotation result
        """
        stored = insight.data
        stored["annotation"] = data
        insight.save()
        return SAVED_ANNOTATION_RESULT


class IngredientSpellcheckAnnotator(InsightAnnotator):
@classmethod
def process_annotation(
Expand All @@ -687,7 +669,8 @@ def process_annotation(
auth: Optional[OFFAuthentication] = None,
is_vote: bool = False,
) -> AnnotationResult:
# Possibility for the annotator to change the spellcheck correction if data is provided
# Possibility for the annotator to change the spellcheck correction if data is
# provided
if data is not None:
annotation = data.get("annotation")
if not annotation or len(data) > 1:
Expand Down Expand Up @@ -720,7 +703,6 @@ def process_annotation(
InsightType.store.name: StoreAnnotator,
InsightType.packaging.name: PackagingAnnotator,
InsightType.nutrition_image.name: NutritionImageAnnotator,
InsightType.nutrition_table_structure.name: NutritionTableStructureAnnotator,
InsightType.is_upc_image.name: UPCImageAnnotator,
InsightType.ingredient_spellcheck.name: IngredientSpellcheckAnnotator,
}
Expand Down
32 changes: 32 additions & 0 deletions robotoff/insights/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1524,6 +1524,37 @@ def _keep_prediction(
)


class NutrientExtractionImporter(InsightImporter):
    """Importer that turns `nutrient_extraction` predictions into insights."""

    @staticmethod
    def get_type() -> InsightType:
        """Return the insight type handled by this importer."""
        return InsightType.nutrient_extraction

    @classmethod
    def get_required_prediction_types(cls) -> set[PredictionType]:
        """Return the prediction types this importer consumes."""
        return {PredictionType.nutrient_extraction}

    @classmethod
    def generate_candidates(
        cls,
        product: Optional[Product],
        predictions: list[Prediction],
        product_id: ProductIdentifier,
    ) -> Iterator[ProductInsight]:
        """Yield one candidate insight per prediction.

        Nothing is yielded when the product is known and already has
        nutrients.

        :param product: the product, or None if it is not available
        :param predictions: the predictions to convert into insights
        :param product_id: identifier of the product (unused here)
        """
        if product is None or not product.nutriments:
            yield from (
                ProductInsight(**candidate.to_dict()) for candidate in predictions
            )

    @classmethod
    def is_conflicting_insight(
        cls, candidate: ProductInsight, reference: ProductInsight
    ) -> bool:
        """Always report a conflict: only one insight is kept per product."""
        return True


class PackagingElementTaxonomyException(Exception):
pass

Expand Down Expand Up @@ -1860,6 +1891,7 @@ def import_product_predictions(
UPCImageImporter,
NutritionImageImporter,
IngredientSpellcheckImporter,
NutrientExtractionImporter,
]


Expand Down
12 changes: 10 additions & 2 deletions robotoff/off.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,16 @@ def get_username(self) -> Optional[str]:
return None


def get_source_from_url(ocr_url: str) -> str:
url_path = urlparse(ocr_url).path
def get_source_from_url(url: str) -> str:
"""Get the `source_image` field from an image or OCR URL.
It's the path of the image or OCR JSON file, but without the `/images/products`
prefix. It always ends with `.jpg`, whether it's an image or an OCR JSON file.
:param url: the URL of the image or OCR JSON file
:return: the source image path
"""
url_path = urlparse(url).path

if url_path.startswith("/images/products"):
url_path = url_path[len("/images/products") :]
Expand Down
Loading

0 comments on commit 9ae5ff7

Please sign in to comment.