merge

pass-culture · Oct 11, 2024 · 1f56b34 · 1f56b34
2 parents 5b23579 + 61b8713
commit 1f56b34
Show file tree

Hide file tree

Showing 20 changed files with 172 additions and 263 deletions.
diff --git a/.github/workflows/ci_gpt.yml b/.github/workflows/ci_gpt.yml
@@ -0,0 +1,19 @@
+name: Code Review GPT
+
+on:
+  pull_request_review_comment:
+
+jobs:
+  run_code_review:
+    runs-on: ubuntu-latest
+    if: ${{ contains('\gpt,/gpt', github.event.comment.body) && contains('lmontier-pass,valoumiaou,dcuesta-pass,LucileRainteau,cdelabre', github.event.pull_request.user.login)  }}
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Code Review GPT
+        uses: mattzcarey/code-review-gpt@v0.1.10
+        with:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          MODEL: 'gpt-4o'
+          GITHUB_TOKEN: ${{ github.token }}
diff --git a/.github/workflows/sonar_build.yml b/.github/workflows/sonar_build.yml
@@ -0,0 +1,23 @@
+name: Build SonarCloud Scan
+on:
+  push:
+    branches:
+        - main
+        - staging
+        - production
+  pull_request:
+    types: [opened, synchronize, reopened]
+jobs:
+  sonarcloud:
+    name: SonarCloud
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Shallow clones should be disabled for a better relevancy of analysis
+      - name: SonarCloud Scan
+        uses: SonarSource/sonarcloud-github-action@master
+        if: ${{ github.actor != 'dependabot[bot]' }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # Needed to get PR information, if any
+          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
diff --git a/apps/fraud/compliance/.gitignore b/apps/fraud/compliance/.gitignore
@@ -0,0 +1 @@
+api/src/pcpapillon/local_model
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance_model.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance_model.py
@@ -1,12 +1,11 @@
 from main import custom_logger
-from pcpapillon.utils.constants import ModelName, ModelType
+from pcpapillon.utils.constants import ModelName
 from pcpapillon.utils.data_model import ComplianceInput, ComplianceOutput
 from pcpapillon.utils.model_handler import ModelHandler, ModelWithMetadata
 
 
 class ComplianceModel:
     MODEL_NAME = ModelName.COMPLIANCE
-    MODEL_TYPE = ModelType.DEFAULT
 
     def __init__(self):
         self.model_handler = ModelHandler()
@@ -19,7 +18,7 @@ def _load_models(
     ) -> ModelWithMetadata:
         custom_logger.info(f"load {self.MODEL_NAME} model..")
         return self.model_handler.get_model_with_metadata_by_name(
-            model_name=self.MODEL_NAME, model_type=self.MODEL_TYPE
+            model_name=self.MODEL_NAME
         )
 
     def predict(self, data: ComplianceInput) -> ComplianceOutput:

diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py b/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py
@@ -1,118 +1,56 @@
-import time
-
-import numpy as np
 import pandas as pd
-from catboost import CatBoostClassifier
 from main import custom_logger
 from pcpapillon.utils.constants import (
     ModelName,
-    ModelType,
 )
 from pcpapillon.utils.data_model import (
     OfferCategorisationInput,
+    OfferCategorisationOutput,
 )
-from pcpapillon.utils.model_handler import ModelHandler
-from sentence_transformers import SentenceTransformer
+from pcpapillon.utils.model_handler import ModelHandler, ModelWithMetadata
 
 
 class OfferCategorisationModel:
-    LABEL_MAPPING_PATH = "pcpapillon/data/offer_categorisation_label_mapping.parquet"  # Will be removed when model predict is updated
-    PREPROCESSOR_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Will be removed when model predict is updated
     MODEL_NAME = ModelName.OFFER_CATEGORISATION
-    MODEL_TYPE = ModelType.DEFAULT
-    PREPROC_MODEL_TYPE = ModelType.PREPROCESSING
 
     def __init__(self):
         self.model_handler = ModelHandler()
-        self.model_classifier, self.sementinc_encoder = self._load_models()
-        self.classes_to_label_mapping = self._load_classes_to_label_mapping(
-            self.model_classifier.classes_
-        )
+        model_data = self._load_models()
+        self.model = model_data.model
 
     def predict(
-        self, input: OfferCategorisationInput, num_offers_to_return: int
-    ) -> pd.Series:
-        preprocessed_input = self._preprocess(input=input)
-
-        probabilities = self._classify(
-            preprocessed_input=preprocessed_input,
-        )
-
-        return self._postprocess(
-            probabilities=probabilities,
-            n_top=num_offers_to_return,
-        )
-
-    def _preprocess(self, input: OfferCategorisationInput):
-        t0 = time.time()
-
-        input_series = pd.Series(input.dict()).fillna("unkn")
-        content = [
-            "offer_name",
-            "offer_description",
-        ]
-        sementic_content = " ".join(input_series[content].astype(str))
-        custom_logger.debug(f"sementic_content: {sementic_content}")
-
-        output_series = pd.Series(
-            {
-                "venue_type_label": input.venue_type_label,
-                "offerer_name": input.offerer_name,
-                "embedding": self.sementinc_encoder.encode(sementic_content),
-            }
-        )
-
-        custom_logger.debug(
-            f"elapsed time for preprocessing the input (LLM embedding extraction) {time.time() - t0}"
+        self, data: OfferCategorisationInput, num_offers_to_return: int
+    ) -> OfferCategorisationOutput:
+        """
+        Predicts the class labels for the given data using the trained classifier model.
+
+        Args:
+            data (OfferCategorisationInput): Input data to be predicted.
+
+        Returns:
+            OfferCategorisationOutput: An object containing the most probable
+                subcategories, each paired with its predicted probability.
+        """
+        predictions = self.model.predict(data.dict())
+
+        num_offers_to_return = min(num_offers_to_return, len(predictions.subcategory))
+        predictions_df = (
+            pd.DataFrame(
+                {
+                    "subcategory": predictions.subcategory,
+                    "probability": predictions.probability,
+                }
+            )
+            .sort_values("probability", ascending=False)
+            .iloc[:num_offers_to_return]
         )
-        return output_series
 
-    def _classify(
-        self,
-        preprocessed_input: pd.Series,
-    ):
-        t0 = time.time()
-        probabilities = self.model_classifier.predict_proba(preprocessed_input)
-        custom_logger.debug(
-            f"elapsed time for classification (CatBoost) {time.time() - t0}"
+        return OfferCategorisationOutput(
+            most_probable_subcategories=predictions_df.to_dict(orient="records")
         )
 
-        return probabilities
-
-    def _postprocess(
-        self,
-        probabilities: pd.Series,
-        n_top: int,
-    ):
-        t0 = time.time()
-
-        top_indexes = probabilities.argsort()[-n_top:][::-1]
-        top_categories = self.classes_to_label_mapping.iloc[top_indexes]
-
-        custom_logger.debug(f"elapsed time for postprocessing {time.time() - t0}")
-
-        return pd.DataFrame(
-            {
-                "subcategory": top_categories,
-                "probability": probabilities[top_indexes],
-            }
-        ).to_dict(orient="records")
-
-    def _load_models(self) -> tuple[CatBoostClassifier, SentenceTransformer]:
+    def _load_models(self) -> ModelWithMetadata:
         custom_logger.info("Load offer categorisation model..")
-        model_classifier = self.model_handler.get_model_with_metadata_by_name(
-            model_name=self.MODEL_NAME, model_type=self.MODEL_TYPE
-        ).model
-
-        custom_logger.info("Load offer categorisation model preprocessor..")
-        text_preprocessor = self.model_handler.get_model_with_metadata_by_name(
-            model_name=self.PREPROCESSOR_NAME,
-            model_type=self.PREPROC_MODEL_TYPE,
-        ).model
-
-        return model_classifier, text_preprocessor
-
-    @classmethod
-    def _load_classes_to_label_mapping(cls, model_classes: np.ndarray) -> pd.Series:
-        label_mapping = pd.read_parquet(cls.LABEL_MAPPING_PATH)
-        return label_mapping.iloc[model_classes]["offer_subcategoryId"]
+        return self.model_handler.get_model_with_metadata_by_name(
+            model_name=self.MODEL_NAME
+        )
diff --git a/apps/fraud/compliance/api/src/pcpapillon/data/offer_categorisation_label_mapping.parquet b/apps/fraud/compliance/api/src/pcpapillon/data/offer_categorisation_label_mapping.parquet
diff --git a/apps/fraud/compliance/api/src/pcpapillon/utils/config_handler.py b/apps/fraud/compliance/api/src/pcpapillon/utils/config_handler.py
diff --git a/apps/fraud/compliance/api/src/pcpapillon/utils/configs.py b/apps/fraud/compliance/api/src/pcpapillon/utils/configs.py
diff --git a/apps/fraud/compliance/api/src/pcpapillon/utils/constants.py b/apps/fraud/compliance/api/src/pcpapillon/utils/constants.py
@@ -6,31 +6,5 @@ class ModelName(Enum):
     Enum class for model names
     """
 
-    OFFER_CATEGORISATION = "offer_categorisation"
-    COMPLIANCE = "compliance"
-
-
-class ModelType(Enum):
-    """
-    Enum class for model types
-    """
-
-    DEFAULT = "default"
-    PREPROCESSING = "custom_sentence_transformer"
-
-
-class ConfigName(Enum):
-    """
-    Enum class for config names
-    """
-
-    API = "API"
-    MODEL = "model"
-
-
-class APIType(Enum):
-    """
-    Enum class for API types
-    """
-
-    DEFAULT = "default"
+    OFFER_CATEGORISATION = "offer_categorization"
+    COMPLIANCE = "compliance_default"
diff --git a/apps/fraud/compliance/api/src/pcpapillon/utils/data_model.py b/apps/fraud/compliance/api/src/pcpapillon/utils/data_model.py
@@ -1,23 +1,9 @@
 # from __future__ import annotations
-from dataclasses import dataclass
 from typing import Union
 
-from dataclass_wizard import JSONWizard
 from pydantic import BaseModel
 
 
-@dataclass
-class APIConfig(JSONWizard):
-    features_to_extract_embedding: list[dict]
-    preprocess_features_type: dict[str]
-
-
-@dataclass
-class ModelConfig(JSONWizard):
-    pre_trained_model_for_embedding_extraction: dict[str]
-    catboost_features_types: dict[str]
-
-
 class User(BaseModel):
     username: str
     disabled: Union[bool, None] = None