-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
20 changed files
with
172 additions
and
263 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Runs the code-review-gpt action when a pull request review comment event
# fires, the comment body is exactly "\gpt" or "/gpt", and the pull request
# author is one of the allow-listed logins.
name: Code Review GPT

on:
  pull_request_review_comment:

jobs:
  run_code_review:
    runs-on: ubuntu-latest
    # contains(<string>, item) is a SUBSTRING test, so a comma-joined string
    # would also accept any fragment of it (e.g. "gpt", "pass", "lmontier").
    # contains(<array>, item) is an exact element match, hence fromJSON.
    # "\\gpt" is the JSON escape for the literal trigger \gpt.
    if: ${{ contains(fromJSON('["\\gpt", "/gpt"]'), github.event.comment.body) && contains(fromJSON('["lmontier-pass", "valoumiaou", "dcuesta-pass", "LucileRainteau", "cdelabre"]'), github.event.pull_request.user.login) }}
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Code Review GPT
        uses: mattzcarey/code-review-gpt@v0.1.10
        with:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL: 'gpt-4o'
          GITHUB_TOKEN: ${{ github.token }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Runs a SonarCloud scan on pushes to main, staging and production, and on
# pull requests that are opened, synchronized or reopened.
name: Build SonarCloud Scan

on:
  push:
    branches:
      - main
      - staging
      - production
  pull_request:
    types:
      - opened
      - synchronize
      - reopened

jobs:
  sonarcloud:
    name: SonarCloud
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Shallow clones are disabled (full history) for better analysis relevancy.
          fetch-depth: 0
      - name: SonarCloud Scan
        # Skipped when the workflow actor is Dependabot.
        if: ${{ github.actor != 'dependabot[bot]' }}
        uses: SonarSource/sonarcloud-github-action@master
        env:
          # Needed to get PR information, if any.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
api/src/pcpapillon/local_model |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
130 changes: 34 additions & 96 deletions
130
apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,118 +1,56 @@ | ||
import time | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from catboost import CatBoostClassifier | ||
from main import custom_logger | ||
from pcpapillon.utils.constants import ( | ||
ModelName, | ||
ModelType, | ||
) | ||
from pcpapillon.utils.data_model import ( | ||
OfferCategorisationInput, | ||
OfferCategorisationOutput, | ||
) | ||
from pcpapillon.utils.model_handler import ModelHandler | ||
from sentence_transformers import SentenceTransformer | ||
from pcpapillon.utils.model_handler import ModelHandler, ModelWithMetadata | ||
|
||
|
||
class OfferCategorisationModel: | ||
LABEL_MAPPING_PATH = "pcpapillon/data/offer_categorisation_label_mapping.parquet" # Will be removed when model predict is updated | ||
PREPROCESSOR_NAME = "sentence-transformers/all-MiniLM-L6-v2" # Will be removed when model predict is updated | ||
MODEL_NAME = ModelName.OFFER_CATEGORISATION | ||
MODEL_TYPE = ModelType.DEFAULT | ||
PREPROC_MODEL_TYPE = ModelType.PREPROCESSING | ||
|
||
def __init__(self): | ||
self.model_handler = ModelHandler() | ||
self.model_classifier, self.sementinc_encoder = self._load_models() | ||
self.classes_to_label_mapping = self._load_classes_to_label_mapping( | ||
self.model_classifier.classes_ | ||
) | ||
model_data = self._load_models() | ||
self.model = model_data.model | ||
|
||
def predict( | ||
self, input: OfferCategorisationInput, num_offers_to_return: int | ||
) -> pd.Series: | ||
preprocessed_input = self._preprocess(input=input) | ||
|
||
probabilities = self._classify( | ||
preprocessed_input=preprocessed_input, | ||
) | ||
|
||
return self._postprocess( | ||
probabilities=probabilities, | ||
n_top=num_offers_to_return, | ||
) | ||
|
||
def _preprocess(self, input: OfferCategorisationInput): | ||
t0 = time.time() | ||
|
||
input_series = pd.Series(input.dict()).fillna("unkn") | ||
content = [ | ||
"offer_name", | ||
"offer_description", | ||
] | ||
sementic_content = " ".join(input_series[content].astype(str)) | ||
custom_logger.debug(f"sementic_content: {sementic_content}") | ||
|
||
output_series = pd.Series( | ||
{ | ||
"venue_type_label": input.venue_type_label, | ||
"offerer_name": input.offerer_name, | ||
"embedding": self.sementinc_encoder.encode(sementic_content), | ||
} | ||
) | ||
|
||
custom_logger.debug( | ||
f"elapsed time for preprocessing the input (LLM embedding extraction) {time.time() - t0}" | ||
self, data: OfferCategorisationInput, num_offers_to_return: int | ||
) -> OfferCategorisationOutput: | ||
""" | ||
Predicts the most probable subcategories for the given offer using the loaded model. | ||
Args: | ||
data (OfferCategorisationInput): Input data to be predicted. | ||
num_offers_to_return (int): Maximum number of subcategories to return. | ||
Returns: | ||
OfferCategorisationOutput: An object containing the most probable | ||
subcategories with their probabilities, sorted by decreasing probability. | ||
""" | ||
predictions = self.model.predict(data.dict()) | ||
|
||
num_offers_to_return = min(num_offers_to_return, len(predictions.subcategory)) | ||
predictions_df = ( | ||
pd.DataFrame( | ||
{ | ||
"subcategory": predictions.subcategory, | ||
"probability": predictions.probability, | ||
} | ||
) | ||
.sort_values("probability", ascending=False) | ||
.iloc[:num_offers_to_return] | ||
) | ||
return output_series | ||
|
||
def _classify( | ||
self, | ||
preprocessed_input: pd.Series, | ||
): | ||
t0 = time.time() | ||
probabilities = self.model_classifier.predict_proba(preprocessed_input) | ||
custom_logger.debug( | ||
f"elapsed time for classification (CatBoost) {time.time() - t0}" | ||
return OfferCategorisationOutput( | ||
most_probable_subcategories=predictions_df.to_dict(orient="records") | ||
) | ||
|
||
return probabilities | ||
|
||
def _postprocess( | ||
self, | ||
probabilities: pd.Series, | ||
n_top: int, | ||
): | ||
t0 = time.time() | ||
|
||
top_indexes = probabilities.argsort()[-n_top:][::-1] | ||
top_categories = self.classes_to_label_mapping.iloc[top_indexes] | ||
|
||
custom_logger.debug(f"elapsed time for postprocessing {time.time() - t0}") | ||
|
||
return pd.DataFrame( | ||
{ | ||
"subcategory": top_categories, | ||
"probability": probabilities[top_indexes], | ||
} | ||
).to_dict(orient="records") | ||
|
||
def _load_models(self) -> tuple[CatBoostClassifier, SentenceTransformer]: | ||
def _load_models(self) -> ModelWithMetadata: | ||
custom_logger.info("Load offer categorisation model..") | ||
model_classifier = self.model_handler.get_model_with_metadata_by_name( | ||
model_name=self.MODEL_NAME, model_type=self.MODEL_TYPE | ||
).model | ||
|
||
custom_logger.info("Load offer categorisation model preprocessor..") | ||
text_preprocessor = self.model_handler.get_model_with_metadata_by_name( | ||
model_name=self.PREPROCESSOR_NAME, | ||
model_type=self.PREPROC_MODEL_TYPE, | ||
).model | ||
|
||
return model_classifier, text_preprocessor | ||
|
||
@classmethod | ||
def _load_classes_to_label_mapping(cls, model_classes: np.ndarray) -> pd.Series: | ||
label_mapping = pd.read_parquet(cls.LABEL_MAPPING_PATH) | ||
return label_mapping.iloc[model_classes]["offer_subcategoryId"] | ||
return self.model_handler.get_model_with_metadata_by_name( | ||
model_name=self.MODEL_NAME | ||
) |
Binary file removed
BIN
-4.34 KB
apps/fraud/compliance/api/src/pcpapillon/data/offer_categorisation_label_mapping.parquet
Binary file not shown.
12 changes: 0 additions & 12 deletions
12
apps/fraud/compliance/api/src/pcpapillon/utils/config_handler.py
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
14 changes: 0 additions & 14 deletions
14
apps/fraud/compliance/api/src/pcpapillon/utils/data_model.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.