Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
cdelabre committed Oct 11, 2024
2 parents 5b23579 + 61b8713 commit 1f56b34
Show file tree
Hide file tree
Showing 20 changed files with 172 additions and 263 deletions.
19 changes: 19 additions & 0 deletions .github/workflows/ci_gpt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Runs the code-review-gpt action on a pull-request review comment event when
# the comment body is exactly "/gpt" or "\gpt" and the pull request was opened
# by one of the allow-listed users.
name: Code Review GPT

on:
  pull_request_review_comment:

jobs:
  run_code_review:
    runs-on: ubuntu-latest
    # contains() performs a substring test when its first argument is a plain
    # string, so a comma-separated string would also accept fragments such as
    # "gpt" or "pass". Parsing JSON arrays with fromJSON() makes contains()
    # match whole elements only. The "\\" is JSON escaping for one backslash.
    if: >-
      ${{ contains(fromJSON('["\\gpt", "/gpt"]'), github.event.comment.body)
      && contains(fromJSON('["lmontier-pass", "valoumiaou", "dcuesta-pass", "LucileRainteau", "cdelabre"]'), github.event.pull_request.user.login) }}
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Code Review GPT
        uses: mattzcarey/code-review-gpt@v0.1.10
        with:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MODEL: 'gpt-4o'
          GITHUB_TOKEN: ${{ github.token }}
23 changes: 23 additions & 0 deletions .github/workflows/sonar_build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SonarCloud analysis for pushes to main, staging and production, and for
# pull requests that are opened, synchronized or reopened.
name: Build SonarCloud Scan

on:
  push:
    branches: [main, staging, production]
  pull_request:
    types:
      - opened
      - synchronize
      - reopened

jobs:
  sonarcloud:
    name: SonarCloud
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Shallow clones should be disabled for a better relevancy of analysis.
          fetch-depth: 0
      - name: SonarCloud Scan
        # Skip the scan when the run was triggered by dependabot.
        if: ${{ github.actor != 'dependabot[bot]' }}
        uses: SonarSource/sonarcloud-github-action@master
        env:
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
          # Needed to get PR information, if any.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
1 change: 1 addition & 0 deletions apps/fraud/compliance/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
api/src/pcpapillon/local_model
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from main import custom_logger
from pcpapillon.utils.constants import ModelName, ModelType
from pcpapillon.utils.constants import ModelName
from pcpapillon.utils.data_model import ComplianceInput, ComplianceOutput
from pcpapillon.utils.model_handler import ModelHandler, ModelWithMetadata


class ComplianceModel:
MODEL_NAME = ModelName.COMPLIANCE
MODEL_TYPE = ModelType.DEFAULT

def __init__(self):
self.model_handler = ModelHandler()
Expand All @@ -19,7 +18,7 @@ def _load_models(
) -> ModelWithMetadata:
custom_logger.info(f"load {self.MODEL_NAME} model..")
return self.model_handler.get_model_with_metadata_by_name(
model_name=self.MODEL_NAME, model_type=self.MODEL_TYPE
model_name=self.MODEL_NAME
)

def predict(self, data: ComplianceInput) -> ComplianceOutput:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,118 +1,56 @@
import time

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from main import custom_logger
from pcpapillon.utils.constants import (
ModelName,
ModelType,
)
from pcpapillon.utils.data_model import (
OfferCategorisationInput,
OfferCategorisationOutput,
)
from pcpapillon.utils.model_handler import ModelHandler
from sentence_transformers import SentenceTransformer
from pcpapillon.utils.model_handler import ModelHandler, ModelWithMetadata


class OfferCategorisationModel:
LABEL_MAPPING_PATH = "pcpapillon/data/offer_categorisation_label_mapping.parquet" # Will be removed when model predict is updated
PREPROCESSOR_NAME = "sentence-transformers/all-MiniLM-L6-v2" # Will be removed when model predict is updated
MODEL_NAME = ModelName.OFFER_CATEGORISATION
MODEL_TYPE = ModelType.DEFAULT
PREPROC_MODEL_TYPE = ModelType.PREPROCESSING

def __init__(self):
self.model_handler = ModelHandler()
self.model_classifier, self.sementinc_encoder = self._load_models()
self.classes_to_label_mapping = self._load_classes_to_label_mapping(
self.model_classifier.classes_
)
model_data = self._load_models()
self.model = model_data.model

def predict(
self, input: OfferCategorisationInput, num_offers_to_return: int
) -> pd.Series:
preprocessed_input = self._preprocess(input=input)

probabilities = self._classify(
preprocessed_input=preprocessed_input,
)

return self._postprocess(
probabilities=probabilities,
n_top=num_offers_to_return,
)

def _preprocess(self, input: OfferCategorisationInput):
t0 = time.time()

input_series = pd.Series(input.dict()).fillna("unkn")
content = [
"offer_name",
"offer_description",
]
sementic_content = " ".join(input_series[content].astype(str))
custom_logger.debug(f"sementic_content: {sementic_content}")

output_series = pd.Series(
{
"venue_type_label": input.venue_type_label,
"offerer_name": input.offerer_name,
"embedding": self.sementinc_encoder.encode(sementic_content),
}
)

custom_logger.debug(
f"elapsed time for preprocessing the input (LLM embedding extraction) {time.time() - t0}"
self, data: OfferCategorisationInput, num_offers_to_return: int
) -> OfferCategorisationOutput:
"""
Predicts the class labels for the given data using the trained classifier model.
Args:
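data (OfferCategorisationInput): Input data to be predicted.
num_offers_to_return (int): Maximum number of subcategories to return.
Returns:
OfferCategorisationOutput: An object containing the most probable
subcategories with their probabilities, sorted by descending probability.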
"""
predictions = self.model.predict(data.dict())

num_offers_to_return = min(num_offers_to_return, len(predictions.subcategory))
predictions_df = (
pd.DataFrame(
{
"subcategory": predictions.subcategory,
"probability": predictions.probability,
}
)
.sort_values("probability", ascending=False)
.iloc[:num_offers_to_return]
)
return output_series

def _classify(
self,
preprocessed_input: pd.Series,
):
t0 = time.time()
probabilities = self.model_classifier.predict_proba(preprocessed_input)
custom_logger.debug(
f"elapsed time for classification (CatBoost) {time.time() - t0}"
return OfferCategorisationOutput(
most_probable_subcategories=predictions_df.to_dict(orient="records")
)

return probabilities

def _postprocess(
self,
probabilities: pd.Series,
n_top: int,
):
t0 = time.time()

top_indexes = probabilities.argsort()[-n_top:][::-1]
top_categories = self.classes_to_label_mapping.iloc[top_indexes]

custom_logger.debug(f"elapsed time for postprocessing {time.time() - t0}")

return pd.DataFrame(
{
"subcategory": top_categories,
"probability": probabilities[top_indexes],
}
).to_dict(orient="records")

def _load_models(self) -> tuple[CatBoostClassifier, SentenceTransformer]:
def _load_models(self) -> ModelWithMetadata:
custom_logger.info("Load offer categorisation model..")
model_classifier = self.model_handler.get_model_with_metadata_by_name(
model_name=self.MODEL_NAME, model_type=self.MODEL_TYPE
).model

custom_logger.info("Load offer categorisation model preprocessor..")
text_preprocessor = self.model_handler.get_model_with_metadata_by_name(
model_name=self.PREPROCESSOR_NAME,
model_type=self.PREPROC_MODEL_TYPE,
).model

return model_classifier, text_preprocessor

@classmethod
def _load_classes_to_label_mapping(cls, model_classes: np.ndarray) -> pd.Series:
label_mapping = pd.read_parquet(cls.LABEL_MAPPING_PATH)
return label_mapping.iloc[model_classes]["offer_subcategoryId"]
return self.model_handler.get_model_with_metadata_by_name(
model_name=self.MODEL_NAME
)
Binary file not shown.
12 changes: 0 additions & 12 deletions apps/fraud/compliance/api/src/pcpapillon/utils/config_handler.py

This file was deleted.

49 changes: 0 additions & 49 deletions apps/fraud/compliance/api/src/pcpapillon/utils/configs.py

This file was deleted.

30 changes: 2 additions & 28 deletions apps/fraud/compliance/api/src/pcpapillon/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,5 @@ class ModelName(Enum):
Enum class for model names
"""

OFFER_CATEGORISATION = "offer_categorisation"
COMPLIANCE = "compliance"


class ModelType(Enum):
"""
Enum class for model types
"""

DEFAULT = "default"
PREPROCESSING = "custom_sentence_transformer"


class ConfigName(Enum):
"""
Enum class for config names
"""

API = "API"
MODEL = "model"


class APIType(Enum):
"""
Enum class for API types
"""

DEFAULT = "default"
OFFER_CATEGORISATION = "offer_categorization"
COMPLIANCE = "compliance_default"
14 changes: 0 additions & 14 deletions apps/fraud/compliance/api/src/pcpapillon/utils/data_model.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,9 @@
# from __future__ import annotations
from dataclasses import dataclass
from typing import Union

from dataclass_wizard import JSONWizard
from pydantic import BaseModel


@dataclass
class APIConfig(JSONWizard):
features_to_extract_embedding: list[dict]
preprocess_features_type: dict[str]


@dataclass
class ModelConfig(JSONWizard):
pre_trained_model_for_embedding_extraction: dict[str]
catboost_features_types: dict[str]


class User(BaseModel):
username: str
disabled: Union[bool, None] = None
Expand Down
Loading

0 comments on commit 1f56b34

Please sign in to comment.