diff --git a/.github/PULL_REQUEST_TEMPLATE/MEP_template.md b/.github/PULL_REQUEST_TEMPLATE/MEP_template.md
index 06c74c6c..45b2a04a 100644
--- a/.github/PULL_REQUEST_TEMPLATE/MEP_template.md
+++ b/.github/PULL_REQUEST_TEMPLATE/MEP_template.md
@@ -6,4 +6,5 @@ version:
 - [ ] version number
 - [ ] CI/CD tests pass
-- [ ] Add message on data review slack channel
\ No newline at end of file
+- [ ] Add message on data review slack channel
+- [ ] ⚠️ I will click on **Merge Pull Request** and **NOT** on *Squash and merge*
diff --git a/.github/PULL_REQUEST_TEMPLATE/MES_template.md b/.github/PULL_REQUEST_TEMPLATE/MES_template.md
index fe4672c8..c6722748 100644
--- a/.github/PULL_REQUEST_TEMPLATE/MES_template.md
+++ b/.github/PULL_REQUEST_TEMPLATE/MES_template.md
@@ -6,4 +6,5 @@ version:
 - [ ] version number
 - [ ] CI/CD tests pass
-- [ ] Add message on data review slack channel
\ No newline at end of file
+- [ ] Add message on data review slack channel
+- [ ] ⚠️ I will click on **Merge Pull Request** and **NOT** on *Squash and merge*
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index af11a991..b7a2f8f9 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,8 +1,8 @@
 ### Select PR template in preview mode:
+* [Ticket](?expand=1&template=Generic_template.md)
 * [Hotfix](?expand=1&template=Hotfix_template.md)
 * [MES](?expand=1&template=MES_template.md)
 * [MEP](?expand=1&template=MEP_template.md)
-* [Other](?expand=1&template=Generic_template.md)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7efd3a1c..1f22de6a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
       stages: [commit]
     - id: lint-check
       name: Linting (ruff)
-      entry: ruff check
+      entry: ruff check --fix
       language: system
       types: [python]
       stages: [commit]
diff --git a/apps/fraud/compliance/api/Dockerfile b/apps/fraud/compliance/api/Dockerfile
index f438aa2d..2681304f 100644
--- a/apps/fraud/compliance/api/Dockerfile
+++ b/apps/fraud/compliance/api/Dockerfile
@@ -1,15 +1,23 @@
-FROM python:3.9-slim
+FROM python:3.10-slim
+COPY --from=ghcr.io/astral-sh/uv:0.3.4 /uv /bin/uv
 
-ENV PYTHONUNBUFFERED True
-
-ENV APP_HOME /app
+ENV PYTHONUNBUFFERED=True
+ENV APP_HOME=/app
 WORKDIR $APP_HOME
+
 # Install libraries
 COPY ./requirements.txt ./
-RUN pip install --no-cache-dir -r requirements.txt
-COPY ./src/main.py .
+RUN uv pip install -r requirements.txt --system
 COPY ./src .
 
-# Copy local code to the container
-COPY ./src/pcpapillon ./pcpapillon
-CMD exec gunicorn -k uvicorn.workers.UvicornWorker --bind :$PORT --workers 1 --threads 10 --preload --timeout 0 main:app
\ No newline at end of file
+# Conditionally copy the Google Cloud credentials file based on the argument
+ARG LOCAL=false
+ARG GOOGLE_CLOUD_PROJECT
+ENV GOOGLE_CLOUD_PROJECT=${GOOGLE_CLOUD_PROJECT}
+RUN if [ "$LOCAL" = "true" ]; then \
+    mkdir -p /root/.config/gcloud && \
+    cp ./application_default_credentials.json /root/.config/gcloud/application_default_credentials.json; \
+    fi
+
+# Run the container
+CMD exec gunicorn -k uvicorn.workers.UvicornWorker --bind :$PORT --workers 1 --threads 10 --preload --timeout 0 main:app
diff --git a/apps/fraud/compliance/api/Makefile b/apps/fraud/compliance/api/Makefile
index 4a66fd99..1af179fb 100644
--- a/apps/fraud/compliance/api/Makefile
+++ b/apps/fraud/compliance/api/Makefile
@@ -1,5 +1,5 @@
 init:
-	pyenv install 3.9 -s
-	@eval "$$(pyenv init -)" && pyenv virtualenv 3.9 api-data-fraud-compliance && pyenv local api-data-fraud-compliance
+	pyenv install 3.10 -s
+	@eval "$$(pyenv init -)" && pyenv virtualenv 3.10 api-data-fraud-compliance && pyenv local api-data-fraud-compliance
 
 install_with_uv:
diff --git a/apps/fraud/compliance/api/deploy_local.sh b/apps/fraud/compliance/api/deploy_local.sh
old mode 100644
new mode 100755
index 481dcd53..b8a7363a
--- a/apps/fraud/compliance/api/deploy_local.sh
+++ b/apps/fraud/compliance/api/deploy_local.sh
@@ -1,2 +1,7 @@
-docker build -t pcpapillon --progress=plain .
-docker run -it -p 8080:8080 pcpapillon
\ No newline at end of file
+if [ ! -f ~/.config/gcloud/application_default_credentials.json ]; then
+    gcloud auth application-default login
+fi
+cp ~/.config/gcloud/application_default_credentials.json src/application_default_credentials.json
+docker build -t pcpapillon --build-arg LOCAL=true --build-arg GOOGLE_CLOUD_PROJECT=passculture-data-ehp .
+rm src/application_default_credentials.json
+docker run -p 8080:8080 -e "PORT=8080" pcpapillon
diff --git a/apps/fraud/compliance/api/requirements.in b/apps/fraud/compliance/api/requirements.in
index 1d5935f4..fd5ba553 100644
--- a/apps/fraud/compliance/api/requirements.in
+++ b/apps/fraud/compliance/api/requirements.in
@@ -1,4 +1,6 @@
-catboost==1.2
+apscheduler==3.10.4
+catboost==1.2.5
+cloudpickle==3.0.0
 dataclass-wizard==0.22.2
 fastapi-cloud-logging==1.1.0
 fastapi-versioning==0.10.0
@@ -8,14 +10,19 @@ google-cloud-logging==3.5.0
 google-cloud-secret-manager==2.2.0
 google-cloud-storage==2.9.0
 gunicorn==20.0.4
+ipython==8.24.0
 loguru==0.7.0
-mlflow==2.3.1
-numpy==1.22
+matplotlib==3.9.2
+mlflow==2.10.2
+numpy==1.26.4
 oauth2client==4.1.3
-pandas==2.0.1
+pandas==2.2.0
+psutil==5.9.8
 python-jose==3.3.0
 python-multipart==0.0.6
 ruff==0.4.7
+scikit-learn==1.5.0
+scipy==1.13.1
 sentence-transformers==2.2.2
-shap==0.41.0
+shap==0.46.0
 uvicorn==0.22.0
diff --git a/apps/fraud/compliance/api/requirements.txt b/apps/fraud/compliance/api/requirements.txt
index 2c126840..27eaddd2 100644
--- a/apps/fraud/compliance/api/requirements.txt
+++ b/apps/fraud/compliance/api/requirements.txt
@@ -4,12 +4,15 @@ alembic==1.13.2
     # via mlflow
 anyio==4.4.0
     # via starlette
+apscheduler==3.10.4
+    # via -r requirements.in
+asttokens==2.4.1
+    # via stack-data
 blinker==1.8.2
     # via flask
 cachetools==5.4.0
     # via google-auth
-apscheduler==3.10.4
-catboost==1.2
+catboost==1.2.5
     # via -r requirements.in
 certifi==2024.7.4
     # via requests
@@ -17,23 +20,23 @@ charset-normalizer==3.3.2
     # via requests
 click==8.1.7
     # via
-    #   databricks-cli
     #   flask
     #   mlflow
     #   nltk
     #   uvicorn
-cloudpickle==2.2.1
+cloudpickle==3.0.0
     # via
+    #   -r requirements.in
     #   mlflow
     #   shap
 contourpy==1.2.1
     # via matplotlib
 cycler==0.12.1
     # via matplotlib
-databricks-cli==0.18.0
-    # via mlflow
 dataclass-wizard==0.22.2
     # via -r requirements.in
+decorator==5.1.1
+    # via ipython
 docker==6.1.3
     # via mlflow
 ecdsa==0.19.0
@@ -41,7 +44,11 @@ ecdsa==0.19.0
 entrypoints==0.4
     # via mlflow
 exceptiongroup==1.2.2
-    # via anyio
+    # via
+    #   anyio
+    #   ipython
+executing==2.0.1
+    # via stack-data
 fastapi==0.95.1
     # via
     #   -r requirements.in
@@ -152,14 +159,13 @@ idna==3.7
     #   anyio
     #   requests
 importlib-metadata==6.11.0
-    # via
-    #   flask
-    #   markdown
-    #   mlflow
-importlib-resources==6.4.0
-    # via matplotlib
+    # via mlflow
+ipython==8.24.0
+    # via -r requirements.in
 itsdangerous==2.2.0
     # via flask
+jedi==0.19.1
+    # via ipython
 jinja2==3.1.4
     # via
     #   flask
@@ -186,11 +192,14 @@ markupsafe==2.1.5
     #   jinja2
     #   mako
     #   werkzeug
-matplotlib==3.8.4
+matplotlib==3.9.2
     # via
+    #   -r requirements.in
     #   catboost
     #   mlflow
-mlflow==2.3.1
+matplotlib-inline==0.1.7
+    # via ipython
+mlflow==2.10.2
     # via -r requirements.in
 mpmath==1.3.0
     # via sympy
@@ -200,7 +209,7 @@ nltk==3.8.1
     # via sentence-transformers
 numba==0.60.0
     # via shap
-numpy==1.22.0
+numpy==1.26.4
     # via
     #   -r requirements.in
     #   catboost
@@ -249,8 +258,6 @@ nvidia-nvtx-cu12==12.1.105
     # via torch
 oauth2client==4.1.3
     # via -r requirements.in
-oauthlib==3.2.2
-    # via databricks-cli
 packaging==23.2
     # via
     #   docker
@@ -260,18 +267,24 @@ packaging==23.2
     #   plotly
     #   shap
     #   transformers
-pandas==2.0.1
+pandas==2.2.0
     # via
     #   -r requirements.in
     #   catboost
     #   mlflow
     #   shap
+parso==0.8.4
+    # via jedi
+pexpect==4.9.0
+    # via ipython
 pillow==10.4.0
     # via
     #   matplotlib
     #   torchvision
 plotly==5.22.0
     # via catboost
+prompt-toolkit==3.0.47
+    # via ipython
 proto-plus==1.24.0
     # via
     #   google-cloud-appengine-logging
@@ -288,6 +301,12 @@ protobuf==3.20.3
     #   grpcio-status
     #   mlflow
     #   proto-plus
+psutil==5.9.8
+    # via -r requirements.in
+ptyprocess==0.7.0
+    # via pexpect
+pure-eval==0.2.3
+    # via stack-data
 pyarrow==11.0.0
     # via mlflow
 pyasn1==0.6.0
@@ -302,8 +321,8 @@ pyasn1-modules==0.4.0
     #   oauth2client
 pydantic==1.10.17
     # via fastapi
-pyjwt==2.8.0
-    # via databricks-cli
+pygments==2.18.0
+    # via ipython
 pyparsing==3.1.2
     # via
     #   httplib2
@@ -318,6 +337,7 @@ python-multipart==0.0.6
     # via -r requirements.in
 pytz==2023.4
     # via
+    #   apscheduler
     #   mlflow
     #   pandas
 pyyaml==6.0.1
@@ -334,7 +354,6 @@ regex==2024.5.15
     #   transformers
 requests==2.32.3
     # via
-    #   databricks-cli
     #   docker
     #   google-api-core
     #   google-cloud-storage
@@ -350,13 +369,15 @@ ruff==0.4.7
     # via -r requirements.in
 safetensors==0.4.3
     # via transformers
-scikit-learn==1.5.1
+scikit-learn==1.5.0
     # via
+    #   -r requirements.in
     #   mlflow
     #   sentence-transformers
     #   shap
-scipy==1.11.4
+scipy==1.13.1
     # via
+    #   -r requirements.in
     #   catboost
     #   mlflow
     #   scikit-learn
@@ -368,18 +389,19 @@ sentencepiece==0.2.0
     # via sentence-transformers
 setuptools==70.3.0
     # via gunicorn
-shap==0.41.0
+shap==0.46.0
     # via -r requirements.in
 six==1.16.0
     # via
+    #   apscheduler
+    #   asttokens
     #   catboost
-    #   databricks-cli
     #   ecdsa
     #   google-api-python-client
     #   oauth2client
     #   python-dateutil
     #   querystring-parser
-slicer==0.0.7
+slicer==0.0.8
     # via shap
 smmap==5.0.1
     # via gitdb
@@ -391,14 +413,14 @@ sqlalchemy==2.0.31
     #   mlflow
 sqlparse==0.5.1
     # via mlflow
+stack-data==0.6.3
+    # via ipython
 starlette==0.26.1
     # via
     #   fastapi
     #   fastapi-versioning
 sympy==1.13.0
     # via torch
-tabulate==0.9.0
-    # via databricks-cli
 tenacity==8.5.0
     # via plotly
 threadpoolctl==3.5.0
@@ -418,6 +440,10 @@ tqdm==4.66.4
     #   sentence-transformers
     #   shap
     #   transformers
+traitlets==5.14.3
+    # via
+    #   ipython
+    #   matplotlib-inline
 transformers==4.42.4
     # via sentence-transformers
 triton==2.3.1
@@ -426,28 +452,28 @@ typing-extensions==4.12.2
     # via
     #   alembic
     #   anyio
-    #   dataclass-wizard
     #   huggingface-hub
+    #   ipython
     #   pydantic
     #   sqlalchemy
-    #   starlette
     #   torch
 tzdata==2024.1
     # via pandas
+tzlocal==5.2
+    # via apscheduler
 uritemplate==3.0.1
     # via google-api-python-client
 urllib3==2.2.2
     # via
-    #   databricks-cli
     #   docker
     #   requests
 uvicorn==0.22.0
     # via -r requirements.in
+wcwidth==0.2.13
+    # via prompt-toolkit
 websocket-client==1.8.0
     # via docker
 werkzeug==3.0.3
     # via flask
 zipp==3.19.2
-    # via
-    #   importlib-metadata
-    #   importlib-resources
+    # via importlib-metadata
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/__init__.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/compliance_model.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/compliance_model.py
deleted file mode 100644
index 793f8965..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/compliance_model.py
+++ /dev/null
@@ -1,120 +0,0 @@
-from dataclasses import dataclass
-from typing import Union
-
-import mlflow
-from main import custom_logger
-from pcpapillon.core.compliance.predict import get_prediction_and_main_contribution
-from pcpapillon.core.compliance.preprocess import preprocess
-from pcpapillon.utils.config_handler import ConfigHandler
-from pcpapillon.utils.constants import APIType, ModelName, ModelType
-from pcpapillon.utils.env_vars import (
-    IS_API_LOCAL,
-)
-from pcpapillon.utils.model_handler import ModelHandler
-from sentence_transformers import SentenceTransformer
-
-
-@dataclass
-class ModelData:
-    classification_model: Union[mlflow.pyfunc.PythonModel, SentenceTransformer]
-    model_identifier: str
-    preprocessing_models: dict[str, SentenceTransformer]
-
-
-class ComplianceModel:
-    MODEL_NAME = ModelName.COMPLIANCE
-    MODEL_TYPE = ModelType.LOCAL if IS_API_LOCAL else ModelType.DEFAULT
-    PREPROC_MODEL_TYPE = MODEL_TYPE.PREPROCESSING
-
-    def __init__(self):
-        self.api_config = ConfigHandler.get_api_config(APIType.DEFAULT)
-        self.model_config = ConfigHandler.get_model_config(ModelType.DEFAULT)
-        self.model_handler = ModelHandler()
-        model_data = self._load_models()
-        self.classfier_model = model_data.classification_model
-        self.classifier_model_identifier = model_data.model_identifier
-        self.prepoc_models = model_data.preprocessing_models
-
-    def _load_models(
-        self,
-    ) -> ModelData:
-        custom_logger.info("load classification model..")
-        catboost_model_with_metadata = (
-            self.model_handler.get_model_with_metadata_by_name(
-                model_name=self.MODEL_NAME, model_type=self.MODEL_TYPE
-            )
-        )
-
-        custom_logger.info("load preprocessings model..")
-        prepoc_models = {}
-        for (
-            feature_type,
-            sentence_transformer_name,
-        ) in self.model_config.pre_trained_model_for_embedding_extraction.items():
-            prepoc_models[feature_type] = (
-                self.model_handler.get_model_with_metadata_by_name(
-                    model_name=sentence_transformer_name,
-                    model_type=self.PREPROC_MODEL_TYPE,
-                ).model
-            )
-
-        custom_logger.info(
-            f"Preprocessing models for {self.MODEL_NAME} : {prepoc_models}"
-        )
-        return ModelData(
-            classification_model=catboost_model_with_metadata.model,
-            model_identifier=catboost_model_with_metadata.model_identifier,
-            preprocessing_models=prepoc_models,
-        )
-
-    def predict(self, data):
-        """
-        Predicts the class labels for the given data using the trained classifier model.
-
-        Args:
-            api_config (dict): Configuration parameters for the API.
-            model_config (dict): Configuration parameters for the model.
-            data (list): Input data to be predicted.
-
-        Returns:
-            tuple: A tuple containing the predicted class labels and the main contribution.
-                offer validition probability
-                offer rejection probability (=1-proba_val)
-                main features contributing to increase validation probability
-                main features contributing to reduce validation probability
-        """
-
-        # Preprocess the data and the embedder
-        pool, data_w_emb = preprocess(
-            self.api_config, self.model_config, data, self.prepoc_models
-        )
-
-        # Run the prediction
-        return get_prediction_and_main_contribution(
-            self.classfier_model, data_w_emb, pool
-        )
-
-    def _is_newer_model_available(self) -> bool:
-        return (
-            self.classifier_model_identifier
-            != self.model_handler.get_model_hash_from_mlflow(self.MODEL_NAME)
-        )
-
-    def reload_model_if_newer_is_available(self):
-        custom_logger.debug("Checking if newer model is available...")
-        if self._is_newer_model_available():
-            custom_logger.info("New model available: Loading it...")
-            classfier_model_with_metadata = (
-                self.model_handler.get_model_with_metadata_by_name(
-                    model_name=self.MODEL_NAME, model_type=self.MODEL_TYPE
-                )
-            )
-            self.classfier_model, self.classifier_model_identifier = (
-                classfier_model_with_metadata.model,
-                classfier_model_with_metadata.model_identifier,
-            )
-            custom_logger.info(
-                f"...New model loaded with hash {self.classifier_model_identifier}"
-            )
-        else:
-            custom_logger.debug("...No newer model available")
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py
deleted file mode 100644
index 6b6310fb..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import contextlib
-import io
-
-import numpy as np
-import requests
-from PIL import Image
-
-
-def extract_embedding(data, params, prepoc_models):
-    """
-    Extract embedding with pretrained models
-    Two types available:
-    - image :
-        - Input: list of urls
-    - text :
-        - Input: list of string
-    Params template:
-    [
-        {"name": "offer_name", "type": "text"},
-        {"name": "offer_description", "type": "text"},
-        {"name": "image_url", "type": "image"},
-    ]
-    """
-    for feature in params:
-        if feature["type"] == "image":
-            model = prepoc_models[feature["type"]]
-            url = data[feature["name"]]
-            data["image_embedding"] = _encode_img_from_url(model, url)
-            with contextlib.suppress(KeyError):
-                del data[feature["name"]]
-        if feature["type"] == "text":
-            model = prepoc_models[feature["type"]]
-            embedding = model.encode(data[feature["name"]])
-            data[f"""{feature["name"]}_embedding"""] = embedding
-
-    return data
-
-
-def _encode_img_from_url(model, url):
-    """
-    Encode image with pre-trained model from url
-
-    inputs:
-        - model : HugginFaces pre-trained model using Sentence-Transformers
-        - url : string of image url
-    """
-    offer_img_embs = []
-    try:
-        img_emb = model.encode(Image.open(io.BytesIO(requests.get(url).content)))
-        offer_img_embs = img_emb
-    except Exception:
-        offer_img_embs = np.array([0] * 512)
-    return offer_img_embs
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py
deleted file mode 100644
index cb82ff10..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from heapq import nlargest, nsmallest
-
-import shap
-
-
-def get_prediction_and_main_contribution(model, data_w_emb, pool):
-    """
-    Prediction:
-        Predict validation/rejection probability for a given input as catboost pool
-        inputs:
-            - pool: Catboost pool
-                with offer features
-            - model: Catboost custom trained model
-        outputs:
-            proba_val: offer validition probability
-            proba_rej: offer rejection probability (=1-proba_val)
-    Main contribution:
-        Extract prediction main contribution features from shap values
-        inputs:
-            - model: Catboost custom trained model
-            - data: json with offer features
-            - pool: Catboost with offer features
-        outputs:
-            top_val: main features contributing to increase validation probability
-            top_reg: main features contributing to reduce validation probability
-    """
-    proba_predicted = list(
-        model.predict(
-            pool,
-            prediction_type="Probability",
-            ntree_start=0,
-            ntree_end=0,
-            thread_count=1,
-            verbose=None,
-        )[0]
-    )
-    proba_rej = proba_predicted[0] * 100
-    proba_val = proba_predicted[1] * 100
-    top_val, top_rej = _get_prediction_main_contribution(model, data_w_emb, pool)
-    return proba_val, proba_rej, top_val, top_rej
-
-
-def _get_prediction_main_contribution(model, data, pool):
-    explainer = shap.Explainer(model, link=shap.links.logit)
-    shap_values = explainer.shap_values(pool)
-    top_val, top_rej = __get_contribution_from_shap_values(shap_values, data)
-    return top_val, top_rej
-
-
-def __get_contribution_from_shap_values(shap_values, data):
-    topk_validation_factor = []
-    topk_rejection_factor = []
-    data_keys = list(data.keys())
-    # for i in range(len(data)):
-    individual_shap_values = list(shap_values[0, :])
-    klargest = nlargest(3, individual_shap_values)
-    ksmallest = nsmallest(3, individual_shap_values)
-    topk_validation_factor = [
-        data_keys[individual_shap_values.index(max_val)] for max_val in klargest
-    ]
-
-    topk_rejection_factor = [
-        data_keys[individual_shap_values.index(min_val)] for min_val in ksmallest
-    ]
-    return topk_validation_factor, topk_rejection_factor
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py
deleted file mode 100644
index 804abbcc..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import contextlib
-import itertools
-
-from catboost import Pool
-from pcpapillon.core.compliance.extract_embedding import extract_embedding
-
-
-def preprocess(api_config, model_config, data, prepoc_models):
-    """
-    Preprocessing steps:
-        - prepare features
-        - convert json data to catboost pool
-    """
-    data_clean = prepare_features(data, api_config.preprocess_features_type)
-    data_w_emb = extract_embedding(
-        data_clean, api_config.features_to_extract_embedding, prepoc_models
-    )
-
-    scoring_features = list(
-        itertools.chain.from_iterable(model_config.catboost_features_types.values())
-    )
-    data_w_emb_clean = {}
-    for feature in scoring_features:
-        data_w_emb_clean[feature] = data_w_emb[feature]
-    pool = convert_data_to_catboost_pool(
-        data_w_emb_clean, model_config.catboost_features_types
-    )
-    return pool, data_w_emb
-
-
-def prepare_features(data, params):
-    """
-    Prepare features:
-        - Fill integer null values with 0
-        - Fill string null values with "none"
-        - Convert boolean columns to int
-    """
-    with contextlib.suppress(KeyError):
-        del data["offer_id"]
-
-    for key in data:
-        if key in params["text_features"]:
-            data[key] = "" if data[key] is None else str(data[key])
-        if key in params["numerical_features"]:
-            data[key] = 0 if data[key] is None else int(data[key])
-    if "macro_text" in params:
-        semantic_content = " ".join(
-            [semantic_feature.lower() for semantic_feature in params["macro_text"]]
-        )
data["semantic_content"] = semantic_content - return data - - -def convert_data_to_catboost_pool(data, features_type_dict): - """ - Convert json data to catboost pool: - - inputs: - - Features names: list of features name (same order as list of features) - - cat_features: list of categorical features names - - text_features: list of text features names - - embedding_features: list of embedding features names - - output: - - catboost pool - """ - data_input = [list(data.values())] - pool = Pool( - data=data_input, - feature_names=list(data.keys()), - cat_features=features_type_dict["cat_features"], - text_features=features_type_dict["text_features"], - embedding_features=features_type_dict["embedding_features"], - ) - return pool diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance_model.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance_model.py new file mode 100644 index 00000000..40e0e90f --- /dev/null +++ b/apps/fraud/compliance/api/src/pcpapillon/core/compliance_model.py @@ -0,0 +1,63 @@ +from main import custom_logger +from pcpapillon.utils.constants import ModelName, ModelType +from pcpapillon.utils.data_model import ComplianceInput, ComplianceOutput +from pcpapillon.utils.model_handler import ModelHandler, ModelWithMetadata + + +class ComplianceModel: + MODEL_NAME = ModelName.COMPLIANCE + MODEL_TYPE = ModelType.DEFAULT + + def __init__(self): + self.model_handler = ModelHandler() + model_data = self._load_models() + self.model = model_data.model + self.model_identifier = model_data.model_identifier + + def _load_models( + self, + ) -> ModelWithMetadata: + custom_logger.info(f"load {self.MODEL_NAME} model..") + return self.model_handler.get_model_with_metadata_by_name( + model_name=self.MODEL_NAME, model_type=self.MODEL_TYPE + ) + + def predict(self, data: ComplianceInput) -> ComplianceOutput: + """ + Predicts the class labels for the given data using the trained classifier model. + + Args: + data (ComplianceInput): Input data to be predicted. + + Returns: + ComplianceOutput: An object containing the predicted class labels + and the main contributions. 
+ """ + predictions = self.model.predict(data.dict()) + return ComplianceOutput( + offer_id=data.offer_id, + probability_validated=predictions.probability_validated, + validation_main_features=predictions.validation_main_features, + probability_rejected=predictions.probability_rejected, + rejection_main_features=predictions.rejection_main_features, + ) + + def _is_newer_model_available(self) -> bool: + return self.model_identifier != self.model_handler.get_model_hash_from_mlflow( + self.MODEL_NAME + ) + + def reload_model_if_newer_is_available(self): + custom_logger.debug("Checking if newer model is available...") + if self._is_newer_model_available(): + custom_logger.info("New model available: Loading it...") + new_model = self.model_handler.get_model_with_metadata_by_name( + model_name=self.MODEL_NAME, model_type=self.MODEL_TYPE + ) + self.classfier_model, self.model_identifier = ( + new_model.model, + new_model.model_identifier, + ) + custom_logger.info(f"...New model loaded with hash {self.model_identifier}") + else: + custom_logger.debug("...No newer model available") diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation/offer_categorisation_model.py b/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py similarity index 96% rename from apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation/offer_categorisation_model.py rename to apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py index d234da61..1b4ecc1a 100644 --- a/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation/offer_categorisation_model.py +++ b/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py @@ -11,9 +11,6 @@ from pcpapillon.utils.data_model import ( OfferCategorisationInput, ) -from pcpapillon.utils.env_vars import ( - IS_API_LOCAL, -) from pcpapillon.utils.model_handler import ModelHandler from sentence_transformers import SentenceTransformer @@ -22,7 +19,7 @@ class OfferCategorisationModel: LABEL_MAPPING_PATH = "pcpapillon/data/offer_categorisation_label_mapping.parquet" # Will be removed when model predict is updated PREPROCESSOR_NAME = "sentence-transformers/all-MiniLM-L6-v2" # Will be removed when model predict is updated MODEL_NAME = ModelName.OFFER_CATEGORISATION - MODEL_TYPE = ModelType.LOCAL if IS_API_LOCAL else ModelType.DEFAULT + MODEL_TYPE = ModelType.DEFAULT PREPROC_MODEL_TYPE = ModelType.PREPROCESSING def __init__(self): diff --git a/apps/fraud/compliance/api/src/pcpapillon/utils/constants.py b/apps/fraud/compliance/api/src/pcpapillon/utils/constants.py index 3821f879..de2e82fd 100644 --- a/apps/fraud/compliance/api/src/pcpapillon/utils/constants.py +++ b/apps/fraud/compliance/api/src/pcpapillon/utils/constants.py @@ -1,10 +1,5 @@ from enum import Enum -from pcpapillon.utils.env_vars import ( - COMPLIANCE_MODEL_PATH, - OFFER_CATEGORISATION_MODEL_PATH, -) - class ModelName(Enum): """ @@ -20,7 +15,6 @@ class ModelType(Enum): Enum class for model types """ - LOCAL = "local" DEFAULT = "default" PREPROCESSING = "custom_sentence_transformer" @@ -40,9 +34,3 @@ class APIType(Enum): """ DEFAULT = "default" - - -MODEL_PATHS = { - ModelName.COMPLIANCE: COMPLIANCE_MODEL_PATH, - ModelName.OFFER_CATEGORISATION: OFFER_CATEGORISATION_MODEL_PATH, -} diff --git a/apps/fraud/compliance/api/src/pcpapillon/utils/env_vars.py b/apps/fraud/compliance/api/src/pcpapillon/utils/env_vars.py index 1d9a8d28..ab39c63f 100644 --- a/apps/fraud/compliance/api/src/pcpapillon/utils/env_vars.py +++ 
@@ -1,27 +1,26 @@
 import contextvars
 import os
 
-from google.auth.exceptions import DefaultCredentialsError
 from google.cloud import secretmanager
 
 
-def access_secret(project_id, secret_id, version_id="latest", default=None):
-    try:
-        client = secretmanager.SecretManagerServiceClient()
-        name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
-        response = client.access_secret_version(request={"name": name})
-        return response.payload.data.decode("UTF-8")
-    except DefaultCredentialsError:
-        return default
+def access_secret(project_id, secret_id):
+    client = secretmanager.SecretManagerServiceClient()
+    name = f"projects/{project_id}/secrets/{secret_id}/versions/latest"
+    response = client.access_secret_version(request={"name": name})
+    return response.payload.data.decode("UTF-8")
 
 
 # Project vars
 GCS_BUCKET = os.environ.get("GCS_BUCKET", "data-bucket-dev")
 GCP_PROJECT = os.environ.get("GCP_PROJECT", "passculture-data-ehp")
 ENV_SHORT_NAME = os.environ.get("ENV_SHORT_NAME", "dev")
+SA_ACCOUNT = f"algo-training-{ENV_SHORT_NAME}"
+
 # API_LOCAL is string to match terraform boolean handling
 API_LOCAL = os.environ.get("API_LOCAL", False)
 IS_API_LOCAL = API_LOCAL == "True"
+
 # API
 API_SECRET_KET_SECRET_ID = os.environ.get(
     "API_SECRET_KET_SECRET_ID", "api-papillon-auth-secret-key-dev"
@@ -30,15 +29,10 @@ def access_secret(project_id, secret_id, version_id="latest", default=None):
 HASH_ALGORITHM = os.environ.get("VALIDATION_LOGIN_KEY", "HS256")
 LOGIN_TOKEN_EXPIRATION = os.environ.get("LOGIN_TOKEN_EXPIRATION", 30)
 
-if IS_API_LOCAL:
-    API_USER = "user_local"
-    API_PWD = "pwd_local"
-
-else:
-    API_USER_SECRET_ID = os.environ.get("API_USER_SECRET_ID", "api-papillon-user-dev")
-    API_USER = access_secret(GCP_PROJECT, API_USER_SECRET_ID)
-    API_PWD_SECRET_ID = os.environ.get("API_PWD_SECRET_ID", "api-papillon-password-dev")
-    API_PWD = access_secret(GCP_PROJECT, API_PWD_SECRET_ID)
+API_USER_SECRET_ID = os.environ.get("API_USER_SECRET_ID", "api-papillon-user-dev")
+API_PWD_SECRET_ID = os.environ.get("API_PWD_SECRET_ID", "api-papillon-password-dev")
+API_USER = access_secret(GCP_PROJECT, API_USER_SECRET_ID)
+API_PWD = access_secret(GCP_PROJECT, API_PWD_SECRET_ID)
 users_db = {
     API_USER: {
         "username": API_USER,
@@ -46,10 +40,11 @@ def access_secret(project_id, secret_id, version_id="latest", default=None):
         "disabled": False,
     }
 }
-# Configs
+
 # logger
 cloud_trace_context = contextvars.ContextVar("cloud_trace_context", default="")
 http_request_context = contextvars.ContextVar("http_request_context", default={})
+
 # MLFlow
 MLFLOW_SECRET_ID = os.environ.get("MLFLOW_SECRET_ID", "mlflow_client_id")
 MLFLOW_CLIENT_ID = access_secret(GCP_PROJECT, MLFLOW_SECRET_ID)
@@ -59,10 +54,3 @@ def access_secret(project_id, secret_id, version_id="latest", default=None):
 # Model metadata
 MODEL_DEFAULT = os.environ.get("MODEL_DEFAULT", "compliance_model_dev")
 MODEL_STAGE = os.environ.get("MODEL_STAGE", "Production")
-COMPLIANCE_MODEL_PATH = os.environ.get(
-    "COMPLIANCE_MODEL_PATH", "pcpapillon/local_model/compliance_model.cb"
-)
-OFFER_CATEGORISATION_MODEL_PATH = os.environ.get(
-    "OFFER_CATEGORISATION_MODEL_PATH",
-    "pcpapillon/local_model/offer_categorisation_model.cb",
-)
diff --git a/apps/fraud/compliance/api/src/pcpapillon/utils/model_handler.py b/apps/fraud/compliance/api/src/pcpapillon/utils/model_handler.py
index 83856849..324283fc 100644
--- a/apps/fraud/compliance/api/src/pcpapillon/utils/model_handler.py
+++ b/apps/fraud/compliance/api/src/pcpapillon/utils/model_handler.py
@@ -1,18 +1,16 @@
 import hashlib
 import pickle
+import re
 from dataclasses import dataclass
 from typing import Union
 
 import mlflow
 import mlflow.pyfunc
-from catboost import CatBoostClassifier
 from main import custom_logger
 from mlflow import MlflowClient
-from pcpapillon.utils.constants import MODEL_PATHS, ModelName, ModelType
+from pcpapillon.utils.constants import ModelName, ModelType
 from pcpapillon.utils.env_vars import (
     ENV_SHORT_NAME,
-    IS_API_LOCAL,
-    MLFLOW_CLIENT_ID,
 )
 from pcpapillon.utils.tools import connect_remote_mlflow
 from sentence_transformers import SentenceTransformer
@@ -25,38 +23,37 @@ class ModelWithMetadata:
 
 
 class ModelHandler:
-    MODEL_STAGE = "Production"
-
     def __init__(self) -> None:
-        if not IS_API_LOCAL:
-            custom_logger.info("Connecting to mlflow")
-            connect_remote_mlflow(MLFLOW_CLIENT_ID)
-            self.mlflow_client = MlflowClient()
-        else:
-            custom_logger.info("Local API, no mlflow client")
-            self.mlflow_client = None
+        custom_logger.info("Connecting to mlflow")
+        connect_remote_mlflow()
+        self.mlflow_client = MlflowClient()
 
     def get_model_with_metadata_by_name(
-        self, model_name, model_type=ModelType.DEFAULT
+        self, model_name: str, model_type=ModelType.DEFAULT
     ) -> ModelWithMetadata:
-        if model_type == ModelType.DEFAULT:
-            loaded_model = mlflow.catboost.load_model(
-                model_uri=f"models:/{self._get_mlflow_model_name(model_name)}/{self.MODEL_STAGE}"
+        if model_name == ModelName.COMPLIANCE:
+            loaded_model = mlflow.pyfunc.load_model(
+                model_uri=f"models:/{self._get_mlflow_model_name(model_name)}"
             )
             model_hash = self.get_model_hash_from_mlflow(model_name=model_name)
-            return ModelWithMetadata(model=loaded_model, model_identifier=model_hash)
-
-        elif model_type == ModelType.LOCAL:
-            model = CatBoostClassifier(one_hot_max_size=65)
-            loaded_model = model.load_model(MODEL_PATHS[model_name])
-            model_hash = self._get_hash(obj=loaded_model)
-            return ModelWithMetadata(model=loaded_model, model_identifier=model_hash)
-        elif model_type == ModelType.PREPROCESSING:
-            return ModelWithMetadata(
-                model=SentenceTransformer(model_name),
-                model_identifier=model_name,
+        elif model_name == ModelName.OFFER_CATEGORISATION:
+            loaded_model = mlflow.catboost.load_model(
+                model_uri=f"models:/{self._get_mlflow_model_name(model_name)}"
             )
+            model_hash = self.get_model_hash_from_mlflow(model_name=model_name)
+        else:
+            if model_type == ModelType.PREPROCESSING:
+                loaded_model = SentenceTransformer(model_name, device="cpu")
+                model_hash = f"hash_preproc_{model_name}"
+            else:
+                raise ValueError(
+                    f"Model name {model_name} not found with type {model_type}"
+                )
+        return ModelWithMetadata(
+            model=loaded_model,
+            model_identifier=model_hash,
+        )
 
     @staticmethod
     def _get_hash(obj):
@@ -64,20 +61,19 @@ def _get_hash(obj):
         return hashlib.md5(obj_bytes).hexdigest()
 
     def get_model_hash_from_mlflow(self, model_name: str):
+        SPLIT_PATTERN = "/|@"
+
         mlflow_model_name = self._get_mlflow_model_name(model_name=model_name)
-        if not self.mlflow_client:
-            raise ValueError("No mlflow client connected")
+        mlflow_model_name_stripped = re.split(SPLIT_PATTERN, mlflow_model_name)[0]
         model_version = self.mlflow_client.get_latest_versions(
-            mlflow_model_name, stages=[self.MODEL_STAGE]
+            mlflow_model_name_stripped
         )
-
         return self._get_hash(model_version)
 
     @staticmethod
     def _get_mlflow_model_name(model_name: ModelName):
         if model_name == ModelName.COMPLIANCE:
-            return f"{model_name.value}_default_{ENV_SHORT_NAME}"
f"api_{model_name.value}_default_{ENV_SHORT_NAME}@production" elif model_name == ModelName.OFFER_CATEGORISATION: - return f"{model_name.value}_{ENV_SHORT_NAME}" - raise ValueError(f"Only {ModelName.__members__} are registered in mlflow") + return f"{model_name.value}_{ENV_SHORT_NAME}/production" diff --git a/apps/fraud/compliance/api/src/pcpapillon/utils/tools.py b/apps/fraud/compliance/api/src/pcpapillon/utils/tools.py index e743887d..68e0f693 100644 --- a/apps/fraud/compliance/api/src/pcpapillon/utils/tools.py +++ b/apps/fraud/compliance/api/src/pcpapillon/utils/tools.py @@ -1,15 +1,25 @@ +import json import os import mlflow from google.auth.transport.requests import Request -from google.oauth2 import id_token -from pcpapillon.utils.env_vars import MLFLOW_TRACKING_TOKEN, MLFLOW_URL +from google.oauth2 import service_account +from pcpapillon.utils.env_vars import ( + GCP_PROJECT, + MLFLOW_CLIENT_ID, + MLFLOW_URL, + SA_ACCOUNT, + access_secret, +) -def connect_remote_mlflow(client_id): - if not MLFLOW_TRACKING_TOKEN: - os.environ["MLFLOW_TRACKING_TOKEN"] = id_token.fetch_id_token( - Request(), client_id - ) - uri = MLFLOW_URL - mlflow.set_tracking_uri(uri) +def connect_remote_mlflow(): + service_account_dict = json.loads(access_secret(GCP_PROJECT, SA_ACCOUNT)) + + id_token_credentials = service_account.IDTokenCredentials.from_service_account_info( + service_account_dict, target_audience=MLFLOW_CLIENT_ID + ) + id_token_credentials.refresh(Request()) + + os.environ["MLFLOW_TRACKING_TOKEN"] = id_token_credentials.token + mlflow.set_tracking_uri(MLFLOW_URL) diff --git a/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py b/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py index e7085edc..e9018b1c 100644 --- a/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py +++ b/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py @@ -1,14 +1,13 @@ from fastapi import APIRouter, Depends from fastapi_versioning import version from main import custom_logger, setup_trace -from pcpapillon.core.compliance.compliance_model import ( +from pcpapillon.core.compliance_model import ( ComplianceModel, ) from pcpapillon.utils.data_model import ( ComplianceInput, ComplianceOutput, ) -from pcpapillon.utils.env_vars import IS_API_LOCAL from pcpapillon.utils.scheduler import init_scheduler compliance_router = APIRouter(tags=["compliance"]) @@ -16,10 +15,9 @@ # Init model and scheduler compliance_model = ComplianceModel() -if not IS_API_LOCAL: - compliance_scheduler = init_scheduler( - compliance_model.reload_model_if_newer_is_available, time_interval=600 - ) +compliance_scheduler = init_scheduler( + compliance_model.reload_model_if_newer_is_available, time_interval=600 +) @compliance_router.post( @@ -35,19 +33,7 @@ def model_compliance_scoring(scoring_input: ComplianceInput): "scoring_input": scoring_input.dict(), } - ( - proba_validation, - proba_rejection, - top_validation, - top_rejection, - ) = compliance_model.predict(data=scoring_input.dict()) + predictions = compliance_model.predict(data=scoring_input) - validation_response_dict = { - "offer_id": scoring_input.dict()["offer_id"], - "probability_validated": proba_validation, - "validation_main_features": top_validation, - "probability_rejected": proba_rejection, - "rejection_main_features": top_rejection, - } - custom_logger.info(validation_response_dict, extra=log_extra_data) - return validation_response_dict + custom_logger.info(predictions.dict(), extra=log_extra_data) + return predictions diff --git 
diff --git a/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py b/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py
index bc8c6fe0..e05223ac 100644
--- a/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py
+++ b/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, Depends
 from fastapi_versioning import version
 from main import custom_logger, setup_trace
-from pcpapillon.core.offer_categorisation.offer_categorisation_model import (
+from pcpapillon.core.offer_categorisation_model import (
     OfferCategorisationModel,
 )
 from pcpapillon.utils.data_model import (
diff --git a/apps/recommendation/api/Makefile b/apps/recommendation/api/Makefile
index 148b9d2a..df380400 100644
--- a/apps/recommendation/api/Makefile
+++ b/apps/recommendation/api/Makefile
@@ -9,4 +9,4 @@ install:
 	@eval "$$(pyenv init -)" && pyenv activate api-data-reco && pip install -r requirements.txt
 
 start:
-	@eval "$$(pyenv init -)" && cd src && API_LOCAL=1 uvicorn main:app --reload
\ No newline at end of file
+	@eval "$$(pyenv init -)" && cd src && API_LOCAL=1 uvicorn main:app --reload
diff --git a/apps/recommendation/api/requirements.in b/apps/recommendation/api/requirements.in
index e200fb7b..eff28f4e 100644
--- a/apps/recommendation/api/requirements.in
+++ b/apps/recommendation/api/requirements.in
@@ -19,4 +19,4 @@ pytest-asyncio==0.23.7
 pytest-postgresql==6.0.0
 aiocache==0.12.2
 python-dateutil==2.9.0.post0
-httpx==0.27.0
\ No newline at end of file
+httpx==0.27.0