From ebf5701091fea52d7b6f998d1e3472654ed5e5a4 Mon Sep 17 00:00:00 2001
From: LaurentM Pass
Date: Wed, 28 Aug 2024 11:11:21 +0200
Subject: [PATCH] refactor: Remove compliance prediction files and move others

---
 .pre-commit-config.yaml                        |  2 +-
 .../pcpapillon/core/compliance/__init__.py     |  0
 .../core/compliance/extract_embedding.py       | 53 --------------
 .../src/pcpapillon/core/compliance/predict.py  | 64 ----------------
 .../pcpapillon/core/compliance/preprocess.py   | 73 -------------------
 .../core/{compliance => }/compliance_model.py  |  0
 .../offer_categorisation_model.py              |  0
 .../api/src/pcpapillon/views/compliance.py     |  2 +-
 .../pcpapillon/views/offer_categorisation.py   |  2 +-
 9 files changed, 3 insertions(+), 193 deletions(-)
 delete mode 100644 apps/fraud/compliance/api/src/pcpapillon/core/compliance/__init__.py
 delete mode 100644 apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py
 delete mode 100644 apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py
 delete mode 100644 apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py
 rename apps/fraud/compliance/api/src/pcpapillon/core/{compliance => }/compliance_model.py (100%)
 rename apps/fraud/compliance/api/src/pcpapillon/core/{offer_categorisation => }/offer_categorisation_model.py (100%)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7efd3a1c..1f22de6a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
         stages: [commit]
       - id: lint-check
         name: Linting (ruff)
-        entry: ruff check
+        entry: ruff check --fix
         language: system
         types: [python]
         stages: [commit]
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/__init__.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py
deleted file mode 100644
index 6b6310fb..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import contextlib
-import io
-
-import numpy as np
-import requests
-from PIL import Image
-
-
-def extract_embedding(data, params, prepoc_models):
-    """
-    Extract embedding with pretrained models
-    Two types available:
-    - image :
-        - Input: list of urls
-    - text :
-        - Input: list of string
-    Params template:
-    [
-        {"name": "offer_name", "type": "text"},
-        {"name": "offer_description", "type": "text"},
-        {"name": "image_url", "type": "image"},
-    ]
-    """
-    for feature in params:
-        if feature["type"] == "image":
-            model = prepoc_models[feature["type"]]
-            url = data[feature["name"]]
-            data["image_embedding"] = _encode_img_from_url(model, url)
-            with contextlib.suppress(KeyError):
-                del data[feature["name"]]
-        if feature["type"] == "text":
-            model = prepoc_models[feature["type"]]
-            embedding = model.encode(data[feature["name"]])
-            data[f"""{feature["name"]}_embedding"""] = embedding
-
-    return data
-
-
-def _encode_img_from_url(model, url):
-    """
-    Encode image with pre-trained model from url
-
-    inputs:
-        - model : HugginFaces pre-trained model using Sentence-Transformers
-        - url : string of image url
-    """
-    offer_img_embs = []
-    try:
-        img_emb = model.encode(Image.open(io.BytesIO(requests.get(url).content)))
-        offer_img_embs = img_emb
-    except Exception:
-        offer_img_embs = np.array([0] * 512)
-    return offer_img_embs
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py
deleted file mode 100644
index cb82ff10..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from heapq import nlargest, nsmallest
-
-import shap
-
-
-def get_prediction_and_main_contribution(model, data_w_emb, pool):
-    """
-    Prediction:
-        Predict validation/rejection probability for a given input as catboost pool
-        inputs:
-            - pool: Catboost pool with offer features
-            - model: Catboost custom trained model
-        outputs:
-            proba_val: offer validition probability
-            proba_rej: offer rejection probability (=1-proba_val)
-    Main contribution:
-        Extract prediction main contribution features from shap values
-        inputs:
-            - model: Catboost custom trained model
-            - data: json with offer features
-            - pool: Catboost with offer features
-        outputs:
-            top_val: main features contributing to increase validation probability
-            top_reg: main features contributing to reduce validation probability
-    """
-    proba_predicted = list(
-        model.predict(
-            pool,
-            prediction_type="Probability",
-            ntree_start=0,
-            ntree_end=0,
-            thread_count=1,
-            verbose=None,
-        )[0]
-    )
-    proba_rej = proba_predicted[0] * 100
-    proba_val = proba_predicted[1] * 100
-    top_val, top_rej = _get_prediction_main_contribution(model, data_w_emb, pool)
-    return proba_val, proba_rej, top_val, top_rej
-
-
-def _get_prediction_main_contribution(model, data, pool):
-    explainer = shap.Explainer(model, link=shap.links.logit)
-    shap_values = explainer.shap_values(pool)
-    top_val, top_rej = __get_contribution_from_shap_values(shap_values, data)
-    return top_val, top_rej
-
-
-def __get_contribution_from_shap_values(shap_values, data):
-    topk_validation_factor = []
-    topk_rejection_factor = []
-    data_keys = list(data.keys())
-    # for i in range(len(data)):
-    individual_shap_values = list(shap_values[0, :])
-    klargest = nlargest(3, individual_shap_values)
-    ksmallest = nsmallest(3, individual_shap_values)
-    topk_validation_factor = [
-        data_keys[individual_shap_values.index(max_val)] for max_val in klargest
-    ]
-
-    topk_rejection_factor = [
-        data_keys[individual_shap_values.index(min_val)] for min_val in ksmallest
-    ]
-    return topk_validation_factor, topk_rejection_factor
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py
deleted file mode 100644
index 804abbcc..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import contextlib
-import itertools
-
-from catboost import Pool
-from pcpapillon.core.compliance.extract_embedding import extract_embedding
-
-
-def preprocess(api_config, model_config, data, prepoc_models):
-    """
-    Preprocessing steps:
-        - prepare features
-        - convert json data to catboost pool
-    """
-    data_clean = prepare_features(data, api_config.preprocess_features_type)
-    data_w_emb = extract_embedding(
-        data_clean, api_config.features_to_extract_embedding, prepoc_models
-    )
-
-    scoring_features = list(
-        itertools.chain.from_iterable(model_config.catboost_features_types.values())
-    )
-    data_w_emb_clean = {}
-    for feature in scoring_features:
-        data_w_emb_clean[feature] = data_w_emb[feature]
-    pool = convert_data_to_catboost_pool(
-        data_w_emb_clean, model_config.catboost_features_types
-    )
-    return pool, data_w_emb
-
-
-def prepare_features(data, params):
-    """
-    Prepare features:
-        - Fill integer null values with 0
-        - Fill string null values with "none"
-        - Convert boolean columns to int
-    """
-    with contextlib.suppress(KeyError):
-        del data["offer_id"]
-
-    for key in data:
-        if key in params["text_features"]:
-            data[key] = "" if data[key] is None else str(data[key])
-        if key in params["numerical_features"]:
-            data[key] = 0 if data[key] is None else int(data[key])
-    if "macro_text" in params:
-        semantic_content = " ".join(
-            [semantic_feature.lower() for semantic_feature in params["macro_text"]]
-        )
-        data["semantic_content"] = semantic_content
-    return data
-
-
-def convert_data_to_catboost_pool(data, features_type_dict):
-    """
-    Convert json data to catboost pool:
-    inputs:
-        - Features names: list of features name (same order as list of features)
-        - cat_features: list of categorical features names
-        - text_features: list of text features names
-        - embedding_features: list of embedding features names
-    - output:
-        - catboost pool
-    """
-    data_input = [list(data.values())]
-    pool = Pool(
-        data=data_input,
-        feature_names=list(data.keys()),
-        cat_features=features_type_dict["cat_features"],
-        text_features=features_type_dict["text_features"],
-        embedding_features=features_type_dict["embedding_features"],
-    )
-    return pool
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/compliance_model.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance_model.py
similarity index 100%
rename from apps/fraud/compliance/api/src/pcpapillon/core/compliance/compliance_model.py
rename to apps/fraud/compliance/api/src/pcpapillon/core/compliance_model.py
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation/offer_categorisation_model.py b/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py
similarity index 100%
rename from apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation/offer_categorisation_model.py
rename to apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py
diff --git a/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py b/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py
index a3f6da30..61a7bd90 100644
--- a/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py
+++ b/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, Depends
 from fastapi_versioning import version
 from main import custom_logger, setup_trace
-from pcpapillon.core.compliance.compliance_model import (
+from pcpapillon.core.compliance_model import (
     ComplianceModel,
 )
 from pcpapillon.utils.data_model import (
diff --git a/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py b/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py
index bc8c6fe0..e05223ac 100644
--- a/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py
+++ b/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, Depends
 from fastapi_versioning import version
 from main import custom_logger, setup_trace
-from pcpapillon.core.offer_categorisation.offer_categorisation_model import (
+from pcpapillon.core.offer_categorisation_model import (
     OfferCategorisationModel,
 )
 from pcpapillon.utils.data_model import (