From ebf5701091fea52d7b6f998d1e3472654ed5e5a4 Mon Sep 17 00:00:00 2001
From: LaurentM Pass
Date: Wed, 28 Aug 2024 11:11:21 +0200
Subject: [PATCH] refactor: Remove compliance prediction files and move others

---
 .pre-commit-config.yaml                        |  2 +-
 .../pcpapillon/core/compliance/__init__.py     |  0
 .../core/compliance/extract_embedding.py       | 53 --------------
 .../src/pcpapillon/core/compliance/predict.py  | 64 ----------------
 .../pcpapillon/core/compliance/preprocess.py   | 73 -------------------
 .../core/{compliance => }/compliance_model.py  |  0
 .../offer_categorisation_model.py              |  0
 .../api/src/pcpapillon/views/compliance.py     |  2 +-
 .../pcpapillon/views/offer_categorisation.py   |  2 +-
 9 files changed, 3 insertions(+), 193 deletions(-)
 delete mode 100644 apps/fraud/compliance/api/src/pcpapillon/core/compliance/__init__.py
 delete mode 100644 apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py
 delete mode 100644 apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py
 delete mode 100644 apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py
 rename apps/fraud/compliance/api/src/pcpapillon/core/{compliance => }/compliance_model.py (100%)
 rename apps/fraud/compliance/api/src/pcpapillon/core/{offer_categorisation => }/offer_categorisation_model.py (100%)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7efd3a1c..1f22de6a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
         stages: [commit]
       - id: lint-check
         name: Linting (ruff)
-        entry: ruff check
+        entry: ruff check --fix
         language: system
         types: [python]
         stages: [commit]
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/__init__.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py
deleted file mode 100644
index 6b6310fb..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/extract_embedding.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import contextlib
-import io
-
-import numpy as np
-import requests
-from PIL import Image
-
-
-def extract_embedding(data, params, prepoc_models):
-    """
-    Extract embedding with pretrained models
-    Two types available:
-    - image :
-        - Input: list of urls
-    - text :
-        - Input: list of string
-    Params template:
-    [
-        {"name": "offer_name", "type": "text"},
-        {"name": "offer_description", "type": "text"},
-        {"name": "image_url", "type": "image"},
-    ]
-    """
-    for feature in params:
-        if feature["type"] == "image":
-            model = prepoc_models[feature["type"]]
-            url = data[feature["name"]]
-            data["image_embedding"] = _encode_img_from_url(model, url)
-            with contextlib.suppress(KeyError):
-                del data[feature["name"]]
-        if feature["type"] == "text":
-            model = prepoc_models[feature["type"]]
-            embedding = model.encode(data[feature["name"]])
-            data[f"""{feature["name"]}_embedding"""] = embedding
-
-    return data
-
-
-def _encode_img_from_url(model, url):
-    """
-    Encode image with pre-trained model from url
-
-    inputs:
-        - model : HugginFaces pre-trained model using Sentence-Transformers
-        - url : string of image url
-    """
-    offer_img_embs = []
-    try:
-        img_emb = model.encode(Image.open(io.BytesIO(requests.get(url).content)))
-        offer_img_embs = img_emb
-    except Exception:
-        offer_img_embs = np.array([0] * 512)
-    return offer_img_embs
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py
deleted file mode 100644
index cb82ff10..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/predict.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from heapq import nlargest, nsmallest
-
-import shap
-
-
-def get_prediction_and_main_contribution(model, data_w_emb, pool):
-    """
-    Prediction:
-        Predict validation/rejection probability for a given input as catboost pool
-        inputs:
-            - pool: Catboost pool with offer features
-            - model: Catboost custom trained model
-        outputs:
-            proba_val: offer validition probability
-            proba_rej: offer rejection probability (=1-proba_val)
-    Main contribution:
-        Extract prediction main contribution features from shap values
-        inputs:
-            - model: Catboost custom trained model
-            - data: json with offer features
-            - pool: Catboost with offer features
-        outputs:
-            top_val: main features contributing to increase validation probability
-            top_reg: main features contributing to reduce validation probability
-    """
-    proba_predicted = list(
-        model.predict(
-            pool,
-            prediction_type="Probability",
-            ntree_start=0,
-            ntree_end=0,
-            thread_count=1,
-            verbose=None,
-        )[0]
-    )
-    proba_rej = proba_predicted[0] * 100
-    proba_val = proba_predicted[1] * 100
-    top_val, top_rej = _get_prediction_main_contribution(model, data_w_emb, pool)
-    return proba_val, proba_rej, top_val, top_rej
-
-
-def _get_prediction_main_contribution(model, data, pool):
-    explainer = shap.Explainer(model, link=shap.links.logit)
-    shap_values = explainer.shap_values(pool)
-    top_val, top_rej = __get_contribution_from_shap_values(shap_values, data)
-    return top_val, top_rej
-
-
-def __get_contribution_from_shap_values(shap_values, data):
-    topk_validation_factor = []
-    topk_rejection_factor = []
-    data_keys = list(data.keys())
-    # for i in range(len(data)):
-    individual_shap_values = list(shap_values[0, :])
-    klargest = nlargest(3, individual_shap_values)
-    ksmallest = nsmallest(3, individual_shap_values)
-    topk_validation_factor = [
-        data_keys[individual_shap_values.index(max_val)] for max_val in klargest
-    ]
-
-    topk_rejection_factor = [
-        data_keys[individual_shap_values.index(min_val)] for min_val in ksmallest
-    ]
-    return topk_validation_factor, topk_rejection_factor
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py
deleted file mode 100644
index 804abbcc..00000000
--- a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/preprocess.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import contextlib
-import itertools
-
-from catboost import Pool
-from pcpapillon.core.compliance.extract_embedding import extract_embedding
-
-
-def preprocess(api_config, model_config, data, prepoc_models):
-    """
-    Preprocessing steps:
-        - prepare features
-        - convert json data to catboost pool
-    """
-    data_clean = prepare_features(data, api_config.preprocess_features_type)
-    data_w_emb = extract_embedding(
-        data_clean, api_config.features_to_extract_embedding, prepoc_models
-    )
-
-    scoring_features = list(
-        itertools.chain.from_iterable(model_config.catboost_features_types.values())
-    )
-    data_w_emb_clean = {}
-    for feature in scoring_features:
-        data_w_emb_clean[feature] = data_w_emb[feature]
-    pool = convert_data_to_catboost_pool(
-        data_w_emb_clean, model_config.catboost_features_types
-    )
-    return pool, data_w_emb
-
-
-def prepare_features(data, params):
-    """
-    Prepare features:
-        - Fill integer null values with 0
-        - Fill string null values with "none"
-        - Convert boolean columns to int
-    """
-    with contextlib.suppress(KeyError):
-        del data["offer_id"]
-
-    for key in data:
-        if key in params["text_features"]:
-            data[key] = "" if data[key] is None else str(data[key])
-        if key in params["numerical_features"]:
-            data[key] = 0 if data[key] is None else int(data[key])
-    if "macro_text" in params:
-        semantic_content = " ".join(
-            [semantic_feature.lower() for semantic_feature in params["macro_text"]]
-        )
-        data["semantic_content"] = semantic_content
-    return data
-
-
-def convert_data_to_catboost_pool(data, features_type_dict):
-    """
-    Convert json data to catboost pool:
-    inputs:
-        - Features names: list of features name (same order as list of features)
-        - cat_features: list of categorical features names
-        - text_features: list of text features names
-        - embedding_features: list of embedding features names
-    - output:
-        - catboost pool
-    """
-    data_input = [list(data.values())]
-    pool = Pool(
-        data=data_input,
-        feature_names=list(data.keys()),
-        cat_features=features_type_dict["cat_features"],
-        text_features=features_type_dict["text_features"],
-        embedding_features=features_type_dict["embedding_features"],
-    )
-    return pool
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/compliance/compliance_model.py b/apps/fraud/compliance/api/src/pcpapillon/core/compliance_model.py
similarity index 100%
rename from apps/fraud/compliance/api/src/pcpapillon/core/compliance/compliance_model.py
rename to apps/fraud/compliance/api/src/pcpapillon/core/compliance_model.py
diff --git a/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation/offer_categorisation_model.py b/apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py
similarity index 100%
rename from apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation/offer_categorisation_model.py
rename to apps/fraud/compliance/api/src/pcpapillon/core/offer_categorisation_model.py
diff --git a/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py b/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py
index a3f6da30..61a7bd90 100644
--- a/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py
+++ b/apps/fraud/compliance/api/src/pcpapillon/views/compliance.py
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, Depends
 from fastapi_versioning import version
 from main import custom_logger, setup_trace
-from pcpapillon.core.compliance.compliance_model import (
+from pcpapillon.core.compliance_model import (
     ComplianceModel,
 )
 from pcpapillon.utils.data_model import (
diff --git a/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py b/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py
index bc8c6fe0..e05223ac 100644
--- a/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py
+++ b/apps/fraud/compliance/api/src/pcpapillon/views/offer_categorisation.py
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, Depends
 from fastapi_versioning import version
 from main import custom_logger, setup_trace
-from pcpapillon.core.offer_categorisation.offer_categorisation_model import (
+from pcpapillon.core.offer_categorisation_model import (
     OfferCategorisationModel,
 )
 from pcpapillon.utils.data_model import (