add report

pirocheto · Nov 28, 2023 · 4dc969d · 4dc969d
1 parent 18327b5
commit 4dc969d
Show file tree

Hide file tree

Showing 16 changed files with 231 additions and 79 deletions.
diff --git a/.github/workflows/run_test.yaml b/.github/workflows/run_test.yaml
@@ -33,4 +33,10 @@ jobs:
       - name: Run test
         run: |
           source .venv/bin/activate
-          pytest
+          pytest
+      - name: Create Pull Request
+        run: |
+          poetry install --no-root --no-interaction --only report
+          source .venv/bin/activate
+          make report
+          cml comment create report.md
diff --git a/Makefile b/Makefile
@@ -1,10 +1,15 @@
+download_data:
+	dvc repro download_data
 
 pull_model:
 	dvc pull live/model/model.pkl live/model/model.onnx
 
 modelcard:
 	python src/modelcard.py
 
+report:
+	python src/report.py
+
 optuna_dashboard:
 	optuna-dashboard notebooks/optunalog/optuna.db
 
diff --git a/live/model/README.md b/live/model/README.md
@@ -19,7 +19,7 @@ datasets:
 
 The model predicts the probability that a URL is a phishing site.  
 To understand what phishing is, refer to the Wikipedia page:  
-[https://en.wikipedia.org/wiki/Phishing](https://en.wikipedia.org/wiki/Phishing)
+[https://en.wikipedia.org/wiki/Phishing](https://en.wikipedia.org/wiki/Phishing) 
 -- this is not a phishing link 😜
 
 - **Model type:** LinearSVM
@@ -29,8 +29,8 @@ To understand what phishing is, refer to the Wikipedia page:
 
 ## Evaluation
 
-| Metric    | Value    |
-| --------- | -------- |
+| Metric    |    Value |
+|-----------|----------|
 | roc_auc   | 0.986002 |
 | accuracy  | 0.949364 |
 | f1        | 0.94867  |
@@ -49,8 +49,7 @@ In addition to being lighter and faster, it can be utilized by languages support
 Below are some examples to get you started. For other languages, please refer to the ONNX documentation.
 
 <details>
-  <summary><b>Python</b> - ONNX - [recommended 👍]
-  </summary>
+  <summary><b>Python</b> - ONNX - [recommended 👍]</summary>
 
 ```python
 import numpy as np
@@ -81,54 +80,52 @@ for url, proba in zip(urls, results):
     print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f} %")
     print("----")
 
-
-
 ```
-
 </details>
 
 <details>
   <summary><b>NodeJS</b> - ONNX - [recommended 👍]</summary>
 
 ```javascript
-const ort = require("onnxruntime-node");
+const ort = require('onnxruntime-node');
 
 async function main() {
-  try {
-    // Make sure you have downloaded the model.onnx
-    // Creating an ONNX inference session with the specified model
-    const model_path = "./model.onnx";
-    const session = await ort.InferenceSession.create(model_path);
-
-    const urls = [
-      "https://en.wikipedia.org/wiki/Phishing",
-      "http//weird-website.com",
-    ];
-
-    // Creating an ONNX tensor from the input data
-    const tensor = new ort.Tensor("string", urls, [urls.length]);
-
-    // Executing the inference session with the input tensor
-    const results = await session.run({ inputs: tensor });
-    const probas = results["probabilities"].data;
-
-    // Displaying results for each URL
-    urls.forEach((url, index) => {
-      const proba = probas[index * 2 + 1];
-      const percent = (proba * 100).toFixed(2);
-
-      console.log(`URL: ${url}`);
-      console.log(`Likelihood of being a phishing site: ${percent}%`);
-      console.log("----");
-    });
-  } catch (e) {
-    console.log(`failed to inference ONNX model: ${e}.`);
-  }
-}
+
+    try {
+        // Make sure you have downloaded the model.onnx
+        // Creating an ONNX inference session with the specified model
+        const model_path = "./model.onnx";
+        const session = await ort.InferenceSession.create(model_path);
+
+        const urls = [
+            "https://en.wikipedia.org/wiki/Phishing",
+            "http//weird-website.com",
+        ]
+
+        // Creating an ONNX tensor from the input data
+        const tensor = new ort.Tensor('string', urls, [urls.length,]);
+
+        // Executing the inference session with the input tensor
+        const results = await session.run({"inputs": tensor});
+        const probas = results['probabilities'].data;
+
+        // Displaying results for each URL
+        urls.forEach((url, index) => {
+            const proba = probas[index * 2 + 1];
+            const percent = (proba * 100).toFixed(2);
+
+            console.log(`URL: ${url}`);
+            console.log(`Likelihood of being a phishing site: ${percent}%`);
+            console.log("----");
+        });
+
+    } catch (e) {
+        console.log(`failed to inference ONNX model: ${e}.`);
+    }
+};
 
 main();
 ```
-
 </details>
 
 <details>
@@ -182,7 +179,6 @@ main();
   </body>
 </html>
 ```
-
 </details>
 
 <details>
@@ -215,13 +211,4 @@ for url, proba in zip(urls, probas):
     print("----")
 
 ```
-
 </details>
-
-# Plots
-
-![calibration curve](../images/calibration_curve.png)
-![confusion matrix](../images/confusion_matrix.png)
-![precision recall curve](../images/precision_recall_curve.png)
-![roc curve](../images/roc_curve.png)
-![score distribution](../images/score_distribution.png)
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,6 +14,7 @@ jinja2 = "^3.1.2"
 seaborn = "^0.13.0"
 matplotlib = "^3.8.2"
 pandas = { extras = ["parquet"], version = "^2.1.3" }
+tabulate = "^0.9.0"
 
 [tool.poetry.group.test.dependencies]
 pytest = "^7.4.3"
@@ -25,6 +26,13 @@ scikit-learn = "^1.3.2"
 [tool.poetry.group.test]
 optional = true
 
+[tool.poetry.group.report.dependencies]
+jinja2 = "^3.1.2"
+tabulate = "^0.9.0"
+
+[tool.poetry.group.report]
+optional = true
+
 [tool.poetry.group.experiment.dependencies]
 xgboost = "^2.0.2"
 lightgbm = "^4.1.0"

diff --git a/report.md b/report.md
@@ -0,0 +1,36 @@
+## Metrics
+
+| Metric    |    Value |
+|-----------|----------|
+| roc_auc   | 0.986002 |
+| accuracy  | 0.949364 |
+| f1        | 0.94867  |
+| precision | 0.961853 |
+| recall    | 0.935843 |
+
+## Hyperparameters
+
+| Params         | Value                 |
+|----------------|-----------------------|
+| C              | 9.783081707940896     |
+| loss           | hinge                 |
+| lowercase      | True                  |
+| max_ngram_char | 5                     |
+| max_ngram_word | 2                     |
+| tol            | 0.0003837000703754547 |
+| use_idf        | False                 |
+
+## Model size
+
+| File       |   Size (Mo) |
+|------------|-------------|
+| model.onnx |    11.1121  |
+| model.pkl  |     7.18834 |
+
+## Plots
+
+![](live/images/confusion_matrix.png)
+![](live/images/calibration_curve.png)
+![](live/images/precision_recall_curve.png)
+![](live/images/roc_curve.png)
+![](live/images/score_distribution.png)
diff --git a/resources/modelcard/scripts/javascript/model.onnx b/resources/modelcard/scripts/javascript/model.onnx
diff --git a/...es/modelcard/scripts/javascript/README.md → resources/snippets/javascript/README.md b/...es/modelcard/scripts/javascript/README.md → resources/snippets/javascript/README.md
diff --git a/...s/modelcard/scripts/javascript/index.html → resources/snippets/javascript/index.html b/...s/modelcard/scripts/javascript/index.html → resources/snippets/javascript/index.html
diff --git a/resources/modelcard/scripts/nodejs/index.js → resources/snippets/nodejs/index.js b/resources/modelcard/scripts/nodejs/index.js → resources/snippets/nodejs/index.js
diff --git a/...ces/modelcard/scripts/python/load_onnx.py → resources/snippets/python/load_onnx.py b/...ces/modelcard/scripts/python/load_onnx.py → resources/snippets/python/load_onnx.py
diff --git a/...s/modelcard/scripts/python/load_pickle.py → resources/snippets/python/load_pickle.py b/...s/modelcard/scripts/python/load_pickle.py → resources/snippets/python/load_pickle.py
diff --git a/resources/modelcard/template.md.j2 → resources/templates/modelcard.md.j2 b/resources/modelcard/template.md.j2 → resources/templates/modelcard.md.j2
diff --git a/resources/templates/report.md.j2 b/resources/templates/report.md.j2
@@ -0,0 +1,15 @@
+## Metrics
+
+{{ metrics }}
+
+## Hyperparameters
+
+{{ hyperparams }}
+
+## Model size
+
+{{ sizes }}
+
+## Plots
+
+{{ plots }}
diff --git a/src/modelcard.py b/src/modelcard.py
@@ -24,7 +24,7 @@ def load_metrics(path: str) -> str:
 def load_code() -> dict:
     """Load code snippets from specified files."""
 
-    path = Path("resources/modelcard/scripts")
+    path = Path("resources/snippets")
 
     code = {
         "py": {
@@ -41,7 +41,7 @@ def load_code() -> dict:
 def render_modelcard(metrics: str, code: dict) -> str:
     """Render the model card using a Jinja2 template."""
 
-    template_str = Path("resources/modelcard/template.md.j2").read_text("utf8")
+    template_str = Path("resources/templates/modelcard.md.j2").read_text("utf8")
     template = Template(template_str)
 
     params = {"metrics": metrics, "code": code}