* introducing Model object for better customization
* upgrade versions to mitigate ONNXRuntime security issue
asofter committed Mar 21, 2024
1 parent 0b50ca7 commit 683cc93
Showing 29 changed files with 364 additions and 439 deletions.
4 changes: 2 additions & 2 deletions docs/changelog.md
@@ -14,10 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `InvisibleText` scanner to allow control characters like `\n`, `\t`, etc.
 
 ### Changed
--
+- **[Breaking]**: Introducing `Model` object for better customization of the models.
 
 ### Removed
--
+- `model_kwargs` and `pipeline_kwargs` as they are part of the `Model` object.
 
 ## [0.3.10] - 2024-03-14
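In practice the breaking change is mechanical: settings that previously travelled as separate `model_kwargs`/`pipeline_kwargs` arguments now ride on a `Model` object. A minimal before/after sketch, assuming `Model` accepts a positional path and a `kwargs` field as the diffs below suggest (the `llm_guard/model.py` definition itself is not part of this commit view):

```python
from llm_guard.input_scanners import BanCompetitors
from llm_guard.model import Model

# Before (<= 0.3.10), per the removed parameters:
# scanner = BanCompetitors(
#     competitors=["Acme"],
#     model="tomaarsen/span-marker-bert-small-orgs",
#     model_kwargs={"low_cpu_mem_usage": True},
# )

# After this commit: the kwargs dict lives on the Model object and is
# forwarded as **model.kwargs when the scanner loads the weights.
model = Model(
    "tomaarsen/span-marker-bert-small-orgs",
    kwargs={"low_cpu_mem_usage": True},
)
scanner = BanCompetitors(competitors=["Acme"], model=model)
```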
11 changes: 4 additions & 7 deletions llm_guard/input_scanners/anonymize.py
@@ -56,12 +56,10 @@ def __init__(
         preamble: str = "",
         regex_patterns: Optional[List[Dict]] = None,
         use_faker: bool = False,
-        recognizer_conf: Optional[Dict] = DEBERTA_AI4PRIVACY_v2_CONF,
+        recognizer_conf: Optional[Dict] = None,
         threshold: float = 0.5,
         use_onnx: bool = False,
         language: str = "en",
-        model_kwargs: Optional[Dict] = None,
-        pipeline_kwargs: Optional[Dict] = None,
     ):
         """
         Initialize an instance of Anonymize class.
@@ -78,8 +76,6 @@ def __init__(
             threshold (float): Acceptance threshold. Default is 0.5.
             use_onnx (bool): Whether to use ONNX runtime for inference. Default is False.
             language (str): Language of the text to anonymize. Default is "en".
-            model_kwargs (Optional[Dict]): Keyword arguments passed to the model.
-            pipeline_kwargs (Optional[Dict]): Keyword arguments passed to the pipeline.
         """
 
         if language not in ALL_SUPPORTED_LANGUAGES:
@@ -108,12 +104,13 @@ def __init__(
         self._threshold = threshold
         self._language = language
 
+        if not recognizer_conf:
+            recognizer_conf = DEBERTA_AI4PRIVACY_v2_CONF
+
         transformers_recognizer = get_transformers_recognizer(
             recognizer_conf=recognizer_conf,
             use_onnx=use_onnx,
             supported_language=language,
-            model_kwargs=model_kwargs,
-            pipeline_kwargs=pipeline_kwargs,
         )
 
         self._analyzer = get_analyzer(
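Because `recognizer_conf` now defaults to `None` and falls back to `DEBERTA_AI4PRIVACY_v2_CONF` inside the constructor, passing a different conf remains the customization hook. A usage sketch; the `Vault`-first signature and the tuple returned by `scan` come from the project's documentation rather than this diff:

```python
from llm_guard.input_scanners import Anonymize
from llm_guard.input_scanners.anonymize_helpers.ner_mapping import BERT_BASE_NER_CONF
from llm_guard.vault import Vault

vault = Vault()  # stores the placeholders so Deanonymize can restore them later
scanner = Anonymize(vault, recognizer_conf=BERT_BASE_NER_CONF, use_onnx=True)
sanitized_prompt, is_valid, risk_score = scanner.scan("Contact John Smith at john@example.com")
```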
12 changes: 3 additions & 9 deletions llm_guard/input_scanners/anonymize_helpers/analyzer.py
@@ -1,5 +1,5 @@
 import copy
-from typing import Dict, List, Optional, Sequence
+from typing import Dict, List, Sequence
 
 import spacy
 from presidio_analyzer import (
@@ -109,8 +109,6 @@ def get_transformers_recognizer(
     recognizer_conf: Dict,
     use_onnx: bool = False,
     supported_language: str = "en",
-    model_kwargs: Optional[Dict] = None,
-    pipeline_kwargs: Optional[Dict] = None,
 ) -> EntityRecognizer:
     """
     This function loads a transformers recognizer given a recognizer configuration.
@@ -119,20 +117,16 @@
         recognizer_conf (Dict): Configuration to recognize PII data.
         use_onnx (bool): Whether to use the ONNX version of the model. Default is False.
         supported_language (str): The language to use for the recognizer. Default is "en".
-        model_kwargs (Optional[Dict]): Keyword arguments passed to the model.
-        pipeline_kwargs (Optional[Dict]): Keyword arguments passed to the pipeline.
     """
-    model_path = recognizer_conf.get("DEFAULT_MODEL_PATH")
+    model = recognizer_conf.get("DEFAULT_MODEL")
     supported_entities = recognizer_conf.get("PRESIDIO_SUPPORTED_ENTITIES")
     transformers_recognizer = TransformersRecognizer(
-        model_path=model_path,
+        model=model,
         supported_entities=supported_entities,
         supported_language=supported_language,
     )
     transformers_recognizer.load_transformer(
         use_onnx=use_onnx,
-        model_kwargs=model_kwargs,
-        pipeline_kwargs=pipeline_kwargs,
+        **recognizer_conf,
     )
     return transformers_recognizer
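The `**recognizer_conf` spread is the key move here: every top-level key of the conf dict now lands in `load_transformer(**kwargs)` and is read back via `kwargs.get(...)`, so the dictionaries in `ner_mapping.py` double as the recognizer's whole configuration surface. A trimmed, partly hypothetical conf to illustrate (real confs carry more keys, such as `DEFAULT_EXPLANATION` and the entity mappings):

```python
from llm_guard.input_scanners.anonymize_helpers.analyzer import get_transformers_recognizer
from llm_guard.model import Model

conf = {
    "PRESIDIO_SUPPORTED_ENTITIES": ["PERSON", "LOCATION"],
    "DEFAULT_MODEL": Model(
        path="dslim/bert-base-NER",
        onnx_path="dslim/bert-base-NER",
        onnx_subfolder="onnx",
    ),
    "CHUNK_SIZE": 600,  # read back as kwargs.get("CHUNK_SIZE", 600)
    "LABELS_TO_IGNORE": ["O"],
}
recognizer = get_transformers_recognizer(recognizer_conf=conf, use_onnx=False)
```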
36 changes: 26 additions & 10 deletions llm_guard/input_scanners/anonymize_helpers/ner_mapping.py
@@ -1,11 +1,16 @@
+from llm_guard.model import Model
+
 BERT_BASE_NER_CONF = {
     "PRESIDIO_SUPPORTED_ENTITIES": [
         "LOCATION",
         "PERSON",
         "ORGANIZATION",
     ],
-    "DEFAULT_MODEL_PATH": "dslim/bert-base-NER",
-    "ONNX_MODEL_PATH": "dslim/bert-base-NER",
+    "DEFAULT_MODEL": Model(
+        path="dslim/bert-base-NER",
+        onnx_path="dslim/bert-base-NER",
+        onnx_subfolder="onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-base-NER NER model",
     "SUB_WORD_AGGREGATION": "simple",
@@ -33,8 +38,11 @@
         "PERSON",
         "ORGANIZATION",
     ],
-    "DEFAULT_MODEL_PATH": "dslim/bert-large-NER",
-    "ONNX_MODEL_PATH": "dslim/bert-large-NER",
+    "DEFAULT_MODEL": Model(
+        path="dslim/bert-large-NER",
+        onnx_path="dslim/bert-large-NER",
+        onnx_subfolder="onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-large-NER NER model",
     "SUB_WORD_AGGREGATION": "simple",
@@ -62,8 +70,10 @@
         "PERSON",
         "ORGANIZATION",
     ],
-    "DEFAULT_MODEL_PATH": "gyr66/bert-base-chinese-finetuned-ner",
-    "ONNX_MODEL_PATH": "ProtectAI/gyr66-bert-base-chinese-finetuned-ner-onnx",
+    "DEFAULT_MODEL": Model(
+        path="gyr66/bert-base-chinese-finetuned-ner",
+        onnx_path="ProtectAI/gyr66-bert-base-chinese-finetuned-ner-onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the gyr66/bert-base-chinese-finetuned-ner NER model",
     "SUB_WORD_AGGREGATION": "simple",
@@ -99,8 +109,11 @@
         "IP_ADDRESS",
         "URL",
     ],
-    "DEFAULT_MODEL_PATH": "Isotonic/distilbert_finetuned_ai4privacy_v2",
-    "ONNX_MODEL_PATH": "Isotonic/distilbert_finetuned_ai4privacy_v2",
+    "DEFAULT_MODEL": Model(
+        path="Isotonic/distilbert_finetuned_ai4privacy_v2",
+        onnx_path="Isotonic/distilbert_finetuned_ai4privacy_v2",
+        subfolder="onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/distilbert_finetuned_ai4privacy_v2 NER model",
     "SUB_WORD_AGGREGATION": "simple",
@@ -186,8 +199,11 @@
         "IP_ADDRESS",
         "URL",
     ],
-    "DEFAULT_MODEL_PATH": "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
-    "ONNX_MODEL_PATH": "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
+    "DEFAULT_MODEL": Model(
+        path="Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
+        onnx_path="Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
+        subfolder="onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2 NER model",
     "SUB_WORD_AGGREGATION": "simple",
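Taken together, these conf entries pin down the surface of the new `Model` object. A plausible sketch, reconstructed purely from how this commit uses it; field names and defaults are inferred, and the real `llm_guard/model.py` may differ:

```python
from dataclasses import dataclass, field
from typing import Dict, Optional

@dataclass
class Model:
    # Hypothetical reconstruction from usage in this commit.
    path: str                        # Model("tomaarsen/...") passes the path positionally
    subfolder: str = ""              # forwarded to from_pretrained(subfolder=...)
    onnx_path: Optional[str] = None  # used by ORTModelForTokenClassification.from_pretrained
    onnx_subfolder: str = ""         # subfolder holding the ONNX weights
    kwargs: Dict = field(default_factory=dict)           # spread as **model.kwargs
    pipeline_kwargs: Dict = field(default_factory=dict)  # spread as **model.pipeline_kwargs
```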
@@ -1,10 +1,11 @@
 import copy
-from typing import Dict, List, Optional
+from typing import List, Optional
 
 from presidio_analyzer import AnalysisExplanation, EntityRecognizer, RecognizerResult
 from presidio_analyzer.nlp_engine import NlpArtifacts
 from transformers import TokenClassificationPipeline
 
+from llm_guard.model import Model
 from llm_guard.transformers_helpers import device, get_tokenizer, is_onnx_supported
 from llm_guard.util import get_logger, lazy_load_dep, split_text_to_word_chunks
@@ -52,7 +53,7 @@ def load(self) -> None:
 
     def __init__(
         self,
-        model_path: Optional[str] = None,
+        model: Model,
         pipeline: Optional[TokenClassificationPipeline] = None,
         supported_entities: Optional[List[str]] = None,
         supported_language: str = "en",
@@ -61,10 +62,10 @@ def __init__(
             supported_entities = BERT_BASE_NER_CONF["PRESIDIO_SUPPORTED_ENTITIES"]
         super().__init__(
             supported_entities=supported_entities,
-            name=f"Transformers model {model_path}",
+            name=f"Transformers model {model.path}",
         )
 
-        self.model_path = model_path
+        self.model = model
         self.pipeline = pipeline
         self.is_loaded = False
@@ -77,24 +78,17 @@ def __init__(
         self.chunk_length = None
         self.id_entity_name = None
         self.id_score_reduction = None
-        self.onnx_model_path = None
         self.supported_language = supported_language
 
     def load_transformer(
         self,
         use_onnx: bool = False,
-        model_kwargs: Optional[Dict] = None,
-        pipeline_kwargs: Optional[Dict] = None,
         **kwargs,
     ) -> None:
         """Load external configuration parameters and set default values.
 
         :param use_onnx: flag to use ONNX optimized model
         :type use_onnx: bool, optional
-        :param model_kwargs: define default values for model attributes
-        :type model_kwargs: Optional[Dict], optional
-        :param pipeline_kwargs: define default values for pipeline attributes
-        :type pipeline_kwargs: Optional[Dict], optional
         :param kwargs: define default values for class attributes and modify pipeline behavior
         **DATASET_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from dataset format to Presidio format
         **MODEL_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from chosen model format to Presidio format
@@ -118,66 +112,57 @@ def load_transformer(
         self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
         self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
         self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5)
-        self.onnx_model_path = kwargs.get("ONNX_MODEL_PATH", None)
 
         if not self.pipeline:
-            if not self.model_path:
-                self.model_path = "dslim/bert-base-NER"
-                self.onnx_model_path = "optimum/bert-base-NER"
+            if not self.model:
+                self.model = Model(
+                    path="dslim/bert-base-NER",
+                    onnx_path="dslim/bert-base-NER",
+                    subfolder="onnx",
+                )
                 LOGGER.warning(
-                    "Both 'model' and 'model_path' arguments are None. Using default",
-                    model_path=self.model_path,
+                    "'model' argument is None. Using default",
+                    model=self.model,
                 )
 
-            self._load_pipeline(
-                use_onnx=use_onnx, model_kwargs=model_kwargs, pipeline_kwargs=pipeline_kwargs
-            )
+            self._load_pipeline(
+                use_onnx=use_onnx,
+            )
 
     def _load_pipeline(
         self,
         use_onnx: bool = False,
-        model_kwargs: Optional[Dict] = None,
-        pipeline_kwargs: Optional[Dict] = None,
     ) -> None:
         """Initialize NER transformers_rec pipeline using the model provided"""
-        model = self.model_path
-        onnx_model = self.onnx_model_path
-        pipeline_kwargs = pipeline_kwargs or {}
-        model_kwargs = model_kwargs or {}
 
         transformers = lazy_load_dep("transformers")
-        tf_tokenizer = get_tokenizer(model, **model_kwargs)
+        tf_tokenizer = get_tokenizer(self.model)
 
         if use_onnx and is_onnx_supported() is False:
             LOGGER.warning("ONNX is not supported on this machine. Using PyTorch instead of ONNX.")
             use_onnx = False
 
         if use_onnx:
-            subfolder = "onnx" if onnx_model == model else ""
-            if onnx_model is not None:
-                model = onnx_model
-
             optimum_onnxruntime = lazy_load_dep(
                 "optimum.onnxruntime",
                 "optimum[onnxruntime]" if device().type != "cuda" else "optimum[onnxruntime-gpu]",
             )
             tf_tokenizer.model_input_names = ["input_ids", "attention_mask"]
             tf_model = optimum_onnxruntime.ORTModelForTokenClassification.from_pretrained(
-                model,
-                export=onnx_model is None,
-                subfolder=subfolder,
+                self.model.onnx_path,
+                export=False,
+                subfolder=self.model.onnx_subfolder,
                 provider="CUDAExecutionProvider"
                 if device().type == "cuda"
                 else "CPUExecutionProvider",
                 use_io_binding=True if device().type == "cuda" else False,
-                **model_kwargs,
+                **self.model.kwargs,
             )
-            LOGGER.debug("Initialized NER ONNX model", model=model, device=device())
+            LOGGER.debug("Initialized NER ONNX model", model=self.model, device=device())
         else:
             tf_model = transformers.AutoModelForTokenClassification.from_pretrained(
-                model, **model_kwargs
+                self.model.path, subfolder=self.model.subfolder, **self.model.kwargs
             )
-            LOGGER.debug("Initialized NER model", model=model, device=device())
+            LOGGER.debug("Initialized NER model", model=self.model, device=device())
 
         self.pipeline = transformers.pipeline(
             "ner",
@@ -189,7 +174,7 @@ def _load_pipeline(
             aggregation_strategy=self.aggregation_mechanism,
             framework="pt",
             ignore_labels=self.ignore_labels,
-            **pipeline_kwargs,
+            **self.model.pipeline_kwargs,
         )
 
         self.is_loaded = True
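The ONNX path resolution is markedly simpler after this hunk: rather than inferring `subfolder` and `export` from whether an `ONNX_MODEL_PATH` happened to equal the regular path, the recognizer now trusts the `Model` object outright and never exports on the fly (`export=False`). A standalone sketch of the two loading branches, using the repo names from `ner_mapping.py` above (`Model` field defaults are assumed):

```python
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import AutoModelForTokenClassification

from llm_guard.model import Model

model = Model(
    path="dslim/bert-base-NER",
    onnx_path="dslim/bert-base-NER",
    onnx_subfolder="onnx",
)

# ONNX branch: load pre-exported weights from the dedicated path/subfolder.
ort_model = ORTModelForTokenClassification.from_pretrained(
    model.onnx_path,
    export=False,
    subfolder=model.onnx_subfolder,
    provider="CPUExecutionProvider",
)

# PyTorch branch: a plain transformers load from the regular path/subfolder.
pt_model = AutoModelForTokenClassification.from_pretrained(
    model.path, subfolder=model.subfolder
)
```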
15 changes: 7 additions & 8 deletions llm_guard/input_scanners/ban_competitors.py
@@ -1,15 +1,16 @@
-from typing import Dict, Optional, Sequence
+from typing import Optional, Sequence
 
 from presidio_anonymizer.core.text_replace_builder import TextReplaceBuilder
 
+from llm_guard.model import Model
 from llm_guard.util import device, get_logger, lazy_load_dep
 
 from .base import Scanner
 
 LOGGER = get_logger()
 
-MODEL_BASE = "tomaarsen/span-marker-bert-base-orgs"
-MODEL_SMALL = "tomaarsen/span-marker-bert-small-orgs"
+MODEL_BASE = Model("tomaarsen/span-marker-bert-base-orgs")
+MODEL_SMALL = Model("tomaarsen/span-marker-bert-small-orgs")
 
 
 class BanCompetitors(Scanner):
@@ -25,8 +26,7 @@ def __init__(
         *,
         threshold: float = 0.5,
         redact: bool = True,
-        model: Optional[str] = None,
-        model_kwargs: Optional[Dict] = None,
+        model: Optional[Model] = None,
     ):
         """
         Initialize BanCompetitors object.
@@ -35,8 +35,7 @@ def __init__(
             competitors (Sequence[str]): List of competitors to detect.
             threshold (float, optional): Threshold to determine if a competitor is present in the prompt. Default is 0.5.
             redact (bool, optional): Whether to redact the competitor name. Default is True.
-            model (str, optional): Model to use for named-entity recognition. Default is BASE model.
-            model_kwargs (Dict, optional): Keyword arguments passed to the model.
+            model (Model, optional): Model to use for named-entity recognition. Default is BASE model.
 
         Raises:
             ValueError: If no competitors are provided.
@@ -50,7 +49,7 @@ def __init__(
 
         span_marker = lazy_load_dep("span_marker", "span-marker")
         self._ner_pipeline = span_marker.SpanMarkerModel.from_pretrained(
-            model, labels=["ORG"], **(model_kwargs or {})
+            model.path, labels=["ORG"], **model.kwargs
         )
 
         if device().type == "cuda":
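With the constants now wrapping `Model`, selecting the smaller span-marker variant is a one-liner. A usage sketch; the `scan` return tuple follows the scanner convention documented by the project, not this diff:

```python
from llm_guard.input_scanners import BanCompetitors
from llm_guard.input_scanners.ban_competitors import MODEL_SMALL

scanner = BanCompetitors(
    competitors=["Acme Corp", "Globex"],
    threshold=0.5,
    redact=True,
    model=MODEL_SMALL,  # defaults to MODEL_BASE when omitted
)
sanitized_prompt, is_valid, risk_score = scanner.scan("How does Acme Corp price its product?")
```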