* introducing Model object for better customization
* upgrade versions to mitigate ONNXRuntime security issue
asofter committed Mar 21, 2024
1 parent 0b50ca7 commit 683cc93
Showing 29 changed files with 364 additions and 439 deletions.
4 changes: 2 additions & 2 deletions docs/changelog.md
@@ -14,10 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `InvisibleText` scanner to allow control characters like `\n`, `\t`, etc.
 
 ### Changed
--
+- **[Breaking]**: Introducing `Model` object for better customization of the models.
 
 ### Removed
--
+- `model_kwargs` and `pipeline_kwargs` as they are part of the `Model` object.
 
 ## [0.3.10] - 2024-03-14
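In practice the breaking change is mechanical: settings that previously travelled as separate `model_kwargs`/`pipeline_kwargs` arguments now ride on a `Model` object. A minimal before/after sketch, assuming `Model` accepts a positional path and a `kwargs` field as the diffs below suggest (the `llm_guard/model.py` definition itself is not part of this commit view):

```python
from llm_guard.input_scanners import BanCompetitors
from llm_guard.model import Model

# Before (<= 0.3.10), per the removed parameters:
# scanner = BanCompetitors(
#     competitors=["Acme"],
#     model="tomaarsen/span-marker-bert-small-orgs",
#     model_kwargs={"low_cpu_mem_usage": True},
# )

# After this commit: the kwargs dict lives on the Model object and is
# forwarded as **model.kwargs when the scanner loads the weights.
model = Model(
    "tomaarsen/span-marker-bert-small-orgs",
    kwargs={"low_cpu_mem_usage": True},
)
scanner = BanCompetitors(competitors=["Acme"], model=model)
```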
11 changes: 4 additions & 7 deletions llm_guard/input_scanners/anonymize.py
@@ -56,12 +56,10 @@ def __init__(
         preamble: str = "",
         regex_patterns: Optional[List[Dict]] = None,
         use_faker: bool = False,
-        recognizer_conf: Optional[Dict] = DEBERTA_AI4PRIVACY_v2_CONF,
+        recognizer_conf: Optional[Dict] = None,
         threshold: float = 0.5,
         use_onnx: bool = False,
         language: str = "en",
-        model_kwargs: Optional[Dict] = None,
-        pipeline_kwargs: Optional[Dict] = None,
     ):
         """
         Initialize an instance of Anonymize class.
@@ -78,8 +76,6 @@ def __init__(
             threshold (float): Acceptance threshold. Default is 0.5.
             use_onnx (bool): Whether to use ONNX runtime for inference. Default is False.
             language (str): Language of the text to anonymize. Default is "en".
-            model_kwargs (Optional[Dict]): Keyword arguments passed to the model.
-            pipeline_kwargs (Optional[Dict]): Keyword arguments passed to the pipeline.
         """
 
         if language not in ALL_SUPPORTED_LANGUAGES:
@@ -108,12 +104,13 @@ def __init__(
         self._threshold = threshold
         self._language = language
 
+        if not recognizer_conf:
+            recognizer_conf = DEBERTA_AI4PRIVACY_v2_CONF
+
         transformers_recognizer = get_transformers_recognizer(
             recognizer_conf=recognizer_conf,
             use_onnx=use_onnx,
             supported_language=language,
-            model_kwargs=model_kwargs,
-            pipeline_kwargs=pipeline_kwargs,
         )
 
         self._analyzer = get_analyzer(
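Because `recognizer_conf` now defaults to `None` and falls back to `DEBERTA_AI4PRIVACY_v2_CONF` inside the constructor, passing a different conf remains the customization hook. A usage sketch; the `Vault`-first signature and the tuple returned by `scan` come from the project's documentation rather than this diff:

```python
from llm_guard.input_scanners import Anonymize
from llm_guard.input_scanners.anonymize_helpers.ner_mapping import BERT_BASE_NER_CONF
from llm_guard.vault import Vault

vault = Vault()  # stores the placeholders so Deanonymize can restore them later
scanner = Anonymize(vault, recognizer_conf=BERT_BASE_NER_CONF, use_onnx=True)
sanitized_prompt, is_valid, risk_score = scanner.scan("Contact John Smith at john@example.com")
```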
12 changes: 3 additions & 9 deletions llm_guard/input_scanners/anonymize_helpers/analyzer.py
@@ -1,5 +1,5 @@
 import copy
-from typing import Dict, List, Optional, Sequence
+from typing import Dict, List, Sequence
 
 import spacy
 from presidio_analyzer import (
@@ -109,8 +109,6 @@ def get_transformers_recognizer(
     recognizer_conf: Dict,
     use_onnx: bool = False,
     supported_language: str = "en",
-    model_kwargs: Optional[Dict] = None,
-    pipeline_kwargs: Optional[Dict] = None,
 ) -> EntityRecognizer:
     """
     This function loads a transformers recognizer given a recognizer configuration.
@@ -119,20 +117,16 @@
         recognizer_conf (Dict): Configuration to recognize PII data.
         use_onnx (bool): Whether to use the ONNX version of the model. Default is False.
         supported_language (str): The language to use for the recognizer. Default is "en".
-        model_kwargs (Optional[Dict]): Keyword arguments passed to the model.
-        pipeline_kwargs (Optional[Dict]): Keyword arguments passed to the pipeline.
     """
-    model_path = recognizer_conf.get("DEFAULT_MODEL_PATH")
+    model = recognizer_conf.get("DEFAULT_MODEL")
     supported_entities = recognizer_conf.get("PRESIDIO_SUPPORTED_ENTITIES")
     transformers_recognizer = TransformersRecognizer(
-        model_path=model_path,
+        model=model,
         supported_entities=supported_entities,
         supported_language=supported_language,
     )
     transformers_recognizer.load_transformer(
         use_onnx=use_onnx,
-        model_kwargs=model_kwargs,
-        pipeline_kwargs=pipeline_kwargs,
+        **recognizer_conf,
     )
     return transformers_recognizer
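The `**recognizer_conf` spread is the key move here: every top-level key of the conf dict now lands in `load_transformer(**kwargs)` and is read back via `kwargs.get(...)`, so the dictionaries in `ner_mapping.py` double as the recognizer's whole configuration surface. A trimmed, partly hypothetical conf to illustrate (real confs carry more keys, such as `DEFAULT_EXPLANATION` and the entity mappings):

```python
from llm_guard.input_scanners.anonymize_helpers.analyzer import get_transformers_recognizer
from llm_guard.model import Model

conf = {
    "PRESIDIO_SUPPORTED_ENTITIES": ["PERSON", "LOCATION"],
    "DEFAULT_MODEL": Model(
        path="dslim/bert-base-NER",
        onnx_path="dslim/bert-base-NER",
        onnx_subfolder="onnx",
    ),
    "CHUNK_SIZE": 600,  # read back as kwargs.get("CHUNK_SIZE", 600)
    "LABELS_TO_IGNORE": ["O"],
}
recognizer = get_transformers_recognizer(recognizer_conf=conf, use_onnx=False)
```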
36 changes: 26 additions & 10 deletions llm_guard/input_scanners/anonymize_helpers/ner_mapping.py
@@ -1,11 +1,16 @@
+from llm_guard.model import Model
+
 BERT_BASE_NER_CONF = {
     "PRESIDIO_SUPPORTED_ENTITIES": [
         "LOCATION",
         "PERSON",
         "ORGANIZATION",
     ],
-    "DEFAULT_MODEL_PATH": "dslim/bert-base-NER",
-    "ONNX_MODEL_PATH": "dslim/bert-base-NER",
+    "DEFAULT_MODEL": Model(
+        path="dslim/bert-base-NER",
+        onnx_path="dslim/bert-base-NER",
+        onnx_subfolder="onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-base-NER NER model",
     "SUB_WORD_AGGREGATION": "simple",
@@ -33,8 +38,11 @@
         "PERSON",
         "ORGANIZATION",
     ],
-    "DEFAULT_MODEL_PATH": "dslim/bert-large-NER",
-    "ONNX_MODEL_PATH": "dslim/bert-large-NER",
+    "DEFAULT_MODEL": Model(
+        path="dslim/bert-large-NER",
+        onnx_path="dslim/bert-large-NER",
+        onnx_subfolder="onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-large-NER NER model",
     "SUB_WORD_AGGREGATION": "simple",
@@ -62,8 +70,10 @@
         "PERSON",
         "ORGANIZATION",
     ],
-    "DEFAULT_MODEL_PATH": "gyr66/bert-base-chinese-finetuned-ner",
-    "ONNX_MODEL_PATH": "ProtectAI/gyr66-bert-base-chinese-finetuned-ner-onnx",
+    "DEFAULT_MODEL": Model(
+        path="gyr66/bert-base-chinese-finetuned-ner",
+        onnx_path="ProtectAI/gyr66-bert-base-chinese-finetuned-ner-onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the gyr66/bert-base-chinese-finetuned-ner NER model",
     "SUB_WORD_AGGREGATION": "simple",
@@ -99,8 +109,11 @@
         "IP_ADDRESS",
         "URL",
     ],
-    "DEFAULT_MODEL_PATH": "Isotonic/distilbert_finetuned_ai4privacy_v2",
-    "ONNX_MODEL_PATH": "Isotonic/distilbert_finetuned_ai4privacy_v2",
+    "DEFAULT_MODEL": Model(
+        path="Isotonic/distilbert_finetuned_ai4privacy_v2",
+        onnx_path="Isotonic/distilbert_finetuned_ai4privacy_v2",
+        subfolder="onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/distilbert_finetuned_ai4privacy_v2 NER model",
     "SUB_WORD_AGGREGATION": "simple",
@@ -186,8 +199,11 @@
         "IP_ADDRESS",
         "URL",
     ],
-    "DEFAULT_MODEL_PATH": "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
-    "ONNX_MODEL_PATH": "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
+    "DEFAULT_MODEL": Model(
+        path="Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
+        onnx_path="Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
+        subfolder="onnx",
+    ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2 NER model",
     "SUB_WORD_AGGREGATION": "simple",
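Taken together, these conf entries pin down the surface of the new `Model` object. A plausible sketch, reconstructed purely from how this commit uses it; field names and defaults are inferred, and the real `llm_guard/model.py` may differ:

```python
from dataclasses import dataclass, field
from typing import Dict, Optional

@dataclass
class Model:
    # Hypothetical reconstruction from usage in this commit.
    path: str                        # Model("tomaarsen/...") passes the path positionally
    subfolder: str = ""              # forwarded to from_pretrained(subfolder=...)
    onnx_path: Optional[str] = None  # used by ORTModelForTokenClassification.from_pretrained
    onnx_subfolder: str = ""         # subfolder holding the ONNX weights
    kwargs: Dict = field(default_factory=dict)           # spread as **model.kwargs
    pipeline_kwargs: Dict = field(default_factory=dict)  # spread as **model.pipeline_kwargs
```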
@@ -1,10 +1,11 @@
 import copy
-from typing import Dict, List, Optional
+from typing import List, Optional
 
 from presidio_analyzer import AnalysisExplanation, EntityRecognizer, RecognizerResult
 from presidio_analyzer.nlp_engine import NlpArtifacts
 from transformers import TokenClassificationPipeline
 
+from llm_guard.model import Model
 from llm_guard.transformers_helpers import device, get_tokenizer, is_onnx_supported
 from llm_guard.util import get_logger, lazy_load_dep, split_text_to_word_chunks
@@ -52,7 +53,7 @@ def load(self) -> None:
 
     def __init__(
         self,
-        model_path: Optional[str] = None,
+        model: Model,
         pipeline: Optional[TokenClassificationPipeline] = None,
         supported_entities: Optional[List[str]] = None,
         supported_language: str = "en",
@@ -61,10 +62,10 @@ def __init__(
             supported_entities = BERT_BASE_NER_CONF["PRESIDIO_SUPPORTED_ENTITIES"]
         super().__init__(
             supported_entities=supported_entities,
-            name=f"Transformers model {model_path}",
+            name=f"Transformers model {model.path}",
         )
 
-        self.model_path = model_path
+        self.model = model
         self.pipeline = pipeline
         self.is_loaded = False
@@ -77,24 +78,17 @@ def __init__(
         self.chunk_length = None
         self.id_entity_name = None
         self.id_score_reduction = None
-        self.onnx_model_path = None
         self.supported_language = supported_language
 
     def load_transformer(
         self,
         use_onnx: bool = False,
-        model_kwargs: Optional[Dict] = None,
-        pipeline_kwargs: Optional[Dict] = None,
         **kwargs,
     ) -> None:
         """Load external configuration parameters and set default values.
 
         :param use_onnx: flag to use ONNX optimized model
         :type use_onnx: bool, optional
-        :param model_kwargs: define default values for model attributes
-        :type model_kwargs: Optional[Dict], optional
-        :param pipeline_kwargs: define default values for pipeline attributes
-        :type pipeline_kwargs: Optional[Dict], optional
         :param kwargs: define default values for class attributes and modify pipeline behavior
         **DATASET_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from dataset format to Presidio format
         **MODEL_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from chosen model format to Presidio format
@@ -118,66 +112,57 @@ def load_transformer(
         self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
         self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
         self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5)
-        self.onnx_model_path = kwargs.get("ONNX_MODEL_PATH", None)
 
         if not self.pipeline:
-            if not self.model_path:
-                self.model_path = "dslim/bert-base-NER"
-                self.onnx_model_path = "optimum/bert-base-NER"
+            if not self.model:
+                self.model = Model(
+                    path="dslim/bert-base-NER",
+                    onnx_path="dslim/bert-base-NER",
+                    subfolder="onnx",
+                )
                 LOGGER.warning(
-                    "Both 'model' and 'model_path' arguments are None. Using default",
-                    model_path=self.model_path,
+                    "'model' argument is None. Using default",
+                    model=self.model,
                 )
 
-            self._load_pipeline(
-                use_onnx=use_onnx, model_kwargs=model_kwargs, pipeline_kwargs=pipeline_kwargs
-            )
+            self._load_pipeline(
+                use_onnx=use_onnx,
+            )
 
     def _load_pipeline(
         self,
         use_onnx: bool = False,
-        model_kwargs: Optional[Dict] = None,
-        pipeline_kwargs: Optional[Dict] = None,
     ) -> None:
         """Initialize NER transformers_rec pipeline using the model provided"""
-        model = self.model_path
-        onnx_model = self.onnx_model_path
-        pipeline_kwargs = pipeline_kwargs or {}
-        model_kwargs = model_kwargs or {}
 
         transformers = lazy_load_dep("transformers")
-        tf_tokenizer = get_tokenizer(model, **model_kwargs)
+        tf_tokenizer = get_tokenizer(self.model)
 
         if use_onnx and is_onnx_supported() is False:
             LOGGER.warning("ONNX is not supported on this machine. Using PyTorch instead of ONNX.")
             use_onnx = False
 
         if use_onnx:
-            subfolder = "onnx" if onnx_model == model else ""
-            if onnx_model is not None:
-                model = onnx_model
-
             optimum_onnxruntime = lazy_load_dep(
                 "optimum.onnxruntime",
                 "optimum[onnxruntime]" if device().type != "cuda" else "optimum[onnxruntime-gpu]",
             )
             tf_tokenizer.model_input_names = ["input_ids", "attention_mask"]
             tf_model = optimum_onnxruntime.ORTModelForTokenClassification.from_pretrained(
-                model,
-                export=onnx_model is None,
-                subfolder=subfolder,
+                self.model.onnx_path,
+                export=False,
+                subfolder=self.model.onnx_subfolder,
                 provider="CUDAExecutionProvider"
                 if device().type == "cuda"
                 else "CPUExecutionProvider",
                 use_io_binding=True if device().type == "cuda" else False,
-                **model_kwargs,
+                **self.model.kwargs,
             )
-            LOGGER.debug("Initialized NER ONNX model", model=model, device=device())
+            LOGGER.debug("Initialized NER ONNX model", model=self.model, device=device())
         else:
             tf_model = transformers.AutoModelForTokenClassification.from_pretrained(
-                model, **model_kwargs
+                self.model.path, subfolder=self.model.subfolder, **self.model.kwargs
             )
-            LOGGER.debug("Initialized NER model", model=model, device=device())
+            LOGGER.debug("Initialized NER model", model=self.model, device=device())
 
         self.pipeline = transformers.pipeline(
             "ner",
@@ -189,7 +174,7 @@ def _load_pipeline(
             aggregation_strategy=self.aggregation_mechanism,
             framework="pt",
             ignore_labels=self.ignore_labels,
-            **pipeline_kwargs,
+            **self.model.pipeline_kwargs,
         )
 
         self.is_loaded = True
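The ONNX path resolution is markedly simpler after this hunk: rather than inferring `subfolder` and `export` from whether an `ONNX_MODEL_PATH` happened to equal the regular path, the recognizer now trusts the `Model` object outright and never exports on the fly (`export=False`). A standalone sketch of the two loading branches, using the repo names from `ner_mapping.py` above (`Model` field defaults are assumed):

```python
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import AutoModelForTokenClassification

from llm_guard.model import Model

model = Model(
    path="dslim/bert-base-NER",
    onnx_path="dslim/bert-base-NER",
    onnx_subfolder="onnx",
)

# ONNX branch: load pre-exported weights from the dedicated path/subfolder.
ort_model = ORTModelForTokenClassification.from_pretrained(
    model.onnx_path,
    export=False,
    subfolder=model.onnx_subfolder,
    provider="CPUExecutionProvider",
)

# PyTorch branch: a plain transformers load from the regular path/subfolder.
pt_model = AutoModelForTokenClassification.from_pretrained(
    model.path, subfolder=model.subfolder
)
```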
15 changes: 7 additions & 8 deletions llm_guard/input_scanners/ban_competitors.py
@@ -1,15 +1,16 @@
-from typing import Dict, Optional, Sequence
+from typing import Optional, Sequence
 
 from presidio_anonymizer.core.text_replace_builder import TextReplaceBuilder
 
+from llm_guard.model import Model
 from llm_guard.util import device, get_logger, lazy_load_dep
 
 from .base import Scanner
 
 LOGGER = get_logger()
 
-MODEL_BASE = "tomaarsen/span-marker-bert-base-orgs"
-MODEL_SMALL = "tomaarsen/span-marker-bert-small-orgs"
+MODEL_BASE = Model("tomaarsen/span-marker-bert-base-orgs")
+MODEL_SMALL = Model("tomaarsen/span-marker-bert-small-orgs")
 
 
 class BanCompetitors(Scanner):
@@ -25,8 +26,7 @@ def __init__(
         *,
         threshold: float = 0.5,
         redact: bool = True,
-        model: Optional[str] = None,
-        model_kwargs: Optional[Dict] = None,
+        model: Optional[Model] = None,
     ):
         """
         Initialize BanCompetitors object.
@@ -35,8 +35,7 @@ def __init__(
             competitors (Sequence[str]): List of competitors to detect.
             threshold (float, optional): Threshold to determine if a competitor is present in the prompt. Default is 0.5.
             redact (bool, optional): Whether to redact the competitor name. Default is True.
-            model (str, optional): Model to use for named-entity recognition. Default is BASE model.
-            model_kwargs (Dict, optional): Keyword arguments passed to the model.
+            model (Model, optional): Model to use for named-entity recognition. Default is BASE model.
 
         Raises:
             ValueError: If no competitors are provided.
@@ -50,7 +49,7 @@ def __init__(
 
         span_marker = lazy_load_dep("span_marker", "span-marker")
         self._ner_pipeline = span_marker.SpanMarkerModel.from_pretrained(
-            model, labels=["ORG"], **(model_kwargs or {})
+            model.path, labels=["ORG"], **model.kwargs
         )
 
         if device().type == "cuda":
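With the constants now wrapping `Model`, selecting the smaller span-marker variant is a one-liner. A usage sketch; the `scan` return tuple follows the scanner convention documented by the project, not this diff:

```python
from llm_guard.input_scanners import BanCompetitors
from llm_guard.input_scanners.ban_competitors import MODEL_SMALL

scanner = BanCompetitors(
    competitors=["Acme Corp", "Globex"],
    threshold=0.5,
    redact=True,
    model=MODEL_SMALL,  # defaults to MODEL_BASE when omitted
)
sanitized_prompt, is_valid, risk_score = scanner.scan("How does Acme Corp price its product?")
```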