Skip to content

Commit

Permalink
* more up-to-date models for the NER
Browse files Browse the repository at this point in the history
  • Loading branch information
asofter committed Feb 19, 2024
1 parent 66f1b74 commit 4b9ff83
Show file tree
Hide file tree
Showing 5 changed files with 257 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] - 0.3.10

### Added
-
- **Anonymize**: New NER models fine-tuned on the AI4Privacy dataset: [Isotonic/distilbert_finetuned_ai4privacy_v2](https://huggingface.co/Isotonic/distilbert_finetuned_ai4privacy_v2) and [Isotonic/deberta-v3-base_finetuned_ai4privacy_v2](https://huggingface.co/Isotonic/deberta-v3-base_finetuned_ai4privacy_v2).

### Fixed
-
Expand Down
4 changes: 3 additions & 1 deletion docs/input_scanners/anonymize.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ Some model providers may train their models on your requests, which can be a pri
- **Tailored recognizers**:
- Balance speed vs. accuracy of the recognizers.
- **Top Pick: [dslim/bert-base-NER](https://huggingface.co/dslim/bert-base-NER)**
- Alternatives: [dslim/bert-large-NER](https://huggingface.co/dslim/bert-large-NER).
- Alternative with more parameters: [dslim/bert-large-NER](https://huggingface.co/dslim/bert-large-NER).
- Chinese recognizer: [gyr66/bert-base-chinese-finetuned-ner](https://huggingface.co/gyr66/bert-base-chinese-finetuned-ner).
- Models fine-tuned on the AI4Privacy dataset, covering more PII types: [Isotonic/distilbert_finetuned_ai4privacy_v2](https://huggingface.co/Isotonic/distilbert_finetuned_ai4privacy_v2) and [Isotonic/deberta-v3-base_finetuned_ai4privacy_v2](https://huggingface.co/Isotonic/deberta-v3-base_finetuned_ai4privacy_v2).
- **Support for multiple languages**: The scanner can detect PII in English and Chinese.

!!! info
Expand Down
1 change: 0 additions & 1 deletion llm_guard/input_scanners/anonymize_helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,4 @@
"get_fake_value",
"ALL_RECOGNIZER_CONF",
"BERT_BASE_NER_CONF",
"BERT_LARGE_CASED_FINETUNED_COLL03_ENGLISH_CONF",
]
168 changes: 168 additions & 0 deletions llm_guard/input_scanners/anonymize_helpers/ner_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,176 @@
"ID_ENTITY_NAME": "ID",
}

# Recognizer configuration for the Isotonic/distilbert_finetuned_ai4privacy_v2
# NER model. The same Hugging Face repository is used for both the default and
# the ONNX model path.
DISTILBERT_AI4PRIVACY_v2_CONF = {
    "PRESIDIO_SUPPORTED_ENTITIES": [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        "EMAIL_ADDRESS",
        "PHONE_NUMBER",
        "CREDIT_CARD",
        "CRYPTO",
        "DATE_TIME",
        "IBAN_CODE",
        "IP_ADDRESS",
        "URL",
    ],
    "DEFAULT_MODEL_PATH": "Isotonic/distilbert_finetuned_ai4privacy_v2",
    "ONNX_MODEL_PATH": "Isotonic/distilbert_finetuned_ai4privacy_v2",
    "LABELS_TO_IGNORE": ["O", "CARDINAL"],
    # The "{}" placeholder is left for the caller to fill in.
    "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/distilbert_finetuned_ai4privacy_v2 NER model",
    "SUB_WORD_AGGREGATION": "simple",
    # Fine-grained labels collapsed onto the coarser entity names listed in
    # PRESIDIO_SUPPORTED_ENTITIES ("MISC" maps to the ignored "O" label).
    # Labels sharing one target are grouped with dict.fromkeys; insertion
    # order is kept stable.
    "DATASET_TO_PRESIDIO_MAPPING": {
        "MISC": "O",
        **dict.fromkeys(
            (
                "STREET",
                "CITY",
                "ZIPCODE",
                "BUILDINGNUMBER",
                "NEARBYGPSCOORDINATES",
                "SECONDARYADDRESS",
                "STATE",
                "COUNTY",
            ),
            "LOCATION",
        ),
        "EMAIL": "EMAIL_ADDRESS",
        "COMPANYNAME": "ORGANIZATION",
        "PHONENUMBER": "PHONE_NUMBER",
        **dict.fromkeys(
            ("FIRSTNAME", "LASTNAME", "MIDDLENAME", "USERNAME"),
            "PERSON",
        ),
        "CREDITCARDNUMBER": "CREDIT_CARD",
        **dict.fromkeys(
            ("ETHEREUMADDRESS", "BITCOINADDRESS", "LITECOINADDRESS"),
            "CRYPTO",
        ),
        **dict.fromkeys(("DATE", "TIME"), "DATE_TIME"),
        "IBAN": "IBAN_CODE",
        **dict.fromkeys(("IPV4", "IPV6", "IP"), "IP_ADDRESS"),
        "URL": "URL",
    },
    # Same content as the dataset mapping, but deliberately a separate dict
    # object so mutating one never affects the other.
    "MODEL_TO_PRESIDIO_MAPPING": {
        "MISC": "O",
        **dict.fromkeys(
            (
                "STREET",
                "CITY",
                "ZIPCODE",
                "BUILDINGNUMBER",
                "NEARBYGPSCOORDINATES",
                "SECONDARYADDRESS",
                "STATE",
                "COUNTY",
            ),
            "LOCATION",
        ),
        "EMAIL": "EMAIL_ADDRESS",
        "COMPANYNAME": "ORGANIZATION",
        "PHONENUMBER": "PHONE_NUMBER",
        **dict.fromkeys(
            ("FIRSTNAME", "LASTNAME", "MIDDLENAME", "USERNAME"),
            "PERSON",
        ),
        "CREDITCARDNUMBER": "CREDIT_CARD",
        **dict.fromkeys(
            ("ETHEREUMADDRESS", "BITCOINADDRESS", "LITECOINADDRESS"),
            "CRYPTO",
        ),
        **dict.fromkeys(("DATE", "TIME"), "DATE_TIME"),
        "IBAN": "IBAN_CODE",
        **dict.fromkeys(("IPV4", "IPV6", "IP"), "IP_ADDRESS"),
        "URL": "URL",
    },
    "CHUNK_OVERLAP_SIZE": 40,
    "CHUNK_SIZE": 600,
    "ID_SCORE_MULTIPLIER": 0.4,
    "ID_ENTITY_NAME": "ID",
}

# Recognizer configuration for the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2
# NER model. The same Hugging Face repository is used for both the default and
# the ONNX model path.
DEBERTA_AI4PRIVACY_v2_CONF = {
    "PRESIDIO_SUPPORTED_ENTITIES": [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        "EMAIL_ADDRESS",
        "PHONE_NUMBER",
        "CREDIT_CARD",
        "CRYPTO",
        "DATE_TIME",
        "IBAN_CODE",
        "IP_ADDRESS",
        "URL",
    ],
    "DEFAULT_MODEL_PATH": "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
    "ONNX_MODEL_PATH": "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2",
    "LABELS_TO_IGNORE": ["O", "CARDINAL"],
    # The "{}" placeholder is left for the caller to fill in.
    "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2 NER model",
    "SUB_WORD_AGGREGATION": "simple",
    # Fine-grained labels collapsed onto the coarser entity names listed in
    # PRESIDIO_SUPPORTED_ENTITIES ("MISC" maps to the ignored "O" label).
    # Labels sharing one target are grouped with dict.fromkeys; insertion
    # order is kept stable.
    "DATASET_TO_PRESIDIO_MAPPING": {
        "MISC": "O",
        **dict.fromkeys(
            (
                "STREET",
                "CITY",
                "ZIPCODE",
                "BUILDINGNUMBER",
                "NEARBYGPSCOORDINATES",
                "SECONDARYADDRESS",
                "STATE",
                "COUNTY",
            ),
            "LOCATION",
        ),
        "EMAIL": "EMAIL_ADDRESS",
        "COMPANYNAME": "ORGANIZATION",
        "PHONENUMBER": "PHONE_NUMBER",
        **dict.fromkeys(
            ("FIRSTNAME", "LASTNAME", "MIDDLENAME", "USERNAME"),
            "PERSON",
        ),
        "CREDITCARDNUMBER": "CREDIT_CARD",
        **dict.fromkeys(
            ("ETHEREUMADDRESS", "BITCOINADDRESS", "LITECOINADDRESS"),
            "CRYPTO",
        ),
        **dict.fromkeys(("DATE", "TIME"), "DATE_TIME"),
        "IBAN": "IBAN_CODE",
        **dict.fromkeys(("IPV4", "IPV6", "IP"), "IP_ADDRESS"),
        "URL": "URL",
    },
    # Same content as the dataset mapping, but deliberately a separate dict
    # object so mutating one never affects the other.
    "MODEL_TO_PRESIDIO_MAPPING": {
        "MISC": "O",
        **dict.fromkeys(
            (
                "STREET",
                "CITY",
                "ZIPCODE",
                "BUILDINGNUMBER",
                "NEARBYGPSCOORDINATES",
                "SECONDARYADDRESS",
                "STATE",
                "COUNTY",
            ),
            "LOCATION",
        ),
        "EMAIL": "EMAIL_ADDRESS",
        "COMPANYNAME": "ORGANIZATION",
        "PHONENUMBER": "PHONE_NUMBER",
        **dict.fromkeys(
            ("FIRSTNAME", "LASTNAME", "MIDDLENAME", "USERNAME"),
            "PERSON",
        ),
        "CREDITCARDNUMBER": "CREDIT_CARD",
        **dict.fromkeys(
            ("ETHEREUMADDRESS", "BITCOINADDRESS", "LITECOINADDRESS"),
            "CRYPTO",
        ),
        **dict.fromkeys(("DATE", "TIME"), "DATE_TIME"),
        "IBAN": "IBAN_CODE",
        **dict.fromkeys(("IPV4", "IPV6", "IP"), "IP_ADDRESS"),
        "URL": "URL",
    },
    "CHUNK_OVERLAP_SIZE": 40,
    "CHUNK_SIZE": 600,
    "ID_SCORE_MULTIPLIER": 0.4,
    "ID_ENTITY_NAME": "ID",
}

# Recognizer configuration dicts gathered into a single list.
# NOTE(review): presumably the complete set of configurations callers may
# select from — confirm against the code that consumes this list.
ALL_RECOGNIZER_CONF = [
BERT_BASE_NER_CONF,
BERT_LARGE_NER_CONF,
BERT_ZH_NER_CONF,
# The two AI4Privacy v2 model configurations.
DISTILBERT_AI4PRIVACY_v2_CONF,
DEBERTA_AI4PRIVACY_v2_CONF,
]
87 changes: 85 additions & 2 deletions tests/input_scanners/test_anonymize.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
default_entity_types,
sensitive_patterns_path,
)
from llm_guard.input_scanners.anonymize_helpers import BERT_LARGE_NER_CONF, BERT_ZH_NER_CONF
from llm_guard.input_scanners.anonymize_helpers import (
BERT_LARGE_NER_CONF,
BERT_ZH_NER_CONF,
DEBERTA_AI4PRIVACY_v2_CONF,
DISTILBERT_AI4PRIVACY_v2_CONF,
)
from llm_guard.util import read_json_file
from llm_guard.vault import Vault

Expand Down Expand Up @@ -95,6 +100,84 @@
Social Security Number (SSN): [REDACTED_US_SSN_RE_1].
Credit Card: [REDACTED_CREDIT_CARD_1], Exp: 09/25, CVV: 123.
Create a summarized version of his resume.""",
False,
1.0,
),
(
{"recognizer_conf": DEBERTA_AI4PRIVACY_v2_CONF},
"""Given the personal profile:
Full Name: Johnathan Edward Doe.
Date of Birth: April 12, 1990.
Address: 123 Elm Street, Springfield, IL, 62701.
Email: john.doe@protectai[DOT]com.
Phone Number: (123) 456-7890.
Educational Background:
Springfield High School, Graduated in 2008;
Springfield University, B.Sc. Computer Science, Graduated in 2012.
Employment:
ABC Tech Solutions, Software Engineer, 2012-2015;
XYZ Global Systems, Senior Software Developer, 2015-2021.
Social Security Number (SSN): 123-45-6789.
Credit Card: 4111111111111111, Exp: 09/25, CVV: 123.
Create a summarized version of his resume.""",
"""Given the personal profile:
Full Name:[REDACTED_PERSON_1][REDACTED_PERSON_3][REDACTED_PERSON_2].
Date of Birth: April 12, 1990.
Address: 123 Elm Street, Springfield, IL, 62701.
Email:[REDACTED_EMAIL_ADDRESS_1].
Phone Number:[REDACTED_PHONE_NUMBER_1].
Educational Background:
Springfield High School, Graduated in 2008;
Springfield University, B.Sc. Computer Science, Graduated in 2012.
Employment:
ABC Tech Solutions, Software Engineer, 2012-2015;
XYZ Global Systems, Senior Software Developer, 2015-2021.
Social Security Number (SSN): [REDACTED_US_SSN_RE_1].
Credit Card: [REDACTED_CREDIT_CARD_1], Exp: 09/25, CVV: 123.
Create a summarized version of his resume.""",
False,
1.0,
),
(
{"recognizer_conf": DISTILBERT_AI4PRIVACY_v2_CONF},
"""Given the personal profile:
Full Name: Johnathan Edward Doe.
Date of Birth: April 12, 1990.
Address: 123 Elm Street, Springfield, IL, 62701.
Email: john.doe@protectai[DOT]com.
Phone Number: (123) 456-7890.
Educational Background:
Springfield High School, Graduated in 2008;
Springfield University, B.Sc. Computer Science, Graduated in 2012.
Employment:
ABC Tech Solutions, Software Engineer, 2012-2015;
XYZ Global Systems, Senior Software Developer, 2015-2021.
Social Security Number (SSN): 123-45-6789.
Credit Card: 4111111111111111, Exp: 09/25, CVV: 123.
Create a summarized version of his resume.""",
"""Given the personal profile:
Full Name: [REDACTED_PERSON_3] [REDACTED_PERSON_2] [REDACTED_PERSON_1].
Date of Birth: April 12, 1990.
Address: 123 Elm Street, Springfield, IL, 62701.
Email: [REDACTED_EMAIL_ADDRESS_RE_1].
Phone Number: [REDACTED_PHONE_NUMBER_1].
Educational Background:
Springfield High School, Graduated in 2008;
Springfield University, B.Sc. Computer Science, Graduated in 2012.
Employment:
ABC Tech Solutions, Software Engineer, 2012-2015;
XYZ Global Systems, Senior Software Developer, 2015-2021.
Social Security Number (SSN): [REDACTED_US_SSN_RE_1].
Credit Card: [REDACTED_CREDIT_CARD_1], Exp: 09/25, CVV: 123.
Create a summarized version of his resume.""",
False,
1.0,
Expand Down Expand Up @@ -177,7 +260,7 @@ def test_scan_zh(settings, prompt, expected_prompt, expected_valid, expected_sco

def test_scan_unknow():
    """Check that Anonymize rejects a language that is not supported.

    Constructing the scanner with an unsupported language is expected to raise
    LLMGuardValidationError carrying the list of allowed languages.
    """
    try:
        Anonymize(Vault(), language="unknown")
    except LLMGuardValidationError as e:
        assert str(e) == f"Language must be in the list of allowed: {ALL_SUPPORTED_LANGUAGES}"
    else:
        # Without this branch the test would pass vacuously whenever the
        # constructor stops raising for unsupported languages.
        raise AssertionError("Anonymize accepted an unsupported language without raising")

Expand Down

0 comments on commit 4b9ff83

Please sign in to comment.