Skip to content

Commit

Permalink
cleaned up scan functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
Sid Mohan authored and Sid Mohan committed Feb 23, 2024
1 parent 3dea5e1 commit f5dfaa4
Show file tree
Hide file tree
Showing 21 changed files with 1,914 additions and 721 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ build/
/src/datafog/__pycache__/
/src/datafog/pii_tools/__pycache__/
/tests/__pycache__/
/tests/scratch.py
node_modules

34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,34 @@ DataFog can be installed via pip:
pip install datafog # python client
```

## Usage

We're going to build up functionality starting with support for the Microsoft Presidio library. If you have any custom requests that would be of benefit to the community, please let us know!

```
import requests
from datafog import PresidioEngine as presidio
# Example: Detecting PII in a String
pii_detected = presidio.scan("My name is John Doe and my email is johndoe@genai.com")
print("PII Detected:", pii_detected)
# Example: Detecting PII in a File
sample_filepath = "tests/files/input_files/sample.csv"
with open(sample_filepath, "r") as f:
original_value = f.read()
pii_detected = presidio.scan(original_value)
print("PII Detected in File:", pii_detected)
# Example: Detecting PII in a URL
sample_url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
response = requests.get(sample_url)
original_value = response.text
pii_detected = presidio.scan(original_value)
print("PII Detected in URL Content:", pii_detected)
```

## Dev Notes

- Clone repo
Expand All @@ -51,10 +79,16 @@ pip install datafog # python client
To run the datafog unit tests, check out this repository and do

```
tox
```

### License

This software is published under the [MIT
license](https://en.wikipedia.org/wiki/MIT_License).

```
```
239 changes: 90 additions & 149 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,10 @@ python = "^3.10"
presidio-analyzer = "^2.2.353"
presidio-anonymizer = "^2.2.353"
polars = "^0.20.10"
posthog = "^3.4.2"
numpy = "^1.26.4"
pytest = "^8.0.1"
requests-mock = "^1.11.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Expand Down
10 changes: 2 additions & 8 deletions src/datafog/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
# datafog-python/src/datafog/__init__.py
from posthog import Posthog

from .__about__ import __version__
from .pii_tools import presidio

posthog = Posthog(
"phc_v6vMICyVCGoYZ2s2xUWB4qoTPoMNFGv2u1q0KnBpaIb", host="https://app.posthog.com"
)
from .pii_tools import PresidioEngine

__all__ = [
"__version__",
"presidio",
"PresidioEngine",
]
66 changes: 66 additions & 0 deletions src/datafog/pii_tools/PresidioEngine/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider

from .analyzer import CustomSpacyRecognizer


# Helper methods
def analyzer_engine():
    """Build and return a Presidio ``AnalyzerEngine`` wired to the custom recognizer.

    The engine uses a spaCy NLP pipeline (``en_spacy_pii_fast``) plus the
    rule-based predefined recognizers, with the stock ``SpacyRecognizer``
    swapped out for :class:`CustomSpacyRecognizer` so that our own label
    mappings are applied.

    :return: a configured :class:`presidio_analyzer.AnalyzerEngine`
    """
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_spacy_pii_fast"}],
    }
    nlp_engine = NlpEngineProvider(
        nlp_configuration=nlp_configuration
    ).create_engine()

    recognizer_registry = RecognizerRegistry()
    # Rule-based recognizers (emails, phone numbers, ...) plus our NER model.
    recognizer_registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    recognizer_registry.add_recognizer(CustomSpacyRecognizer())
    # Drop the stock spaCy recognizer so the custom label mappings take over.
    recognizer_registry.remove_recognizer("SpacyRecognizer")

    return AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=recognizer_registry,
        supported_languages=["en"],
    )


def annotate(text, analysis_results):
    """Split *text* into display tokens around the detected PII entities.

    :param text: the original analyzed string.
    :param analysis_results: iterable of Presidio result objects exposing
        ``start``, ``end`` and ``entity_type`` attributes.
    :return: a list alternating plain-string segments and
        ``(entity_text, entity_type)`` tuples, in document order.
        If no entities were detected, the whole text is returned as a
        single one-element list (previously an empty list was returned,
        silently dropping the text).
    """
    # Sort by start index so segments are emitted in document order.
    results = sorted(analysis_results, key=lambda r: r.start)
    if not results:
        # No PII found: the entire text is one plain token.
        return [text]

    tokens = []
    for i, res in enumerate(results):
        if i == 0:
            # Text before the first entity.
            tokens.append(text[: res.start])

        # The entity itself, paired with its type.
        tokens.append((text[res.start : res.end], res.entity_type))

        if i != len(results) - 1:
            # Text between this entity and the next one.
            tokens.append(text[res.end : results[i + 1].start])
        else:
            # Trailing text after the last entity.
            tokens.append(text[res.end :])
    return tokens


# Lazily-built module-level singleton: constructing an AnalyzerEngine loads
# the spaCy model and rebuilds the recognizer registry, which is expensive.
_ANALYZER = None


def scan(text, **kwargs):
    """Analyze *text* for PII and return Presidio recognizer results.

    :param text: the string to scan.
    :param kwargs: forwarded to ``AnalyzerEngine.analyze``; ``language``
        (default ``"en"``), ``score_threshold`` (default ``0.35``) and
        ``nlp_artifacts`` (default ``None``) are filled in when absent.
    :return: list of ``RecognizerResult`` objects from Presidio.
    """
    global _ANALYZER

    # Defaults for the analyze() call; any caller-supplied value wins.
    kwargs.setdefault("language", "en")
    kwargs.setdefault("score_threshold", 0.35)
    kwargs.setdefault("nlp_artifacts", None)

    # Build the analyzer once and reuse it across calls — the original
    # rebuilt the engine (and reloaded the NLP model) on every scan().
    if _ANALYZER is None:
        _ANALYZER = analyzer_engine()

    return _ANALYZER.analyze(text, **kwargs)
126 changes: 126 additions & 0 deletions src/datafog/pii_tools/PresidioEngine/analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import logging
from typing import List, Optional, Set, Tuple

from presidio_analyzer import AnalysisExplanation, LocalRecognizer, RecognizerResult

logger = logging.getLogger("presidio-module")


class CustomSpacyRecognizer(LocalRecognizer):
    """Presidio recognizer that maps spaCy NER labels to Presidio entities.

    Wraps a spaCy PII model (see ``MODEL_LANGUAGES``) and converts its NER
    spans into ``RecognizerResult`` objects with a fixed confidence score
    (``ner_strength``).
    """

    # Presidio entity types this recognizer can emit.
    ENTITIES = [
        "LOCATION",
        "PERSON",
        "NRP",
        "ORGANIZATION",
        "DATE_TIME",
    ]

    DEFAULT_EXPLANATION = "Identified as {} by the PII Detection Model"

    # (presidio_entities, model_labels) pairs: a model label in the right-hand
    # set is reported as the Presidio entity in the left-hand set.
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"NRP"}, {"NORP", "NRP"}),
        ({"ORGANIZATION"}, {"ORG"}),
        ({"DATE_TIME"}, {"DATE_TIME"}),
    ]

    # Language code -> spaCy model used for that language.
    MODEL_LANGUAGES = {
        "en": "beki/en_spacy_pii_fast",
    }

    # Model label -> Presidio entity name.
    # NOTE(review): "NROP" looks like a typo for spaCy's "NORP" label (cf.
    # CHECK_LABEL_GROUPS above) — confirm before relying on this mapping.
    # This attribute is not referenced elsewhere in this module.
    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "NROP": "NRP",
        "DATE_TIME": "DATE_TIME",
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
        context: Optional[List[str]] = None,
        ner_strength: float = 0.85,
    ):
        """
        :param supported_language: language code this recognizer handles.
        :param supported_entities: Presidio entities to emit; defaults to ENTITIES.
        :param check_label_groups: (entity set, label set) pairs overriding
            CHECK_LABEL_GROUPS.
        :param context: context words (accepted for API compatibility; not used here).
        :param ner_strength: confidence score assigned to every NER match.
        """
        self.ner_strength = ner_strength
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )
        supported_entities = supported_entities if supported_entities else self.ENTITIES
        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
        )

    def load(self) -> None:
        """Load the model, not used. Model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.
        :return: List of the supported entities.
        """
        return self.supported_entities

    def build_spacy_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.
        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: AnalysisExplanation carrying the score and text
        """
        explanation = AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    def analyze(self, text, entities, nlp_artifacts=None):  # noqa D102
        """Convert the spaCy NER spans in *nlp_artifacts* into recognizer results.

        :param text: the analyzed text (unused directly; offsets come from spans).
        :param entities: Presidio entity types requested by the caller.
        :param nlp_artifacts: Presidio NlpArtifacts holding the spaCy doc entities;
            if missing, no results can be produced.
        :return: list of RecognizerResult, one per matching NER span.
        """
        results = []
        if not nlp_artifacts:
            logger.warning("No NLP artifacts provided for analysis")
            return results

        ner_entities = nlp_artifacts.entities

        for entity in entities:
            if entity not in self.supported_entities:
                continue
            for ent in ner_entities:
                # Skip spans whose model label does not map to this entity.
                if not self.__check_label(entity, ent.label_, self.check_label_groups):
                    continue
                textual_explanation = self.DEFAULT_EXPLANATION.format(ent.label_)
                explanation = self.build_spacy_explanation(
                    self.ner_strength, textual_explanation
                )

                spacy_result = RecognizerResult(
                    entity_type=entity,
                    start=ent.start_char,
                    end=ent.end_char,
                    score=self.ner_strength,
                    analysis_explanation=explanation,
                    recognition_metadata={
                        RecognizerResult.RECOGNIZER_NAME_KEY: self.name
                    },
                )

                results.append(spacy_result)

        return results

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
    ) -> bool:
        """True if *entity* and model *label* co-occur in any label group."""
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )
Loading

0 comments on commit f5dfaa4

Please sign in to comment.