Skip to content

Commit

Permalink
cleaned up scan functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
Sid Mohan authored and Sid Mohan committed Feb 23, 2024
1 parent 3dea5e1 commit f5dfaa4
Show file tree
Hide file tree
Showing 21 changed files with 1,914 additions and 721 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ build/
/src/datafog/__pycache__/
/src/datafog/pii_tools/__pycache__/
/tests/__pycache__/
/tests/scratch.py
node_modules

34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,34 @@ DataFog can be installed via pip:
pip install datafog # python client
```

## Usage

We're going to build up functionality starting with support for the Microsoft Presidio library. If you have any custom requests that would be of benefit to the community, please let us know!

```
import requests
from datafog import PresidioEngine as presidio
# Example: Detecting PII in a String
pii_detected = presidio.scan("My name is John Doe and my email is johndoe@genai.com")
print("PII Detected:", pii_detected)
# Example: Detecting PII in a File
sample_filepath = "tests/files/input_files/sample.csv"
with open(sample_filepath, "r") as f:
original_value = f.read()
pii_detected = presidio.scan(original_value)
print("PII Detected in File:", pii_detected)
# Example: Detecting PII in a URL
sample_url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
response = requests.get(sample_url)
original_value = response.text
pii_detected = presidio.scan(original_value)
print("PII Detected in URL Content:", pii_detected)
```

## Dev Notes

- Clone repo
Expand All @@ -51,10 +79,16 @@ pip install datafog # python client
To run the datafog unit tests, check out this repository and do

```
tox
```

### License

This software is published under the [MIT
license](https://en.wikipedia.org/wiki/MIT_License).

```
```
239 changes: 90 additions & 149 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,10 @@ python = "^3.10"
presidio-analyzer = "^2.2.353"
presidio-anonymizer = "^2.2.353"
polars = "^0.20.10"
posthog = "^3.4.2"
numpy = "^1.26.4"
pytest = "^8.0.1"
requests-mock = "^1.11.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Expand Down
10 changes: 2 additions & 8 deletions src/datafog/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
# datafog-python/src/datafog/__init__.py
from posthog import Posthog

from .__about__ import __version__
from .pii_tools import presidio

posthog = Posthog(
"phc_v6vMICyVCGoYZ2s2xUWB4qoTPoMNFGv2u1q0KnBpaIb", host="https://app.posthog.com"
)
from .pii_tools import PresidioEngine

__all__ = [
"__version__",
"presidio",
"PresidioEngine",
]
66 changes: 66 additions & 0 deletions src/datafog/pii_tools/PresidioEngine/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider

from .analyzer import CustomSpacyRecognizer


# Helper methods
def analyzer_engine():
    """Build and return a Presidio ``AnalyzerEngine`` wired to the custom recognizer.

    The engine uses a spaCy NLP pipeline (``en_spacy_pii_fast``) plus the
    rule-based predefined recognizers, with the stock ``SpacyRecognizer``
    swapped out for :class:`CustomSpacyRecognizer` so that our own label
    mappings are applied.

    :return: a configured :class:`presidio_analyzer.AnalyzerEngine`
    """
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_spacy_pii_fast"}],
    }
    nlp_engine = NlpEngineProvider(
        nlp_configuration=nlp_configuration
    ).create_engine()

    recognizer_registry = RecognizerRegistry()
    # Rule-based recognizers (emails, phone numbers, ...) plus our NER model.
    recognizer_registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    recognizer_registry.add_recognizer(CustomSpacyRecognizer())
    # Drop the stock spaCy recognizer so the custom label mappings take over.
    recognizer_registry.remove_recognizer("SpacyRecognizer")

    return AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=recognizer_registry,
        supported_languages=["en"],
    )


def annotate(text, analysis_results):
    """Split *text* into display tokens around the detected PII entities.

    :param text: the original analyzed string.
    :param analysis_results: iterable of Presidio result objects exposing
        ``start``, ``end`` and ``entity_type`` attributes.
    :return: a list alternating plain-string segments and
        ``(entity_text, entity_type)`` tuples, in document order.
        If no entities were detected, the whole text is returned as a
        single one-element list (previously an empty list was returned,
        silently dropping the text).
    """
    # Sort by start index so segments are emitted in document order.
    results = sorted(analysis_results, key=lambda r: r.start)
    if not results:
        # No PII found: the entire text is one plain token.
        return [text]

    tokens = []
    for i, res in enumerate(results):
        if i == 0:
            # Text before the first entity.
            tokens.append(text[: res.start])

        # The entity itself, paired with its type.
        tokens.append((text[res.start : res.end], res.entity_type))

        if i != len(results) - 1:
            # Text between this entity and the next one.
            tokens.append(text[res.end : results[i + 1].start])
        else:
            # Trailing text after the last entity.
            tokens.append(text[res.end :])
    return tokens


# Lazily-built module-level singleton: constructing an AnalyzerEngine loads
# the spaCy model and rebuilds the recognizer registry, which is expensive.
_ANALYZER = None


def scan(text, **kwargs):
    """Analyze *text* for PII and return Presidio recognizer results.

    :param text: the string to scan.
    :param kwargs: forwarded to ``AnalyzerEngine.analyze``; ``language``
        (default ``"en"``), ``score_threshold`` (default ``0.35``) and
        ``nlp_artifacts`` (default ``None``) are filled in when absent.
    :return: list of ``RecognizerResult`` objects from Presidio.
    """
    global _ANALYZER

    # Defaults for the analyze() call; any caller-supplied value wins.
    kwargs.setdefault("language", "en")
    kwargs.setdefault("score_threshold", 0.35)
    kwargs.setdefault("nlp_artifacts", None)

    # Build the analyzer once and reuse it across calls — the original
    # rebuilt the engine (and reloaded the NLP model) on every scan().
    if _ANALYZER is None:
        _ANALYZER = analyzer_engine()

    return _ANALYZER.analyze(text, **kwargs)
126 changes: 126 additions & 0 deletions src/datafog/pii_tools/PresidioEngine/analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import logging
from typing import List, Optional, Set, Tuple

from presidio_analyzer import AnalysisExplanation, LocalRecognizer, RecognizerResult

logger = logging.getLogger("presidio-module")


class CustomSpacyRecognizer(LocalRecognizer):
    """Presidio recognizer that maps spaCy NER labels to Presidio entities.

    Wraps a spaCy PII model (see ``MODEL_LANGUAGES``) and converts its NER
    spans into ``RecognizerResult`` objects with a fixed confidence score
    (``ner_strength``).
    """

    # Presidio entity types this recognizer can emit.
    ENTITIES = [
        "LOCATION",
        "PERSON",
        "NRP",
        "ORGANIZATION",
        "DATE_TIME",
    ]

    DEFAULT_EXPLANATION = "Identified as {} by the PII Detection Model"

    # (presidio_entities, model_labels) pairs: a model label in the right-hand
    # set is reported as the Presidio entity in the left-hand set.
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"NRP"}, {"NORP", "NRP"}),
        ({"ORGANIZATION"}, {"ORG"}),
        ({"DATE_TIME"}, {"DATE_TIME"}),
    ]

    # Language code -> spaCy model used for that language.
    MODEL_LANGUAGES = {
        "en": "beki/en_spacy_pii_fast",
    }

    # Model label -> Presidio entity name.
    # NOTE(review): "NROP" looks like a typo for spaCy's "NORP" label (cf.
    # CHECK_LABEL_GROUPS above) — confirm before relying on this mapping.
    # This attribute is not referenced elsewhere in this module.
    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "NROP": "NRP",
        "DATE_TIME": "DATE_TIME",
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
        context: Optional[List[str]] = None,
        ner_strength: float = 0.85,
    ):
        """
        :param supported_language: language code this recognizer handles.
        :param supported_entities: Presidio entities to emit; defaults to ENTITIES.
        :param check_label_groups: (entity set, label set) pairs overriding
            CHECK_LABEL_GROUPS.
        :param context: context words (accepted for API compatibility; not used here).
        :param ner_strength: confidence score assigned to every NER match.
        """
        self.ner_strength = ner_strength
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )
        supported_entities = supported_entities if supported_entities else self.ENTITIES
        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
        )

    def load(self) -> None:
        """Load the model, not used. Model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.
        :return: List of the supported entities.
        """
        return self.supported_entities

    def build_spacy_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.
        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: AnalysisExplanation carrying the score and text
        """
        explanation = AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    def analyze(self, text, entities, nlp_artifacts=None):  # noqa D102
        """Convert the spaCy NER spans in *nlp_artifacts* into recognizer results.

        :param text: the analyzed text (unused directly; offsets come from spans).
        :param entities: Presidio entity types requested by the caller.
        :param nlp_artifacts: Presidio NlpArtifacts holding the spaCy doc entities;
            if missing, no results can be produced.
        :return: list of RecognizerResult, one per matching NER span.
        """
        results = []
        if not nlp_artifacts:
            logger.warning("No NLP artifacts provided for analysis")
            return results

        ner_entities = nlp_artifacts.entities

        for entity in entities:
            if entity not in self.supported_entities:
                continue
            for ent in ner_entities:
                # Skip spans whose model label does not map to this entity.
                if not self.__check_label(entity, ent.label_, self.check_label_groups):
                    continue
                textual_explanation = self.DEFAULT_EXPLANATION.format(ent.label_)
                explanation = self.build_spacy_explanation(
                    self.ner_strength, textual_explanation
                )

                spacy_result = RecognizerResult(
                    entity_type=entity,
                    start=ent.start_char,
                    end=ent.end_char,
                    score=self.ner_strength,
                    analysis_explanation=explanation,
                    recognition_metadata={
                        RecognizerResult.RECOGNIZER_NAME_KEY: self.name
                    },
                )

                results.append(spacy_result)

        return results

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
    ) -> bool:
        """True if *entity* and model *label* co-occur in any label group."""
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )
Loading

0 comments on commit f5dfaa4

Please sign in to comment.