Skip to content

Commit

Permalink
fixed isort/black issues for now
Browse files Browse the repository at this point in the history
  • Loading branch information
Sid Mohan authored and Sid Mohan committed Feb 21, 2024
1 parent d737f32 commit 3dea5e1
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 65 deletions.
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
repos:
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort

- repo: https://github.com/psf/black
rev: 22.12.0
rev: 24.2.0
hooks:
- id: black
language_version: python3

- repo: https://github.com/pycqa/flake8
rev: 6.0.0
rev: 7.0.0
hooks:
- id: flake8

- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.0.0-alpha.4
rev: v4.0.0-alpha.8
hooks:
- id: prettier
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,7 @@ requests-mock = "^1.11.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.isort]
profile = "black"
line_length = 88
1 change: 1 addition & 0 deletions src/datafog/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# datafog-python/src/datafog/__init__.py
from posthog import Posthog

from .__about__ import __version__
Expand Down
131 changes: 70 additions & 61 deletions src/datafog/pii_tools/presidio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# datafog-python/src/datafog/pii_tools/presidio/__init__.py
from typing import List, Optional

import polars as pl
Expand All @@ -6,37 +7,72 @@
from presidio_analyzer import (
AnalyzerEngine,
BatchAnalyzerEngine,
RecognizerRegistry,
RecognizerResult,
Pattern,
PatternRecognizer,
)
from presidio_analyzer.nlp_engine import NlpEngine
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine

from .nlp_engines import create_nlp_engine_with_spacy

# Module-level analytics client shared by the whole module (captured in
# initialize_engine and __call__). The literal is a Posthog *project* write
# key, not a secret API key, but NOTE(review): consider making analytics
# opt-in / configurable via environment instead of hard-coding it here.
posthog = Posthog(
    "phc_v6vMICyVCGoYZ2s2xUWB4qoTPoMNFGv2u1q0KnBpaIb", host="https://app.posthog.com"
)


def create_ad_hoc_deny_list_recognizer(
deny_list: Optional[List[str]] = None,
) -> Optional[PatternRecognizer]:
if not deny_list:
return None

deny_list_recognizer = PatternRecognizer(
supported_entity="GENERIC_PII", deny_list=deny_list
)
return deny_list_recognizer


def create_ad_hoc_regex_recognizer(
regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
) -> Optional[PatternRecognizer]:
if not regex:
return None
pattern = Pattern(name="Regex pattern", regex=regex, score=score)
regex_recognizer = PatternRecognizer(
supported_entity=entity_type, patterns=[pattern], context=context
)
return regex_recognizer


class PresidioEngine:
def __init__(
self,
registry: RecognizerRegistry = None,
nlp_engine: NlpEngine = "spacy",
log_decision_process: bool = False,
default_score_threshold: float = 0.5,
supported_languages: List[str] = ["en"],
):

def __init__(self, config=None):
self.config = self.default_config() if config is None else config
self.initialize_engine()
# Simplify Posthog initialization (consider making analytics optional)

@staticmethod # TODO need to make this a class method
def default_config():
"""Return default configuration."""
return {
"nlp_engine": "spacy",
"language": "en",
# More default settings
}

    def initialize_engine(self):
        """Construct the Presidio analyzer/anonymizer engines (single and batch).

        Side effect: emits a Posthog "presidioengine_init" analytics event.
        NOTE(review): self.config is not consulted here yet — TODO wire the
        configured nlp_engine/language into AnalyzerEngine.
        """
        # Default-configured analyzer; the batch variant wraps the same instance.
        self.analyzer = AnalyzerEngine()
        self.batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)
        self.anonymizer = AnonymizerEngine()
        self.batch_anonymizer = BatchAnonymizerEngine()
        posthog.capture("device_id", "presidioengine_init")

def process_input(self, input_data: str) -> pl.DataFrame:
def process_input(
self, input_data: str
) -> pl.DataFrame: # TODO refactor because it is hacky
"""Process input data and return a DataFrame."""
if input_data.startswith(("http://", "https://")):
if input_data.startswith(("http://", "https://")) and not input_data.endswith(
".txt"
):
response = requests.get(input_data)
df = pl.read_csv(response.content)
elif "\n" in input_data or "," in input_data or input_data.endswith(".csv"):
Expand All @@ -45,47 +81,6 @@ def process_input(self, input_data: str) -> pl.DataFrame:
df = pl.DataFrame(input_data)
return df

def __call__(self, input_data: str, language: str = "en") -> pl.DataFrame:
df = self.process_input(input_data)
posthog.capture(
"device_id", "presidio_input_processed", properties={"num_rows": len(df)}
)

df_dict = {col: df[col].to_list() for col in df.columns}
analyzer_results = self.batch_analyzer.analyze_dict(df_dict, language=language)
anonymizer_results = self.batch_anonymizer.anonymize_dict(analyzer_results)
scrubbed_df = pl.DataFrame(anonymizer_results)

return scrubbed_df

@staticmethod
def analyzer_engine(
model_family: str,
model_path: str,
ta_key: Optional[str] = None,
ta_endpoint: Optional[str] = None,
) -> AnalyzerEngine:
"""Create and return an AnalyzerEngine instance."""
nlp_engine, registry = create_nlp_engine_with_spacy(
model_family, model_path, ta_key, ta_endpoint
)
return AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)

@staticmethod
def anonymizer_engine() -> AnonymizerEngine:
"""Return AnonymizerEngine instance."""
return AnonymizerEngine()

@staticmethod
def get_supported_entities(
model_family: str, model_path: str, ta_key: str, ta_endpoint: str
) -> List[str]:
"""Return supported entities from the Analyzer Engine."""
analyzer = PresidioEngine.analyzer_engine(
model_family, model_path, ta_key, ta_endpoint
)
return analyzer.get_supported_entities() + ["GENERIC_PII"]

@staticmethod
def analyze(
model_family: str, model_path: str, ta_key: str, ta_endpoint: str, **kwargs
Expand All @@ -96,13 +91,27 @@ def analyze(
)
return analyzer.analyze(**kwargs)

@staticmethod
def anonymize(
self,
text: str,
operator: str,
analyze_results: List[RecognizerResult],
**operator_config
analyze_results: dict,
operator_config: dict,
):
"""Anonymize identified input using Presidio Anonymizer."""
anonymizer = PresidioEngine.anonymizer_engine()
return anonymizer.anonymize(text, operator, analyze_results, **operator_config)
return self.anonymizer.anonymize(
text=text,
operator=operator,
analyzer_results=analyze_results,
**operator_config
)

@staticmethod
def get_supported_entities(
model_family: str, model_path: str, ta_key: str, ta_endpoint: str
) -> List[str]:
"""Return supported entities from the Analyzer Engine."""
analyzer = PresidioEngine.analyzer_engine(
model_family, model_path, ta_key, ta_endpoint
)
return analyzer.get_supported_entities() + ["GENERIC_PII"]
2 changes: 2 additions & 0 deletions src/datafog/pii_tools/presidio/nlp_engines.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# datafog-python/src/datafog/pii_tools/presidio/nlp_engines.py

from typing import Tuple

import spacy
Expand Down

0 comments on commit 3dea5e1

Please sign in to comment.