Merge pull request #28 from DataFog/feature/v3.2.1

v3.2.1
DataFog · May 28, 2024 · 4311508 · 4311508
2 parents ffe5f8d + cf36dd6
commit 4311508
Show file tree

Hide file tree

Showing 778 changed files with 274,838 additions and 107 deletions.
diff --git a/.env b/.env
@@ -0,0 +1,4 @@
+APPLICATIONINSIGHTS_CONNECTION_STRING="InstrumentationKey=00bea047-1836-46fa-9652-26d43d63a3fa;IngestionEndpoint=https://eastus-8.in.applicationinsights.azure.com/;LiveEndpoint=https://eastus.livediagnostics.monitor.azure.com/;ApplicationId=959cc365-c112-491b-af69-b196d0943ca4"
+
+
+# Note: this is an Azure-specific implementation of the OpenTelemetry distro. For more information, see https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/monitor/azure-monitor-opentelemetry
diff --git a/.github/workflows/dev-cicd-tests.yml b/.github/workflows/dev-cicd-tests.yml
@@ -21,7 +21,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11"]
+        python-version: ["3.10"]
     steps:
       - uses: actions/setup-python@v4
         with:
@@ -33,10 +33,16 @@ jobs:
           tox -- --cov datafog --cov-report xml --cov-report term
       - name: Submit to codecov
         uses: codecov/codecov-action@v3
-        if: ${{ matrix.python-version == '3.11' }}
+        if: ${{ matrix.python-version == '3.10' }}
 
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4.0.1
         env:
           token: ${{ secrets.CODECOV_TOKEN }}
           slug: DataFog/datafog-python
+
+      - name: Run script
+        env:
+          APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.APPLICATIONINSIGHTS_CONNECTION_STRING }}
+        run: |
+          python datafog/telemetry/open_telemetry.py
diff --git a/.github/workflows/feature-ci-cd.yml b/.github/workflows/feature-ci-cd.yml
@@ -0,0 +1,48 @@
+name: feature-cicd-tests
+
+# Runs on every push to, and every pull request targeting, a feature/* branch.
+on:
+  push:
+    branches:
+      - feature/*
+  pull_request:
+    branches:
+      - feature/*
+
+jobs:
+  # Runs the repository's pre-commit hooks.
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+      - name: Run pre-commit
+        uses: pre-commit/action@v3.0.1
+
+  # Runs the tox test suite with coverage and uploads the report to Codecov.
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: actions/checkout@v3
+      - name: Test with tox
+        run: |
+          pip install tox
+          tox -- --cov datafog --cov-report xml --cov-report term
+      - name: Submit to codecov
+        uses: codecov/codecov-action@v3
+        if: ${{ matrix.python-version == '3.10' }}
+
+      # `token` and `slug` are inputs of the Codecov action, so they are passed
+      # under `with:`; values placed under `env:` are not read as inputs.
+      - name: Upload coverage reports to Codecov
+        uses: codecov/codecov-action@v4.0.1
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          slug: DataFog/datafog-python
+
+      # NOTE(review): this step uses the runner's Python, not the tox
+      # environment; confirm the script's dependencies are installed there.
+      - name: Run script
+        env:
+          APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.APPLICATIONINSIGHTS_CONNECTION_STRING }}
+        run: |
+          python datafog/telemetry/open_telemetry.py
diff --git a/.gitignore b/.gitignore
@@ -13,6 +13,7 @@ build/
 /src/datafog/pii_tools/__pycache__/
 /tests/__pycache__/
 /tests/scratch.py
+/tests/.datafog_env/
 node_modules/
 datafog_debug.log
 sotu_2023.txt
@@ -23,4 +24,5 @@ datafog-python/datafog/processing/text_processing/__pycache__/
 datafog-python/datafog/services/__pycache__/
 datafog-python/datafog/processing/__pycache__/
 datafog-python/datafog/__pycache__/
+.env
 
diff --git a/README.md b/README.md
@@ -39,7 +39,6 @@ DataFog can be installed via pip:
 pip install datafog
 ```
 
-
 ## Getting Started
 
 The DataFog library provides functionality for text and image processing, including PII (Personally Identifiable Information) annotation and OCR (Optical Character Recognition) capabilities.
@@ -54,8 +53,7 @@ pip install datafog
 
 ### Usage
 
-The [Getting Started notebook](/datafog-python/examples/getting_started.ipynb)  features a standalone Colab notebook. 
-
+The [Getting Started notebook](/datafog-python/examples/getting_started.ipynb) features a standalone Colab notebook.
 
 #### Text PII Annotation
 
@@ -75,7 +73,9 @@ with open(os.path.join(folder_path, text_files[0]), 'r') as file:
 
 display(Markdown(clinical_note))
 ```
+
 which looks like this:
+
 ```
 
 **Date:** April 10, 2024
@@ -124,7 +124,6 @@ loop = asyncio.get_event_loop()
 results = loop.run_until_complete(run_text_pipeline_demo())
 ```
 
-
 Note: The DataFog library uses asynchronous programming, so make sure to use the `async`/`await` syntax when calling the appropriate methods.
 
 #### OCR PII Annotation
@@ -146,7 +145,7 @@ loop.run_until_complete(run_ocr_pipeline_demo())
 
 ```
 
-You'll notice that we use async functions liberally throughout the SDK - given the nature of the functions we're providing and the extension of DataFog into API/other formats, this allows the functions to be more easily adapted for those uses. 
+You'll notice that we use async functions liberally throughout the SDK - given the nature of the functions we're providing and the extension of DataFog into API/other formats, this allows the functions to be more easily adapted for those uses.
 
 ## Contributing
 

diff --git a/datafog/__about__.py b/datafog/__about__.py
@@ -1 +1 @@
-__version__ = "3.2.0"
+__version__ = "3.2.1"
diff --git a/datafog/__init__.py b/datafog/__init__.py
@@ -1,3 +1,4 @@
+from .__about__ import __version__
 from .config import OperationType
 from .main import DataFog, OCRPIIAnnotator, TextPIIAnnotator
 from .processing.image_processing.donut_processor import DonutProcessor
@@ -7,8 +8,7 @@
 from .services.image_service import ImageService
 from .services.spark_service import SparkService
 from .services.text_service import TextService
-
-# from .__about__ import __version__
+from .telemetry import Telemetry
 
 __all__ = [
     "DonutProcessor",
@@ -22,5 +22,6 @@
     "SpacyPIIAnnotator",
     "ImageDownloader",
     "PytesseractProcessor",
-    # "__version__",
+    "__version__",
+    "Telemetry",
 ]
diff --git a/datafog/config.py b/datafog/config.py
@@ -1,7 +1,5 @@
 from enum import Enum
 
-from pydantic import BaseModel
-
 
 class OperationType(str, Enum):
     ANNOTATE_PII = "annotate_pii"

diff --git a/datafog/main.py b/datafog/main.py
@@ -13,7 +13,34 @@
 from .services.image_service import ImageService
 from .services.spark_service import SparkService
 from .services.text_service import TextService
-
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+import os
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from azure.monitor.opentelemetry.exporter import AzureMonitorTraceExporter
+from azure.monitor.opentelemetry import configure_azure_monitor
+import platform
+from opentelemetry.trace import Status, StatusCode
+
+# The Application Insights connection string is read from the environment (optionally populated from a .env file); there is no hardcoded fallback.
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from logging import INFO, getLogger
+from dotenv import load_dotenv
+import logging
+
+load_dotenv()  # Load environment variables from .env file
+APPLICATIONINSIGHTS_CONNECTION_STRING = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
+configure_azure_monitor(connection_string=APPLICATIONINSIGHTS_CONNECTION_STRING)
+trace.set_tracer_provider(TracerProvider())
+exporter = AzureMonitorTraceExporter(connection_string=APPLICATIONINSIGHTS_CONNECTION_STRING)
+trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(exporter))
+logger = logging.getLogger("datafog_logger")
+logger.setLevel(INFO)
 
 class DataFog:
     def __init__(
@@ -27,23 +54,52 @@ def __init__(
         self.text_service = text_service
         self.spark_service: SparkService = spark_service
         self.operations: List[OperationType] = operations
+        self.logger = logging.getLogger(__name__)
+        self.logger.info("Initializing DataFog class with the following services and operations:")
+        self.logger.info(f"Image Service: {type(image_service)}")
+        self.logger.info(f"Text Service: {type(text_service)}")
+        self.logger.info(f"Spark Service: {type(spark_service) if spark_service else 'None'}")
+        self.logger.info(f"Operations: {operations}")
+        self.tracer = trace.get_tracer(__name__)
 
     async def run_ocr_pipeline(self, image_urls: List[str]):
         """Run the OCR pipeline asynchronously."""
-        extracted_text = await self.image_service.ocr_extract(image_urls)
-        if OperationType.ANNOTATE_PII in self.operations:
-            annotated_text = await self.text_service.batch_annotate_texts(
-                extracted_text
-            )
-            return annotated_text
-        return extracted_text
-
+        # The whole pipeline runs inside a "run_ocr_pipeline" span so a failure
+        # can be marked on that span before it propagates.
+        with self.tracer.start_as_current_span("run_ocr_pipeline") as span:
+            try:
+                extracted_text = await self.image_service.ocr_extract(image_urls)
+                self.logger.info(f"OCR extraction completed for {len(image_urls)} images.")
+                self.logger.debug(f"Total length of extracted text: {sum(len(text) for text in extracted_text)}")
+
+                # Annotate the extracted text only when PII annotation was requested;
+                # otherwise the raw OCR output is returned.
+                if OperationType.ANNOTATE_PII in self.operations:
+                    annotated_text = await self.text_service.batch_annotate_texts(extracted_text)
+                    self.logger.info(f"Text annotation completed with {len(annotated_text)} annotations.")
+                    return annotated_text
+
+                return extracted_text
+            except Exception as e:
+                # Log the failure, flag the span as errored, then re-raise so
+                # the caller still receives the original exception.
+                self.logger.error(f"Error in run_ocr_pipeline: {str(e)}")
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                raise
     async def run_text_pipeline(self, texts: List[str]):
         """Run the text pipeline asynchronously."""
-        if OperationType.ANNOTATE_PII in self.operations:
-            annotated_text = await self.text_service.batch_annotate_texts(texts)
-            return annotated_text
-        return texts
+        # The whole pipeline runs inside a "run_text_pipeline" span so a failure
+        # can be marked on that span before it propagates.
+        with self.tracer.start_as_current_span("run_text_pipeline") as span:
+            try:
+                self.logger.info(f"Starting text pipeline with {len(texts)} texts.")
+                # Annotate only when PII annotation was requested; otherwise the
+                # input texts are returned unchanged.
+                if OperationType.ANNOTATE_PII in self.operations:
+                    annotated_text = await self.text_service.batch_annotate_texts(texts)
+                    self.logger.info(f"Text annotation completed with {len(annotated_text)} annotations.")
+                    return annotated_text
+
+                self.logger.info("No annotation operation found; returning original texts.")
+                return texts
+            except Exception as e:
+                # Log the failure, flag the span as errored, then re-raise so
+                # the caller still receives the original exception.
+                self.logger.error(f"Error in run_text_pipeline: {str(e)}")
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                raise
+    def _add_attributes(self, span, attributes: dict):
+        """Add multiple attributes to a span."""
+        for key, value in attributes.items():
+            span.set_attribute(key, value)
 
 
 class OCRPIIAnnotator:

diff --git a/datafog/processing/__init__.py b/datafog/processing/__init__.py
@@ -1,5 +1,7 @@
 from .image_processing.donut_processor import DonutProcessor
 from .image_processing.image_downloader import ImageDownloader
 from .image_processing.pytesseract_processor import PytesseractProcessor
-from .spark_processing.pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+
+# from .spark_processing.pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+from .spark_processing import get_pyspark_udfs
 from .text_processing.spacy_pii_annotator import SpacyPIIAnnotator
diff --git a/datafog/processing/spark_processing/__init__.py b/datafog/processing/spark_processing/__init__.py
@@ -1 +1,7 @@
-from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+# from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+
+
+def get_pyspark_udfs():
+    from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
+
+    return broadcast_pii_annotator_udf, pii_annotator
diff --git a/datafog/processing/spark_processing/pyspark_udfs.py b/datafog/processing/spark_processing/pyspark_udfs.py
@@ -1,8 +1,9 @@
+import importlib
+import subprocess
+import sys
+
 import requests
 import spacy
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import udf
-from pyspark.sql.types import ArrayType, StringType, StructField, StructType
 
 PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
 MAXIMAL_STRING_SIZE = 1000000
@@ -14,6 +15,11 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
     Returns:
         list[list[str]]: Values as arrays in order defined in the PII_ANNOTATION_LABELS.
     """
+    ensure_installed("pyspark")
+    from pyspark.sql import SparkSession
+    from pyspark.sql.functions import udf
+    from pyspark.sql.types import ArrayType, StringType, StructField, StructType
+
     if text:
         if len(text) > MAXIMAL_STRING_SIZE:
             # Cut the strings for required sizes
@@ -35,13 +41,27 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
 
 
 def broadcast_pii_annotator_udf(
-    spark_session: SparkSession, spacy_model: str = "en_spacy_pii_fast"
+    spark_session=None, spacy_model: str = "en_spacy_pii_fast"
 ):
     """Broadcast PII annotator across Spark cluster and create UDF"""
+    ensure_installed("pyspark")
+    from pyspark.sql import SparkSession
+    from pyspark.sql.functions import udf
+    from pyspark.sql.types import ArrayType, StringType, StructField, StructType
+
+    if not spark_session:
+        spark_session = SparkSession.builder.getOrCreate()
     broadcasted_nlp = spark_session.sparkContext.broadcast(spacy.load(spacy_model))
 
     pii_annotation_udf = udf(
         lambda text: pii_annotator(text, broadcasted_nlp),
         ArrayType(ArrayType(StringType())),
     )
     return pii_annotation_udf
+
+
+def ensure_installed(self, package_name):
+    try:
+        importlib.import_module(package_name)
+    except ImportError:
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])