Bugfixes and Testing #11

Merged: 8 commits merged on Oct 29, 2024
15 changes: 10 additions & 5 deletions code/backend/aispeechanalysis/function.py
@@ -4,7 +4,11 @@
import azure.functions as func
import azurefunctions.extensions.bindings.blob as blob
from aispeechanalysis.llm import LlmClient
from aispeechanalysis.utils import get_timestamps_for_sections, get_transcript
from aispeechanalysis.utils import (
get_locale,
get_timestamps_for_sections,
get_transcript,
)
from shared.config import settings
from shared.utils import load_blob, upload_string

@@ -37,9 +41,10 @@ async def ai_speech_analysis(client: blob.BlobClient) -> func.HttpResponse:
result_load_blob_json = json.loads(result_load_blob)
logging.debug(f"Loaded blob content as json: '{result_load_blob_json}'")

# Get transcript
logging.info("Get transcript from Azure AI Speech content.")
result_get_transcript = get_transcript(ai_speech_blob_json=result_load_blob_json)
# Get transcript and locale
logging.info("Get transcript and locale from Azure AI Speech content.")
result_get_transcript = get_transcript(result_stt=result_load_blob_json)
result_get_locale = get_locale(result_stt=result_load_blob_json)

# Use Open AI to generate scenes
logging.info("Use Open AI to generate scenes.")
@@ -53,7 +58,7 @@ async def ai_speech_analysis(client: blob.BlobClient) -> func.HttpResponse:
result_invoke_llm_chain = llm_client.invoke_llm_chain(
news_content=result_get_transcript,
news_show_details="This is a news show covering different news content.",
language=settings.MAIN_CONTENT_LANGUAGE,
language=result_get_locale,
)

# Save llm result
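Reviewer note: a minimal sketch of the updated orchestration in this function, using only the names visible in the diff; the Azure Functions trigger, blob loading and error handling are omitted, and the wrapper function below is illustrative.

# Sketch only: the detected locale now drives the LLM output language instead of
# the static settings.MAIN_CONTENT_LANGUAGE value.
from aispeechanalysis.llm import LlmClient
from aispeechanalysis.utils import get_locale, get_transcript


def run_analysis(result_load_blob_json: dict, llm_client: LlmClient):
    transcript = get_transcript(result_stt=result_load_blob_json)
    locale = get_locale(result_stt=result_load_blob_json)  # e.g. "es-ES", or "Unknown"
    return llm_client.invoke_llm_chain(
        news_content=transcript,
        news_show_details="This is a news show covering different news content.",
        language=locale,
    )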
83 changes: 71 additions & 12 deletions code/backend/aispeechanalysis/utils.py
@@ -1,22 +1,63 @@
import copy
import logging
import string
from datetime import datetime, timedelta
from typing import Any, List, Tuple


def get_transcript(ai_speech_blob_json: Any) -> str:
def remove_punctuation(text: str) -> str:
"""Removes punctuation from text.

text (str): Specifies the text that should be altered.
RETURNS (str): The altered text.
"""
return text.translate(str.maketrans("", "", f"{string.punctuation}¿¡"))


def get_normalized_text(text: str) -> str:
"""Normalizes text by removing punctuation and lowering all characters.

text (str): Specifies the text that should be altered.
RETURNS (str): The altered text.
"""
# Remove punctuation from the text
text_removed_punctuation = remove_punctuation(text=text)

# Lower text
return text_removed_punctuation.lower()
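Reviewer note: a quick sketch of what the two new helpers produce; the sample string is illustrative.

# Sketch only: demonstrates the helpers added above.
text = "¿Qué pasa, Madrid? ¡Buenos días!"
print(remove_punctuation(text=text))   # Qué pasa Madrid Buenos días
print(get_normalized_text(text=text))  # qué pasa madrid buenos días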


def get_transcript(result_stt: Any) -> str:
"""Creates and returns a transcript based on the content from Azure AI Speech STT batch transcription.

ai_speech_blob_json (Any): JSON content from Azure AI Speech STT batch transcription.
result_stt (Any): Specifies the JSON content from Azure AI Speech STT batch transcription.
RETURNS (str): The transcript extracted from the JSON file.
"""
ai_speech_blob_json_combined_recognized_phrases = ai_speech_blob_json.get(
result_stt_combined_recognized_phrases = result_stt.get(
"combinedRecognizedPhrases", [{"display": None}]
)
return ai_speech_blob_json_combined_recognized_phrases[0].get("display")
return result_stt_combined_recognized_phrases[0].get("display")


def get_word_details(result_stt: Any) -> List[Any]:
def get_locale(result_stt: Any) -> str:
"""Returns the locale from the content from Azure AI Speech STT batch transcription.

result_stt (Any): Specifies the JSON content from Azure AI Speech STT batch transcription.
RETURNS (str): The locale extracted from the JSON file. Returns 'Unknown' if the property cannot be found in the JSON transcript.
"""
result_stt_recognized_phrases = result_stt.get(
"recognizedPhrases", [{"locale": "Unknown"}]
)[0]
return result_stt_recognized_phrases.get("locale", "Unknown")
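Reviewer note: a hedged sketch of how get_transcript and get_locale behave on a trimmed-down payload; the sample dictionary below only contains the fields these helpers read and is not a full Azure AI Speech result.

# Sketch only: a minimal stand-in for the batch transcription output.
sample_result_stt = {
    "combinedRecognizedPhrases": [{"display": "Buenos días. Estas son las noticias."}],
    "recognizedPhrases": [{"locale": "es-ES"}],
}
print(get_transcript(result_stt=sample_result_stt))  # Buenos días. Estas son las noticias.
print(get_locale(result_stt=sample_result_stt))      # es-ES
print(get_locale(result_stt={}))                     # Unknown (fallback)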


def get_word_details(result_stt: Any, normalize_text: bool) -> List[Any]:
"""Returns all word details from a speech to text batch analysis process.

result_stt (Any): Specifies the JSON content from Azure AI Speech STT batch transcription.
normalize_text (bool): Specifies whether the text should be normalized.
RETURNS (List[Any]): The list of word detail entries, with display text normalized if requested.
"""
word_details = []
recognized_phrases = result_stt.get("recognizedPhrases", [])

@@ -26,8 +67,15 @@ def get_word_details(result_stt: Any) -> List[Any]:
"displayWords", []
)

# Append word details
word_details.extend(recognized_phrase_best_display_words)
if normalize_text:
for display_word in recognized_phrase_best_display_words:
display_word["displayText"] = get_normalized_text(
text=display_word["displayText"]
)
word_details.append(display_word)
else:
# Append word details
word_details.extend(recognized_phrase_best_display_words)

return word_details
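Reviewer note: a hedged sketch of the normalization path through get_word_details; it assumes the word entries sit under nBest[0]["displayWords"] as in the Azure batch transcription schema, since the exact lookup is in the collapsed part of this diff, and the offsets shown are illustrative.

# Sketch only: assumes recognizedPhrases[i]["nBest"][0]["displayWords"] holds the words.
sample_result_stt = {
    "recognizedPhrases": [
        {
            "nBest": [
                {
                    "displayWords": [
                        {"displayText": "¿Qué", "offset": "PT0.5S", "duration": "PT0.3S"},
                        {"displayText": "pasa?", "offset": "PT0.8S", "duration": "PT0.4S"},
                    ]
                }
            ]
        }
    ]
}
words = get_word_details(result_stt=sample_result_stt, normalize_text=True)
print([word["displayText"] for word in words])  # ['qué', 'pasa']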

@@ -41,8 +89,11 @@ def offset_and_duration_to_timedelta(timedelta_str: str) -> Tuple[str, timedelta
# Initialize
format_options = [
"PT%S.%fS",
"PT%SS",
"PT%MM%S.%fS",
"PT%MM%SS",
"PT%HH%MM%S.%fS",
"PT%HH%MM%SS",
]
td = None
format = None
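Reviewer note: the expanded format_options list now also covers offsets without fractional seconds and offsets with hour components. Below is a hedged, standalone sketch of the parsing approach; the helper name and the timedelta construction are illustrative, since the matching logic sits in the collapsed part of this function.

# Sketch only: maps offsets such as "PT3S", "PT1M2.5S" or "PT1H2M3S" to a timedelta.
from datetime import datetime, timedelta


def parse_offset(value: str) -> timedelta:
    formats = [
        "PT%S.%fS",
        "PT%SS",
        "PT%MM%S.%fS",
        "PT%MM%SS",
        "PT%HH%MM%S.%fS",
        "PT%HH%MM%SS",
    ]
    for fmt in formats:
        try:
            parsed = datetime.strptime(value, fmt)
        except ValueError:
            continue
        return timedelta(
            hours=parsed.hour,
            minutes=parsed.minute,
            seconds=parsed.second,
            microseconds=parsed.microsecond,
        )
    raise ValueError(f"Unsupported offset format: '{value}'")


print(parse_offset("PT1M2.5S"))  # 0:01:02.500000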
@@ -72,8 +123,14 @@ def offset_and_duration_to_timedelta(timedelta_str: str) -> Tuple[str, timedelta


def get_timestamps_for_sections(result_stt: Any, result_llm: Any) -> Any:
"""Calculates and adds timestamps to the llm result.

result_stt (Any): Specifies the JSON content from Azure AI Speech STT batch transcription.
result_llm (Any): Specifies the JSON content from Azure Open AI analysis.
RETURNS (Any): The JSON content from Azure Open AI analysis with added timestamps for start and end.
"""
# Get word details from stt result
word_details = get_word_details(result_stt=result_stt)
word_details = get_word_details(result_stt=result_stt, normalize_text=True)

# Prepare result
result = copy.deepcopy(result_llm.get("sections", []))
@@ -83,7 +140,9 @@
item_llm_current = "start"

# Get llm item words
item_llm_words = str(item_llm.get(item_llm_current, "")).split(sep=" ")
item_llm_words = str(
get_normalized_text(item_llm.get(item_llm_current, ""))
).split(sep=" ")

for index_word, item_word in enumerate(word_details):
# Get display text of current word item
@@ -127,9 +186,9 @@ def get_timestamps_for_sections(result_stt: Any, result_llm: Any) -> Any:
item_llm_current = "end"

# Get new llm item words
item_llm_words = str(item_llm.get(item_llm_current, "")).split(
sep=" "
)
item_llm_words = str(
get_normalized_text(item_llm.get(item_llm_current, ""))
).split(sep=" ")
else:
break

13 changes: 11 additions & 2 deletions code/backend/shared/utils.py
@@ -1,3 +1,4 @@
import asyncio
import hashlib
import logging
import os
@@ -165,11 +166,19 @@ async def copy_blob(
await lease.acquire(lease_duration=-1)

# Copy blob
await sink_blob_client.start_copy_from_url(
_ = await sink_blob_client.start_copy_from_url(
source_url=source_blob_client.url,
requires_sync=True,
requires_sync=False,
)

# Wait for copy to finish
status = (await sink_blob_client.get_blob_properties()).copy.status
logging.info(f"Status of copy activity: {status}")
while status not in ["success", "failed", "aborted"]:  # keep polling while the copy is still "pending"
await asyncio.sleep(1)
status = (await sink_blob_client.get_blob_properties()).copy.status
logging.info(f"Status of copy activity: {status}")

# Delete source blob
if delete_source:
await source_blob_client.delete_blob(
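Reviewer note on the copy change: with requires_sync=False the copy request returns immediately, so the new loop polls the copy status until it reaches a terminal state ("success", "failed" or "aborted"). Below is a hedged, standalone sketch of the same polling pattern with an added timeout guard; the helper name and timeout value are illustrative and not part of this PR.

# Sketch only: polls an asynchronous blob copy until it finishes or a timeout elapses.
import asyncio
import logging

from azure.storage.blob.aio import BlobClient


async def wait_for_copy(sink_blob_client: BlobClient, timeout_seconds: int = 300) -> str:
    elapsed = 0
    status = (await sink_blob_client.get_blob_properties()).copy.status
    while status not in ["success", "failed", "aborted"]:
        if elapsed >= timeout_seconds:
            raise TimeoutError(f"Blob copy did not finish within {timeout_seconds}s")
        await asyncio.sleep(1)
        elapsed += 1
        status = (await sink_blob_client.get_blob_properties()).copy.status
        logging.info(f"Status of copy activity: '{status}'")
    return status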
1 change: 1 addition & 0 deletions code/backend/videoupload/function.py
@@ -103,6 +103,7 @@ async def upload_video(client: blob.BlobClient):
result_create_transcription_job = await speech_client.create_transcription_job(
guid=videoupload_guid,
blob_url=result_upload_blob,
locale=settings.MAIN_CONTENT_LANGUAGE,
)

# Check AI Speech STT batch job
7 changes: 5 additions & 2 deletions code/backend/videoupload/speech.py
@@ -28,11 +28,14 @@ def __init__(
self.azure_ai_speech_api_version = azure_ai_speech_api_version
self.managed_identity_client_id = managed_identity_client_id

async def create_transcription_job(self, guid: str, blob_url: str) -> str:
async def create_transcription_job(
self, guid: str, blob_url: str, locale: str
) -> str:
"""Creates a batch transcription job for a blob file.

guid (str): Specifies the guid used as a name for the processing job.
blob_url (str): Specifies the blob url pointing to an audio file that will be transcribed.
locale (str): Specifies the locale of the audio file (e.g. 'es-ES', 'de-DE').
RETURNS (str): Returns the url of the transcription job.
"""
# Define url
@@ -46,7 +49,7 @@ async def create_transcription_job(self, guid: str, blob_url: str) -> str:
"displayName": f"{guid}",
"description": "STT for video file",
"contentUrls": [blob_url],
"locale": "es-ES",
"locale": locale,
"properties": {
"languageIdentification": {
"mode": "Single",
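Reviewer note: a hedged usage sketch of the updated method; it assumes an initialized speech client instance from this module and uses placeholder values for the guid and blob url.

# Sketch only: the locale is now passed through from configuration
# (settings.MAIN_CONTENT_LANGUAGE in videoupload/function.py) instead of the
# previously hard-coded "es-ES".
async def submit_transcription(speech_client, guid: str, blob_url: str) -> str:
    return await speech_client.create_transcription_job(
        guid=guid,
        blob_url=blob_url,
        locale="es-ES",  # or any supported locale, e.g. "de-DE"
    )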
1 change: 1 addition & 0 deletions code/infra/aiservice.tf
@@ -18,6 +18,7 @@ module "azure_ai_generic" {
trimsuffix(trimprefix(module.storage_account.storage_account_primary_blob_endpoint, "https://"), "/"),
trimsuffix(trimprefix(module.azure_open_ai.cognitive_account_endpoint, "https://"), "/"),
]
cognitive_account_local_auth_enabled = true
cognitive_account_deployments = {}
diagnostics_configurations = local.diagnostics_configurations
subnet_id = azapi_resource.subnet_private_endpoints.id
2 changes: 1 addition & 1 deletion code/infra/storage.tf
@@ -27,7 +27,7 @@ module "storage_account" {
"/subscriptions/${data.azurerm_client_config.current.subscription_id}/providers/Microsoft.Security/datascanners/storageDataScanner",
"/subscriptions/${data.azurerm_client_config.current.subscription_id}/resourceGroups/*/providers/Microsoft.CognitiveServices/accounts/*",
]
storage_public_network_access_enabled = false
storage_public_network_access_enabled = true
storage_nfsv3_enabled = false
storage_sftp_enabled = false
storage_shared_access_key_enabled = false # Required to be set to 'true' when creating a Windows host
2 changes: 1 addition & 1 deletion code/infra/terraform.tf
@@ -4,7 +4,7 @@ terraform {
required_providers {
azurerm = {
source = "hashicorp/azurerm"
version = "4.5.0"
version = "4.7.0"
}
azapi = {
source = "azure/azapi"
1 change: 1 addition & 0 deletions config/PerfectThymeTech/vars.tfvars
@@ -11,6 +11,7 @@ tags = {
# Service variables
function_app_settings = {}
function_health_check_path = "/api/v1/heartbeat"
main_content_language = "es-ES"

# Logging variables
log_analytics_workspace_id = "/subscriptions/e82c5267-9dc4-4f45-ac13-abdd5e130d27/resourceGroups/ptt-dev-logging-rg/providers/Microsoft.OperationalInsights/workspaces/ptt-dev-log001"
8 changes: 3 additions & 5 deletions docs/SystemPrompt.txt
@@ -1,5 +1,4 @@
You are a world class assistant for identifying news sections. You will be provided with a transcript from a TV news show. Your task is to extract thematic news sections from the transcript that split the content into cohesive news topics. You must define a title and tags for each news section based on the content of each section and translate them into the language of the transcript.
---
Solve this step by step:
1. Process the provided transcript to understand the overall context.
2. Split the provided news content into news sections. The content of each section must cover a common topic or headline. Follow the grounding rules for new sections mentioned below. Assign an ID to every news section.
@@ -12,12 +11,11 @@ Solve this step by step:
---
Grounding rules for news sections:
- The first sentence of the transcript must be part of the first news section. The last sentence of the transcript must be part of the last news section.
- Each news section must start and end with a full sentence and must consist of 3 or more sentences.
- Every sentence of the transcript must be part of one news section. No sentence can be part of multiple news sections.
- If you are unsure about one sentence, then assign it to the previous section.
- Each news section must start and end with a full sentence and must consist of 2 or more sentences. It is ok if some news sections consist of 20 or more sentences and other sections only consist of 2 or more sentences.
- Each sentence of the transcript must be part of exactly one news section. No sentence can be part of multiple news sections.
- If you are unsure about the assignment of one sentence, then assign it to the previous section.
- The last sentence of one news section must be followed by the first sentence of the next news section.
- The news sections are not allowed to overlap and must be mutually exclusive. This means that between the first sentence and the last sentence of one news section, there can be no first sentence or last sentence of another news section in the transcript.
- It is ok if some news sections consist of 20 or more sentences and other sections only consist of 3 or more sentences.
- The transcript often starts with an introduction. The introduction summarizes some news sections of the transcript which reappear later in the transcript. Identify the introduction and give it the title "News Show Summary". Add the following tags to this news section: intro, overview, news summary.
- The transcript often contains a weather forecast section. Identify this news section and give it the title "Weather Forecast". Add the following tags to this news section: weather, weather forecast.
---