Merge pull request #24 from baloise/main

FIX for new release build
baloise · Nov 1, 2024 · 89b0d3f · 89b0d3f
2 parents 794b7da + 306ba12
commit 89b0d3f
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 90 deletions.
diff --git a/src/static/styles/style.css b/src/static/styles/style.css
@@ -49,6 +49,15 @@ footer {
     gap: 5px; /* Optional: Add space between buttons */
 }
 
+.flex-container-row {
+    display: flex; /* Use flexbox to align items */
+    justify-content: center; /* Center the buttons horizontally */
+    align-items: center; /* Center the buttons vertically */
+    flex-direction: row;
+    gap: 5px; /* Optional: Add space between buttons */
+}
+
+
 .flex-item {
     flex: 1;
     padding: 2px;

diff --git a/src/templates/html/demask.html b/src/templates/html/demask.html
@@ -20,6 +20,17 @@ <h1>YoYo MaskЯ</h1>
 
     <main>
         <div class="flex-container">
+
+            <div class="flex-container-row">
+                <div class="flex-item">
+                    <h3>Load masked text</h3>
+                    <input type="file" id="fileInput" accept=".txt">
+                </div>
+                <div class="flex-item">
+                    <h3>Load entities</h3>
+                    <input type="file" id="fileInputE" accept=".json">
+                </div>
+            </div>
 
             <div class="flex-item">
                 <h3>Input</h3>
@@ -35,13 +46,6 @@ <h3>Input</h3>
                 </form>
             </div>
 
-            <div class="flex-item">
-                <h3>Load from text file</h3>
-            </div>
-            <div class="flex-item">
-                <input type="file" id="fileInput" accept=".txt">
-            </div>
-
             <div class="flex-item">
                 <h3>Masked Text</h3>
             </div>
@@ -56,13 +60,6 @@ <h3>Entities</h3>
                 <textarea id="responseFieldEntities" rows="10" cols="50" readonly></textarea>
             </div>
 
-            <div class="flex-item">
-                <h3>Load entities from text file</h3>
-            </div>
-            <div class="flex-item">
-                <input type="file" id="fileInputE" accept=".json">
-            </div>
-
             <div class="flex-item">
                 <h3>DeMasked text</h3>
             </div>

diff --git a/src/utils/ano_llm.py b/src/utils/ano_llm.py
@@ -1,83 +1,12 @@
 import json
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_ollama.llms import OllamaLLM
-<<<<<<< HEAD
 import regex as re
-from .env import *
-=======
 from src.utils.env import *
->>>>>>> eee9d0537daf0aa72a684a851647338c668c2cc1
-
-TEMPLATE1 = """
-Context: Your task is to find entities in a text and tag them in a way that they can be anonymized in a reversible way.
-The entity types are persons, locations, organizations, email addresses, telephone numbers and social security numbers.
-Be very careful to not miss an entity. It is better to wrongly flag some expression as an entity than to miss an entity.
-If you are absolutely certain that two or more found entities are just different variations of the same entity, you may group them.
-If you are in doubt or not sure, be on the safe side and treat them asdifferent entities that can not be grouped together.
-Expect the majority of entities you find to not be variations of the same entity.
-
-Task instructions: 
-
-General:
-Find all names of people, organizations, locations, email addresses and return nothing but a json that can be directly parsed by Python with all those entities.
-Work through the following steps and stick as close as only possible to the structure provided in the example output.
-
-Step 1: Analyze the text provided after "Text to anonymize" carefully for names of people, organizations, locations, email addresses and telephone numbers.
-Do not include the result of this step in your output.
-
-Step 2: For names of people, organizations, and locations go through all those entities that you found and evaluate for each entity whether it is a new entity
-or just a variation of an already found entity. Use the following guidelines:
-a) If two expressions are not exactly identical, it is more likely that they are different entities.
-a) Lists of names, locations or organizations such as Peter, Paul and Mary are almost certainly different entities.
-b) Treat all Email addresses and phone numbers that you find as different entities.
-c) Social security numbers have a standardized format depending on the country. Use any country specific format you are aware of
-such as 756.XXXX.XXXX.XX (Switzerland), YYMMDD-XXX-XX (Belgium), 12-345678A 123 (Germany) or (YYYYMMDDXXXX) Luxembourg.
-Do not include the result of this step in your output.
-
-Step 3: You build a json style dictionary directly be parsed by Python into an object structure
-<<<<<<< HEAD
-with labels as keys and for each label a list of variations of an entity as values.
-Stick exactly to this structure and don't add anything around that json.
-a) Labels of persons are enumerated #NAME_1#, #NAME_2#, etc. always beginning with index 1.
-=======
-with labels as keys and lists of variations of entities as values. Stick exactly to this structure and don't add anything around that json.
-a) Labels of persons are enumerated #PERSON_1#, #PERSON_2#, etc. always beginning with index 1.
->>>>>>> eee9d0537daf0aa72a684a851647338c668c2cc1
-b) Labels of organisations as #ORG_1#, #ORG_2#, etc. always beginning index with 1.
-c) Labels of email addresses as #EMAIL_1#, #EMAIL_2#, etc. always beginning with index 1.
-d) Labels of phone numbers as #PHONE_1#, #PHONE_2#, etc. always beginning with index 1.
-e) Labels of social security nubers as #SOCSEC_1#, #SOCSEC_2#, etc. always beginning with index 1.
-Only use labels for those entities that you found. It is possible that there are no entities for a kind of labels.
-Do not include the result of this step in your output yet.
-
-Step 4: Double check your grouping.
-Are all entities you grouped together really just variations of the same entity or should they have been different entities?
-If necessary, split such groups of entities. Make sure that each sort of labels keeps a continuous numbering beginning with 1.
-This json style dictionary is the only output that you return.
-
-<<<<<<< HEAD
-Example input:
-"Tony Stark and Peter Parker walk through New York where Peter wants to show Tony the Broadway and the Apple Store.
-Tony's private email is tony@stark.com, his busienss email is ceo@stark.com, his private number is +41-76-1234567 and his business number is +41 58 1234567.
-He also has an AHV number, which is 756.1234.5678.90".
-
-Example output:
-{{"#NAME_1#": ["Tony Stark", "Tony"],"#NAME_2#": ["Peter Parker", "Peter"],"#LOC_1#": ["New York"],"#LOC_2#": ["Broadway"],"#ORG_1#": ["Apple"],"#EMAIL_1#": ["tony@stark.com"],"#EMAIL_2#": ["ceo@stark.com"],"#PHONE_1#": ["+41-76-1234567"],"#PHONE_2#": ["+41 58 1234567"],"#SOCSEC_1#": ["756.1234.5678.90"]}}
-=======
-Example:
-Input: "Tony Stark and Peter Parker walk through New York where Peter wants to show Tony the Broadway and the Apple Store.
-Tony's private email is tony@stark.com, his busienss email is ceo@stark.com, his private number is +41-76-1234567 and his business number is +41 58 1234567".
-Output: {{"#PERSON_1#":["Tony Stark", "Tony"], "#PERSON_2#": ["Peter Parker", "Peter"], "#LOC_1#": ["New York"], "#LOC_2#": ["Broadway"],
-"ORG_1": ["Apple"], "#EMAIL_1#": ["tony@stark.com"], "#EMAIL_2#": ["ceo@stark.com"], "#PHONE_1#": ["+41-76-1234567"], “PHONE_2#: ["+41 58 1234567"]}}
->>>>>>> eee9d0537daf0aa72a684a851647338c668c2cc1
-
-Text to anonymize: {text}
-"""
 
-<<<<<<< HEAD
 
 
-TEMPLATE2 = """
+TEMPLATE = """
 **Objective**:
 Identify and tag entities in the provided text for reversible anonymization, focusing on Switzerland, Germany, Belgium, and Luxembourg. The relevant categories include **persons, locations, organizations, email addresses, telephone numbers, social security numbers, dates, addresses, financial information**, and **credit card information**. Accurate tagging is critical; it is preferable to flag non-entities than to miss actual entities. If certain entities are variations of the same, you may group them; however, when in doubt, treat them as separate. Expect most entities to be unique.
 
@@ -150,7 +79,7 @@
 """
 
 
-def find_entities(text, model=OLLAMA_MODEL, temperature=0.1, template=TEMPLATE2,
+def find_entities(text, model=OLLAMA_MODEL, temperature=0.1, template=TEMPLATE,
                   base_url=OLLAMA_BASE_URL, unprettify=True, raw=False):
     """
     :param text:

diff --git a/yoyomaskr.ipynb b/yoyomaskr.ipynb
@@ -156,10 +156,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from src.utils.ano_llm import find_entities, TEMPLATE2\n",
+    "from src.utils.ano_llm import find_entities, TEMPLATE\n",
     "with open('./sample_text.txt') as f:\n",
     "    text = f.read()\n",
-    "result = find_entities(text, model='gemma2:latest', raw=False, template=TEMPLATE2)\n",
+    "result = find_entities(text, model='gemma2:latest', raw=False, template=TEMPLATE)\n",
     "result"
    ]
   },
@@ -168,7 +168,9 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "result.get('replace_dict').keys()"
+   ]
   }
  ],
  "metadata": {