From 23b0be513a96c0529524bbec7073bc53ea316d4d Mon Sep 17 00:00:00 2001
From: Arjun-Zingg
Date: Fri, 13 Dec 2024 13:09:41 +0530
Subject: [PATCH] Add files via upload
---
examples/fabric/ExampleNotebook.ipynb | 1 +
1 file changed, 1 insertion(+)
create mode 100644 examples/fabric/ExampleNotebook.ipynb
diff --git a/examples/fabric/ExampleNotebook.ipynb b/examples/fabric/ExampleNotebook.ipynb
new file mode 100644
index 00000000..e0007e1a
--- /dev/null
+++ b/examples/fabric/ExampleNotebook.ipynb
@@ -0,0 +1 @@
+{"cells":[{"cell_type":"code","source":["#abfss://Test@onelake.dfs.fabric.microsoft.com/ZinggData.Lakehouse/Files/data.csv\n","spark.sparkContext.setCheckpointDir(\"abfss://Zingg@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Files\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":6,"statement_ids":[6],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:44.7727126Z","session_start_time":null,"execution_start_time":"2024-12-12T14:38:45.3551064Z","execution_finish_time":"2024-12-12T14:38:46.1554742Z","parent_msg_id":"0568e5f6-3102-476c-9119-1eea357e5f90"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 6, Finished, Available, Finished)"},"metadata":{}}],"execution_count":2,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"320825db-e1b4-4106-8f77-d974f59e6fe1"},{"cell_type":"code","source":["pip install zingg"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":7,"statement_ids":[7],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:44.8919804Z","session_start_time":null,"execution_start_time":"2024-12-12T14:38:46.9779028Z","execution_finish_time":"2024-12-12T14:38:59.3086347Z","parent_msg_id":"9a6de53a-f5ed-4655-9341-4c4a7802ffe5"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 7, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Collecting zingg\n Downloading zingg-0.4.0-py2.py3-none-any.whl.metadata (933 bytes)\nCollecting py4j==0.10.9 (from zingg)\n Downloading py4j-0.10.9-py2.py3-none-any.whl.metadata (1.3 kB)\nDownloading zingg-0.4.0-py2.py3-none-any.whl (74.7 MB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.7/74.7 MB\u001b[0m \u001b[31m43.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n\u001b[?25hDownloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m198.6/198.6 kB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hInstalling collected packages: py4j, zingg\n Attempting uninstall: py4j\n Found existing installation: py4j 0.10.9.7\n Uninstalling py4j-0.10.9.7:\n Successfully uninstalled py4j-0.10.9.7\n\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\npyspark 3.5.1.5.4.20240407 requires py4j==0.10.9.7, but you have py4j 0.10.9 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0mSuccessfully installed py4j-0.10.9 zingg-0.4.0\nNote: you may need to restart the kernel to use updated packages.\n"]}],"execution_count":3,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"d45194dd-f9fa-4522-9b8d-f68390a36cb0"},{"cell_type":"code","source":["spark.sparkContext.getCheckpointDir()"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":8,"statement_ids":[8],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:45.0470709Z","session_start_time":null,"execution_start_time":"2024-12-12T14:38:59.8920089Z","execution_finish_time":"2024-12-12T14:39:00.1425377Z","parent_msg_id":"a7a3e48d-4f55-4dcc-94db-21864a32cdab"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 8, Finished, Available, Finished)"},"metadata":{}},{"output_type":"execute_result","execution_count":16,"data":{"text/plain":"'abfss://Zingg@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Files/b2adeefa-d873-4af7-9780-3af8598f5959'"},"metadata":{}}],"execution_count":4,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"735117dc-0f56-491b-a805-a16db331c90d"},{"cell_type":"code","source":["pip show zingg"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":9,"statement_ids":[9],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:45.2324828Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:00.6902784Z","execution_finish_time":"2024-12-12T14:39:04.2406337Z","parent_msg_id":"a041b135-c20d-4db9-9e2b-b8b4718c42dc"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 9, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Name: zingg\r\nVersion: 0.4.0\r\nSummary: Zingg Entity Resolution, Data Mastering and Deduplication\r\nHome-page: https://github.com/zinggAI/zingg\r\nAuthor: Zingg.AI\r\nAuthor-email: sonalgoyal4@gmail.com\r\nLicense: https://github.com/zinggAI/zingg/blob/main/LICENSE\r\nLocation: /home/trusted-service-user/cluster-env/trident_env/lib/python3.11/site-packages\r\nRequires: py4j\r\nRequired-by: \r\nNote: you may need to restart the kernel to use updated packages.\n"]}],"execution_count":5,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"51e5d94a-b1d6-47be-bbf1-98208af1b5d8"},{"cell_type":"code","source":["pip install tabulate"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":10,"statement_ids":[10],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:45.3970144Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:04.8223306Z","execution_finish_time":"2024-12-12T14:39:09.8213294Z","parent_msg_id":"c2bb18f4-faa5-4fc2-b94e-0ccd1e2b6af7"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 10, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Collecting tabulate\n Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)\nDownloading tabulate-0.9.0-py3-none-any.whl (35 kB)\nInstalling collected packages: tabulate\nSuccessfully installed tabulate-0.9.0\nNote: you may need to restart the kernel to use updated packages.\n"]}],"execution_count":6,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"a2e77ae6-eeb2-482f-a47e-8c6ed0e7bb59"},{"cell_type":"code","source":["pip show tabulate"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":11,"statement_ids":[11],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:45.5376703Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:10.4269168Z","execution_finish_time":"2024-12-12T14:39:14.5511724Z","parent_msg_id":"0a38f00a-6e32-4871-aec1-99613a3180bd"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 11, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Name: tabulate\nVersion: 0.9.0\nSummary: Pretty-print tabular data\nHome-page: \nAuthor: \nAuthor-email: Sergey Astanin \nLicense: MIT\nLocation: /home/trusted-service-user/cluster-env/trident_env/lib/python3.11/site-packages\nRequires: \nRequired-by: \nNote: you may need to restart the kernel to use updated packages.\n"]}],"execution_count":7,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"ed5c6ed3-40ef-4447-ab75-4a6a898814fe"},{"cell_type":"code","source":["##you can change these to the locations of your choice\n","##these are the only two settings that need to change\n","zinggDir = \"abfss://Zingg@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Files/models\"\n","modelId = \"testModelFebrl\""],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":12,"statement_ids":[12],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:45.6769995Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:15.1044655Z","execution_finish_time":"2024-12-12T14:39:15.354016Z","parent_msg_id":"7344a1f2-936d-4266-9e4f-bd76fd51601b"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 12, Finished, Available, Finished)"},"metadata":{}}],"execution_count":8,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"c3b77184-4165-495e-b212-521dadef7125"},{"cell_type":"code","source":["## Define constants\n","MARKED_DIR = zinggDir + \"/\" + modelId + \"/trainingData/marked/\"\n","UNMARKED_DIR = zinggDir + \"/\" + modelId + \"/trainingData/unmarked/\"\n","\n","# Fill these with your specific details\n","storage_account = \"a1a73dc0-3894-4737-b38c-aa7fea437330\" # Replace with your storage account ID\n","fabric_url = \"dfs.fabric.microsoft.com\"\n","\n","# Updated paths for Microsoft Fabric\n","MARKED_DIR_DBFS = f\"abfss://{storage_account}@{fabric_url}{MARKED_DIR}\"\n","UNMARKED_DIR_DBFS = f\"abfss://{storage_account}@{fabric_url}{UNMARKED_DIR}\"\n","\n","## Import necessary libraries\n","import pandas as pd\n","import numpy as np\n","import os\n","import time\n","import uuid\n","from tabulate import tabulate\n","from ipywidgets import widgets, interact, GridspecLayout\n","import base64\n","import pyspark.sql.functions as fn\n","\n","# Import Azure libraries for Fabric\n","from azure.identity import DefaultAzureCredential\n","from azure.storage.filedatalake import DataLakeServiceClient\n","\n","# Zingg libraries\n","from zingg.client import *\n","from zingg.pipes import *\n","\n","# Setup Fabric authentication\n","def get_service_client():\n"," credential = DefaultAzureCredential()\n"," service_client = DataLakeServiceClient(\n"," account_url=f\"https://{storage_account}.dfs.fabric.microsoft.com\",\n"," credential=credential,\n"," )\n"," return service_client\n","\n","service_client = get_service_client()\n","\n","# Function to clean model directories in Fabric\n","def cleanModel():\n"," try:\n"," # Access the file system\n"," file_system_client = service_client.get_file_system_client(file_system=storage_account)\n"," \n"," # Remove marked directory\n"," if file_system_client.get_directory_client(MARKED_DIR).exists():\n"," file_system_client.get_directory_client(MARKED_DIR).delete_directory()\n"," \n"," # Remove unmarked directory\n"," if file_system_client.get_directory_client(UNMARKED_DIR).exists():\n"," file_system_client.get_directory_client(UNMARKED_DIR).delete_directory()\n"," \n"," print(\"Model cleaned successfully.\")\n"," except Exception as e:\n"," print(f\"Error cleaning model: {str(e)}\")\n"," return\n","\n","# Function to assign label to a candidate pair\n","def assign_label(candidate_pairs_pd, z_cluster, label):\n"," '''\n"," The purpose of this function is to assign a label to a candidate pair\n"," identified by its z_cluster value. Valid labels include:\n"," 0 - not matched\n"," 1 - matched\n"," 2 - uncertain\n"," '''\n"," # Assign label\n"," candidate_pairs_pd.loc[candidate_pairs_pd['z_cluster'] == z_cluster, 'z_isMatch'] = label\n"," return\n","\n","# Function to count labeled pairs\n","def count_labeled_pairs(marked_pd):\n"," '''\n"," The purpose of this function is to count the labeled pairs in the marked folder.\n"," '''\n"," n_total = len(np.unique(marked_pd['z_cluster']))\n"," n_positive = len(np.unique(marked_pd[marked_pd['z_isMatch'] == 1]['z_cluster']))\n"," n_negative = len(np.unique(marked_pd[marked_pd['z_isMatch'] == 0]['z_cluster']))\n","\n"," return n_positive, n_negative, n_total\n","\n","# Setup interactive widget\n","available_labels = {\n"," 'No Match': 0,\n"," 'Match': 1,\n"," 'Uncertain': 2\n","}\n"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":13,"statement_ids":[13],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:45.7920676Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:15.9184099Z","execution_finish_time":"2024-12-12T14:39:16.7144224Z","parent_msg_id":"c47972cc-56fd-46a9-80fe-da0d20234a5d"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 13, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stderr","text":["/opt/spark/python/lib/pyspark.zip/pyspark/sql/context.py:113: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.\n"]}],"execution_count":9,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"fd229c4c-6376-4f4b-89c3-14f78822eef8"},{"cell_type":"code","source":["#build the arguments for zingg\n","args = Arguments()\n","# Set the modelid and the zingg dir. You can use this as is\n","args.setModelId(modelId)\n","args.setZinggDir(zinggDir)\n","print(args)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":14,"statement_ids":[14],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:45.916886Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:17.2999881Z","execution_finish_time":"2024-12-12T14:39:17.5431547Z","parent_msg_id":"c783d3fd-b7fa-4591-9771-32d42753ddd9"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 14, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["\n"]}],"execution_count":10,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"f92fe414-811a-4e02-b11e-9711539d1786"},{"cell_type":"code","source":["# Import pandas\n","import pandas as pd\n","\n","# Define the schema (optional for validation)\n","schema = [\"id\", \"fname\", \"lname\", \"stNo\", \"add1\", \"add2\", \"city\", \"state\", \"dob\", \"ssn\"]\n","\n","# Load the CSV file\n","data = pd.read_csv(\"abfss://Zingg@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Files/data.csv\")\n","\n","# Ensure column names match the schema\n","data.columns = schema # Adjust only if the file's column names differ\n","\n","# Display the data\n","data.head()\n"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":15,"statement_ids":[15],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:46.0524493Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:18.126005Z","execution_finish_time":"2024-12-12T14:39:19.6523511Z","parent_msg_id":"619a3f46-252d-4b59-849e-69081583ed29"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 15, Finished, Available, Finished)"},"metadata":{}},{"output_type":"execute_result","execution_count":37,"data":{"text/plain":" id fname lname stNo add1 add2 \\\n0 rec-1021-dup-0 thomas george 1 mcmanus place stoney creek \n1 rec-1021-org thomas george 1 mcmanus place north turramurra \n2 rec-1022-dup-0 jackson eglinton 840 fowles street mountview \n3 rec-1022-dup-1 jackson eglinton 840 fowles street moun tjiew \n4 rec-1022-dup-2 jackson eglinton 840 fowles street mou nview \n\n city state dob ssn \n0 3130 sa 19630225 5460534 \n1 3130 sa 19630225 5460534 \n2 2803 sa 19830807 2932837 \n3 2830 sa 19830807 2932837 \n4 2830 sa 19830807 2932837 ","text/html":"
\n\n
\n \n
\n
\n
id
\n
fname
\n
lname
\n
stNo
\n
add1
\n
add2
\n
city
\n
state
\n
dob
\n
ssn
\n
\n \n \n
\n
0
\n
rec-1021-dup-0
\n
thomas
\n
george
\n
1
\n
mcmanus place
\n
stoney creek
\n
3130
\n
sa
\n
19630225
\n
5460534
\n
\n
\n
1
\n
rec-1021-org
\n
thomas
\n
george
\n
1
\n
mcmanus place
\n
north turramurra
\n
3130
\n
sa
\n
19630225
\n
5460534
\n
\n
\n
2
\n
rec-1022-dup-0
\n
jackson
\n
eglinton
\n
840
\n
fowles street
\n
mountview
\n
2803
\n
sa
\n
19830807
\n
2932837
\n
\n
\n
3
\n
rec-1022-dup-1
\n
jackson
\n
eglinton
\n
840
\n
fowles street
\n
moun tjiew
\n
2830
\n
sa
\n
19830807
\n
2932837
\n
\n
\n
4
\n
rec-1022-dup-2
\n
jackson
\n
eglinton
\n
840
\n
fowles street
\n
mou nview
\n
2830
\n
sa
\n
19830807
\n
2932837
\n
\n \n
\n
"},"metadata":{}}],"execution_count":11,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"a76f4324-ff22-46e1-81b5-16f97ab2835d"},{"cell_type":"code","source":["schema = \"rec_id string, fname string, lname string, stNo string, add1 string, add2 string, city string, state string, dob string, ssn string\"\n","inputPipe = CsvPipe(\"testFebrl\", \"abfss://Zingg@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Files/data.csv\", schema)\n","\n","args.setData(inputPipe)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":16,"statement_ids":[16],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:46.2025787Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:20.2434395Z","execution_finish_time":"2024-12-12T14:39:20.4955338Z","parent_msg_id":"5c8d332f-c5a9-4782-8aa7-923604a75d86"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 16, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["set schema \n"]}],"execution_count":12,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"d9ed37ff-f408-4f87-bda0-161ad35946fb"},{"cell_type":"code","source":["#setting outputpipe in 'args'\n","outputPipe = CsvPipe(\"resultOutput\", \"abfss://Zingg@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Files\")\n","args.setOutput(outputPipe)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":17,"statement_ids":[17],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:46.3319598Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:21.0521349Z","execution_finish_time":"2024-12-12T14:39:21.3077047Z","parent_msg_id":"edd9e63e-2f5a-41f8-aec9-be73e860542d"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 17, Finished, Available, Finished)"},"metadata":{}}],"execution_count":13,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"3c49f24d-2f15-43e6-8c73-7b77c1199845"},{"cell_type":"code","source":["# Set field definitions\n","rec_id = FieldDefinition(\"rec_id\", \"string\", MatchType.EXACT) # ID should use exact match\n","fname = FieldDefinition(\"fname\", \"string\", MatchType.FUZZY) # First Name\n","lname = FieldDefinition(\"lname\", \"string\", MatchType.FUZZY) # Last Name\n","stNo = FieldDefinition(\"stNo\", \"string\", MatchType.FUZZY) # Street Number\n","add1 = FieldDefinition(\"add1\", \"string\", MatchType.FUZZY) # Address Line 1\n","add2 = FieldDefinition(\"add2\", \"string\", MatchType.FUZZY) # Address Line 2\n","city = FieldDefinition(\"city\", \"string\", MatchType.FUZZY) # City\n","state = FieldDefinition(\"state\", \"string\", MatchType.FUZZY) # State\n","dob = FieldDefinition(\"dob\", \"string\", MatchType.EXACT) # Date of Birth (prefer exact match)\n","ssn = FieldDefinition(\"ssn\", \"string\", MatchType.EXACT) # SSN (should use exact match)\n","\n","# Create the field definitions list\n","fieldDefs = [rec_id, fname, lname, stNo, add1, add2, city, state, dob, ssn]\n","\n","# Set field definitions in args\n","args.setFieldDefinition(fieldDefs)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":18,"statement_ids":[18],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:46.4720722Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:21.8641221Z","execution_finish_time":"2024-12-12T14:39:22.1346071Z","parent_msg_id":"71227dea-6926-4e14-9e66-501b8515fa5a"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 18, Finished, Available, Finished)"},"metadata":{}}],"execution_count":14,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"76edaab7-d705-4d05-adaa-298b48f87ae6"},{"cell_type":"code","source":["# The numPartitions define how data is split across the cluster. \n","# Please change the fllowing as per your data and cluster size by referring to the docs.\n","\n","args.setNumPartitions(4)\n","args.setLabelDataSampleSize(0.5)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":19,"statement_ids":[19],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:46.5771016Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:22.6870105Z","execution_finish_time":"2024-12-12T14:39:23.1094802Z","parent_msg_id":"133bf47a-3e2c-4a69-b874-b68bd3fd0f94"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 19, Finished, Available, Finished)"},"metadata":{}}],"execution_count":15,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"ea3a596e-0571-4149-9b5b-d8357226d90c"},{"cell_type":"code","source":["options = ClientOptions([ClientOptions.PHASE,\"findTrainingData\"])\n","\n","#Zingg execution for the given phase\n","zingg = ZinggWithSpark(args, options)\n","print(args)\n","print(options)\n","print(zingg)\n","zingg.initAndExecute()"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":20,"statement_ids":[20],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:46.7720589Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:23.6806377Z","execution_finish_time":"2024-12-12T14:39:40.4666332Z","parent_msg_id":"88db0a89-5777-4e74-92c3-15e9a461056f"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 20, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["['--phase', 'findTrainingData']\narguments for client options are ['--phase', 'findTrainingData', '--license', 'zinggLic.txt', '--email', 'zingg@zingg.ai', '--conf', 'dummyConf.json']\n\n\n\n"]}],"execution_count":16,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"92238689-3e1c-4b32-9802-c59c714aa6d2"},{"cell_type":"code","source":["options = ClientOptions([ClientOptions.PHASE,\"label\"])\n","\n","#Zingg execution for the given phase\n","zingg = ZinggWithSpark(args, options)\n","zingg.init()"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":21,"statement_ids":[21],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:46.8921439Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:41.0118438Z","execution_finish_time":"2024-12-12T14:39:41.2588634Z","parent_msg_id":"9f835445-3575-444e-be68-698c87047cfa"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 21, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["['--phase', 'label']\narguments for client options are ['--phase', 'label', '--license', 'zinggLic.txt', '--email', 'zingg@zingg.ai', '--conf', 'dummyConf.json']\n"]}],"execution_count":17,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"b30911c2-9663-4260-8952-c9e5e0d668ea"},{"cell_type":"code","source":["# get candidate pairs\n","candidate_pairs_pd = getPandasDfFromDs(zingg.getUnmarkedRecords())\n"," \n","# if no candidate pairs, run job and wait\n","if candidate_pairs_pd.shape[0] == 0:\n"," print('No unlabeled candidate pairs found. Run findTraining job ...')\n","\n","else:\n"," # get list of pairs (as identified by z_cluster) to label \n"," z_clusters = list(np.unique(candidate_pairs_pd['z_cluster'])) \n","\n"," # identify last reviewed cluster\n"," last_z_cluster = '' # none yet\n","\n"," # print candidate pair stats\n"," print('{0} candidate pairs found for labeling'.format(len(z_clusters)))"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":22,"statement_ids":[22],"state":"finished","livy_statement_state":"available","session_id":"e8d52d7f-1f5d-4897-a638-4465746c84f8","normalized_state":"finished","queued_time":"2024-12-12T14:38:47.1173535Z","session_start_time":null,"execution_start_time":"2024-12-12T14:39:41.8216531Z","execution_finish_time":"2024-12-12T14:39:44.3102558Z","parent_msg_id":"6d386eec-27ed-4ac8-8c59-e45bcfa62cc5"},"text/plain":"StatementMeta(, e8d52d7f-1f5d-4897-a638-4465746c84f8, 22, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["15 candidate pairs found for labeling\n"]}],"execution_count":18,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"e303305a-e747-4807-a788-beecde020545"},{"cell_type":"code","source":["# Label Training Set\n","\n","# define variable to avoid duplicate saves\n","ready_for_save = False\n","print(candidate_pairs_pd)\n","\n","# user-friendly labels and corresponding zingg numerical value\n","# (the order in the dictionary affects how displayed below)\n","LABELS = {\n"," 'Uncertain':2,\n"," 'Match':1,\n"," 'No Match':0 \n"," }\n","\n","# GET CANDIDATE PAIRS\n","# ========================================================\n","#candidate_pairs_pd = get_candidate_pairs()\n","n_pairs = int(candidate_pairs_pd.shape[0]/2)\n","# ========================================================\n","\n","# DEFINE IPYWIDGET DISPLAY\n","# ========================================================\n","display_pd = candidate_pairs_pd.drop(\n"," labels=[\n"," 'z_zid', 'z_prediction', 'z_score', 'z_isMatch', 'z_zsource'\n"," ], \n"," axis=1)\n","\n","# define header to be used with each displayed pair\n","html_prefix = \"
\"\n","html_suffix = \"
\"\n","header = widgets.HTML(value=f\"{html_prefix}\" + \" \".join([str(i)+\" \" for i in display_pd.columns.to_list()]) + f\"{html_suffix}\")\n","\n","# initialize display\n","vContainers = []\n","vContainers.append(widgets.HTML(value=f'
Indicate if each of the {n_pairs} record pairs is a match or not
'))\n","\n","# for each set of pairs\n","for n in range(n_pairs):\n","\n"," # get candidate records\n"," candidate_left = display_pd.loc[2*n].to_list()\n"," print(candidate_left)\n"," candidate_right = display_pd.loc[(2*n)+1].to_list()\n"," print(candidate_right)\n","\n"," # define grid to hold values\n"," html = ''\n","\n"," for i in range(display_pd.shape[1]):\n","\n"," # get column name\n"," column_name = display_pd.columns[i]\n","\n"," # if field is image\n"," if column_name == 'image_path':\n","\n"," # define row header\n"," html += '
'\n"," html += '
image
'\n","\n"," # read left image to encoded string\n"," l_endcode = ''\n"," if candidate_left[i] != '':\n"," with open(candidate_left[i], \"rb\") as l_file:\n"," l_encode = base64.b64encode( l_file.read() ).decode()\n","\n"," # read right image to encoded string\n"," r_encode = ''\n"," if candidate_right[i] != '':\n"," with open(candidate_right[i], \"rb\") as r_file:\n"," r_encode = base64.b64encode( r_file.read() ).decode() \n","\n"," # present images\n"," html += f'
'\n"," html += f'
'\n"," html += '
'\n","\n"," elif column_name != 'image_path': # display text values\n","\n"," if column_name == 'z_cluster': z_cluster = candidate_left[i]\n","\n"," html += '
'\n"," html += f'
{column_name}
'\n"," html += f'
{str(candidate_left[i])}
'\n"," html += f'
{str(candidate_right[i])}
'\n"," html += '
'\n","\n"," # insert data table\n"," table = widgets.HTML(value=f'
","layout":"IPY_MODEL_2dc9896b314544f3bd71c32c625e1175","style":"IPY_MODEL_2a7ce010e31c474d834773f51158ad6c"}},"8b544a3eb42548698fec50307ca58cf0":{"model_name":"ToggleButtonsModel","model_module":"@jupyter-widgets/controls","model_module_version":"2.0.0","state":{"index":2,"_options_labels":["Uncertain","Match","No Match"],"button_style":"info","layout":"IPY_MODEL_6ff19e3e507c4bebafd8a1bff6ce55c8","tooltips":[],"style":"IPY_MODEL_cc8a117379724417a5481bb9d17126b5","icons":[]}},"318d9d146d1f41ee9a169043637dadb7":{"model_name":"LayoutModel","model_module":"@jupyter-widgets/base","model_module_version":"2.0.0","state":{}},"cbbfcbe143644072846912c9d8f1c6d7":{"model_name":"HTMLStyleModel","model_module":"@jupyter-widgets/controls","model_module_version":"2.0.0","state":{"description_width":"","font_size":null,"text_color":null}},"5227aa6fa7c749238d811d462cb0fe36":{"model_name":"HTMLModel","model_module":"@jupyter-widgets/controls","model_module_version":"2.0.0","state":{"value":" ","layout":"IPY_MODEL_44acc8fae0314cb7a33463d2bc6353e7","style":"IPY_MODEL_451cd21ac7b64517b93824dd5ab79460"}},"c80f86a431824631b6626eba7c46fc33":{"model_name":"HTMLModel","model_module":"@jupyter-widgets/controls","model_module_version":"2.0.0","state":{"value":" ","layout":"IPY_MODEL_17f6fddf67e242588f39e2aaf0558678","style":"IPY_MODEL_da34c9ff8e3b4738a59ec9eb0a39d2cb"}},"47e1703b3d45461f816b4ec1f8ea445a":{"model_name":"LayoutModel","model_module":"@jupyter-widgets/base","model_module_version":"2.0.0","state":{}},"9d57f12f444b47b58f6982290bc17ba2":{"model_name":"LayoutModel","model_module":"@jupyter-widgets/base","model_module_version":"2.0.0","state":{}},"b345a2da49d84b559a59792c488d0c1f":{"model_name":"HTMLStyleModel","model_module":"@jupyter-widgets/controls","model_module_version":"2.0.0","state":{"description_width":"","font_size":null,"text_color":null}},"04911938acd2486e8fc0ded740020ea1":{"model_name":"LayoutModel","model_module":"@jupyter-widgets/base","model_module_version":"2.0.0","state":{}},"7ef6892a4e7444458465dd5a5e76fae5":{"model_name":"LayoutModel","model_module":"@jupyter-widgets/base","model_module_version":"2.0.0","state":{}},"5d8d51ddc216416cb12979d0f38aae5a":{"model_name":"VBoxModel","model_module":"@jupyter-widgets/controls","model_module_version":"2.0.0","state":{"children":["IPY_MODEL_6542b2868c0c43359d500c3828ef12ef","IPY_MODEL_22483139248d470ca2edbb0b22a669d1","IPY_MODEL_c80f86a431824631b6626eba7c46fc33"],"layout":"IPY_MODEL_952a9f160893406791ec1975a5af971f"}},"4ebfc8728d2c4186a14ab0d9e52ca0c5":{"model_name":"LayoutModel","model_module":"@jupyter-widgets/base","model_module_version":"2.0.0","state":{}},"6ff19e3e507c4bebafd8a1bff6ce55c8":{"model_name":"LayoutModel","model_module":"@jupyter-widgets/base","model_module_version":"2.0.0","state":{}},"f1be32a9a51445f98e99e3b4a2c697bb":{"model_name":"LayoutModel","model_module":"@jupyter-widgets/base","model_module_version":"2.0.0","state":{}},"a7171853339643a48382ec125a26944d":{"model_name":"HTMLStyleModel","model_module":"@jupyter-widgets/controls","model_module_version":"2.0.0","state":{"description_width":"","font_size":null,"text_color":null}}}}},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"2400000"}}},"dependencies":{"lakehouse":{"default_lakehouse":"36ef8bc2-c67a-4512-b060-e25489729c71","default_lakehouse_name":"data","default_lakehouse_workspace_id":"e803987a-98b6-445f-815c-3d15c2c46877","known_lakehouses":[{"id":"7e68da48-69ac-4253-b7bf-1f24863ab25a"},{"id":"1ca5fe82-c7a1-494d-825d-9168c65112d1"},{"id":"36ef8bc2-c67a-4512-b060-e25489729c71"}]},"environment":{"environmentId":"1ae2ef87-3a76-4cd3-90b5-e829f7a4ca9c","workspaceId":"e803987a-98b6-445f-815c-3d15c2c46877"}}},"nbformat":4,"nbformat_minor":5}
\ No newline at end of file