From 6d5b43bc2177553dde6567794a7e4e844a62efaa Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 17 Apr 2024 17:33:10 +0000 Subject: [PATCH] fix linter issues --- .../get_started_with_embedding_tuning.ipynb | 862 +++++++++--------- 1 file changed, 434 insertions(+), 428 deletions(-) diff --git a/embeddings/get_started_with_embedding_tuning.ipynb b/embeddings/get_started_with_embedding_tuning.ipynb index 26b533bfbf..dfc3820432 100644 --- a/embeddings/get_started_with_embedding_tuning.ipynb +++ b/embeddings/get_started_with_embedding_tuning.ipynb @@ -52,24 +52,13 @@ }, { "cell_type": "markdown", - "source": [ - "| | |\n", - "|-|-|\n", - "|Author(s) | [Ivan Nardini](https://github.com/inardini)|" - ], "metadata": { "id": "3Vzj1qV_dPeO" - } - }, - { - "cell_type": "markdown", - "metadata": { - "id": "24743cf4a1e1" }, "source": [ - "**_NOTE_**: This notebook has been tested in the following environment:\n", - "\n", - "* Python version = 3.9" + "| | |\n", + "|-|-|\n", + "|Author(s) | [Ivan Nardini](https://github.com/inardini)|" ] }, { @@ -377,31 +366,31 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "eckavkeph5zB" + }, "source": [ "### Set up tutorial folder\n", "\n", "Set up a folder for tutorial content including data, metadata and more." - ], - "metadata": { - "id": "eckavkeph5zB" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Kr90HWKmh8H0" + }, + "outputs": [], "source": [ "from pathlib import Path as path\n", "\n", "root_path = path.cwd()\n", - "tutorial_path = root_path / 'tutorial'\n", - "data_path = tutorial_path / 'data'\n", + "tutorial_path = root_path / \"tutorial\"\n", + "data_path = tutorial_path / \"data\"\n", "\n", "data_path.mkdir(parents=True, exist_ok=True)" - ], - "metadata": { - "id": "Kr90HWKmh8H0" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -425,48 +414,51 @@ "import random\n", "import string\n", "import time\n", - "import re\n", - "import json\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import vertexai\n", + "import vertexai.preview.generative_models as generative_models\n", "from etils import epath\n", - "from google.protobuf.json_format import MessageToDict\n", "from google.api_core.client_options import ClientOptions\n", - "from google.cloud import documentai\n", + "from google.cloud import aiplatform, documentai\n", + "from google.protobuf.json_format import MessageToDict\n", + "from langchain.docstore.document import Document\n", "from langchain_community.document_loaders.blob_loaders import Blob\n", "from langchain_community.document_loaders.parsers import DocAIParser\n", "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", - "from langchain.docstore.document import Document\n", - "import numpy as np\n", - "import pandas as pd\n", - "import vertexai\n", - "from vertexai.generative_models import GenerativeModel\n", - "import vertexai.preview.generative_models as generative_models\n", - "from google.cloud import aiplatform" + "from vertexai.generative_models import GenerativeModel" ] }, { "cell_type": "markdown", + "metadata": { + "id": "TvQ81PjSiCuZ" + }, "source": [ "### Set Variables\n", "\n", "Set variables to run the tutorial." 
- ], - "metadata": { - "id": "TvQ81PjSiCuZ" - } + ] }, { "cell_type": "code", - "source": [ - "ID = \"\".join(random.choices(string.ascii_lowercase + string.digits, k=4))" - ], + "execution_count": null, "metadata": { "id": "ajbQb0eXu3xh" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "ID = \"\".join(random.choices(string.ascii_lowercase + string.digits, k=4))" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ct35zelZiELu" + }, + "outputs": [], "source": [ "# Dataset\n", "PROCESSOR_ID = f\"preprocess-docs-llm-{ID}\"\n", @@ -479,36 +471,34 @@ "\n", "# Tuning\n", "PIPELINE_ROOT = f\"{BUCKET_URI}/pipelines\"\n", - "BATCH_SIZE = 32 # @param {type:\"integer\"}\n", - "TRAINING_ACCELERATOR_TYPE = 'NVIDIA_TESLA_T4' # @param {type:\"string\"}\n", - "TRAINING_MACHINE_TYPE = 'n1-standard-16' # @param {type:\"string\"}\n", + "BATCH_SIZE = 32 # @param {type:\"integer\"}\n", + "TRAINING_ACCELERATOR_TYPE = \"NVIDIA_TESLA_T4\" # @param {type:\"string\"}\n", + "TRAINING_MACHINE_TYPE = \"n1-standard-16\" # @param {type:\"string\"}\n", "\n", "# Serving\n", - "PREDICTION_ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100' # @param {type:\"string\"}\n", - "PREDICTION_ACCELERATOR_COUNT = 1 # @param {type:\"integer\"}\n", - "PREDICTION_MACHINE_TYPE = 'a2-highgpu-1g' # @param {type:\"string\"}" - ], - "metadata": { - "id": "ct35zelZiELu" - }, - "execution_count": null, - "outputs": [] + "PREDICTION_ACCELERATOR_TYPE = \"NVIDIA_TESLA_A100\" # @param {type:\"string\"}\n", + "PREDICTION_ACCELERATOR_COUNT = 1 # @param {type:\"integer\"}\n", + "PREDICTION_MACHINE_TYPE = \"a2-highgpu-1g\" # @param {type:\"string\"}" + ] }, { "cell_type": "markdown", - "source": [ - "### Helpers" - ], "metadata": { "id": "F2mkgxcciGiZ" - } + }, + "source": [ + "### Helpers" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "40tjc8jmiH4B" + }, + "outputs": [], "source": [ - "def create_processor(\n", - " project_id: str, location: str, processor_display_name: str\n", - "):\n", + "def create_processor(project_id: str, location: str, processor_display_name: str):\n", "\n", " \"\"\"Create a Document AI processor.\"\"\"\n", " client_options = ClientOptions(api_endpoint=f\"{location}-documentai.googleapis.com\")\n", @@ -520,9 +510,10 @@ " parent=parent,\n", " processor=documentai.Processor(\n", " display_name=processor_display_name, type_=\"OCR_PROCESSOR\"\n", - " )\n", + " ),\n", " )\n", "\n", + "\n", "def generate_queries(\n", " chuck,\n", " num_questions=3,\n", @@ -533,9 +524,9 @@ " model = GenerativeModel(\"gemini-1.0-pro-001\")\n", "\n", " generation_config = {\n", - " \"max_output_tokens\": 2048,\n", - " \"temperature\": 0.9,\n", - " \"top_p\": 1,\n", + " \"max_output_tokens\": 2048,\n", + " \"temperature\": 0.9,\n", + " \"top_p\": 1,\n", " }\n", "\n", " safety_settings = {\n", @@ -555,49 +546,58 @@ " QUESTION:\n", " \"\"\"\n", "\n", - " query = prompt_template.format(chuck=chuck.page_content, num_questions=num_questions)\n", + " query = prompt_template.format(\n", + " chuck=chuck.page_content, num_questions=num_questions\n", + " )\n", "\n", " for idx in range(num_questions):\n", - " response = model.generate_content(\n", - " [query],\n", - " generation_config=generation_config,\n", - " safety_settings=safety_settings).text\n", + " response = model.generate_content(\n", + " [query],\n", + " generation_config=generation_config,\n", + " safety_settings=safety_settings,\n", + " ).text\n", + "\n", + " return Document(\n", + " page_content=response, 
metadata={\"page\": chuck.metadata[\"page\"]}\n", + " )\n", "\n", - " return Document(page_content=response, metadata={\"page\": chuck.metadata['page']})\n", "\n", "def get_task_by_name(job: aiplatform.PipelineJob, task_name: str):\n", " \"\"\"Get a Vertex AI Pipeline job task by its name\"\"\"\n", " for task in job.task_details:\n", " if task.task_name == task_name:\n", " return task\n", - " raise ValueError(f'Task {task_name} not found')\n", + " raise ValueError(f\"Task {task_name} not found\")\n", + "\n", "\n", "def get_metrics(\n", - " job: aiplatform.PipelineJob, task_name: str = 'text-embedding-evaluator'\n", + " job: aiplatform.PipelineJob, task_name: str = \"text-embedding-evaluator\"\n", "):\n", " \"\"\"Get metrics for the evaluation task\"\"\"\n", " evaluation_task = get_task_by_name(job, task_name)\n", - " metrics = MessageToDict(evaluation_task.outputs['metrics']._pb)['artifacts'][0]['metadata']\n", + " metrics = MessageToDict(evaluation_task.outputs[\"metrics\"]._pb)[\"artifacts\"][0][\n", + " \"metadata\"\n", + " ]\n", " metrics_df = pd.DataFrame([metrics])\n", " return metrics_df\n", "\n", "\n", "def get_uploaded_model(\n", - " job: aiplatform.PipelineJob, task_name: str = 'text-embedding-model-uploader'\n", + " job: aiplatform.PipelineJob, task_name: str = \"text-embedding-model-uploader\"\n", ") -> aiplatform.Model:\n", " \"\"\"Get uploaded model from the pipeline job\"\"\"\n", " evaluation_task = get_task_by_name(job, task_name)\n", - " upload_metadata = MessageToDict(evaluation_task.execution._pb)['metadata']\n", - " return aiplatform.Model(upload_metadata['output:model_resource_name'])\n", + " upload_metadata = MessageToDict(evaluation_task.execution._pb)[\"metadata\"]\n", + " return aiplatform.Model(upload_metadata[\"output:model_resource_name\"])\n", "\n", "\n", "def get_training_output_dir(\n", - " job: aiplatform.PipelineJob, task_name: str = 'text-embedding-trainer'\n", + " job: aiplatform.PipelineJob, task_name: str = \"text-embedding-trainer\"\n", ") -> str:\n", " \"\"\"Get training output directory for the pipeline job\"\"\"\n", " trainer_task = get_task_by_name(job, task_name)\n", - " output_artifact = MessageToDict(trainer_task.execution._pb)['metadata']\n", - " output_artifact = trainer_task.outputs['training_output'].artifacts[0]\n", + " output_artifact = MessageToDict(trainer_task.execution._pb)[\"metadata\"]\n", + " output_artifact = trainer_task.outputs[\"training_output\"].artifacts[0]\n", " return output_artifact.uri\n", "\n", "\n", @@ -608,23 +608,21 @@ "\n", "\n", "def get_topk_scores(\n", - " query_embedding: pd.DataFrame, corpus_embeddings: pd.DataFrame, k=10\n", + " query_embedding: pd.DataFrame, corpus_embeddings: pd.DataFrame, k=10\n", ") -> pd.DataFrame:\n", " \"\"\"Get top k similar scores for each query\"\"\"\n", " similarity = corpus_embeddings.dot(query_embedding.T)\n", - " topk_index = pd.DataFrame(\n", - " {c: v.nlargest(n=k).index for c, v in similarity.items()}\n", - " )\n", + " topk_index = pd.DataFrame({c: v.nlargest(n=k).index for c, v in similarity.items()})\n", " return topk_index\n", "\n", "\n", "def get_topk_documents(\n", - " query_text: list[str],\n", - " corpus_text: pd.DataFrame,\n", - " corpus_embeddings: pd.DataFrame,\n", - " task_type: str = \"RETRIEVAL_DOCUMENT\",\n", - " title: str = \"\",\n", - " k: int = 10,\n", + " query_text: list[str],\n", + " corpus_text: pd.DataFrame,\n", + " corpus_embeddings: pd.DataFrame,\n", + " task_type: str = \"RETRIEVAL_DOCUMENT\",\n", + " title: str = \"\",\n", + " k: int = 10,\n", ") -> 
pd.DataFrame:\n", "    \"\"\"Get top k similar documents for each query\"\"\"\n", "    instances = []\n", @@ -645,14 +643,9 @@ "                query_text[c]: corpus_text.loc[v.values].values.ravel()\n", "                for c, v in topk.items()\n", "            },\n", -    "            orient='columns'\n", +    "            orient=\"columns\",\n", "        )" -   ], -   "metadata": { -    "id": "40tjc8jmiH4B" -   }, -   "execution_count": null, -   "outputs": [] +   ] }, { "cell_type": "markdown", @@ -678,6 +671,9 @@ }, { "cell_type": "markdown", +   "metadata": { +    "id": "FpDyETD1iKi-" +   }, "source": [ "## Tuning text embeddings\n", "\n", @@ -686,13 +682,13 @@ "Next, you create a model tuning job and deploy the tuned model to a Vertex AI endpoint.\n", "\n", "Finally, you retrieve similar items using the tuned embedding model." -   ], -   "metadata": { -    "id": "FpDyETD1iKi-" -   } +   ] }, { "cell_type": "markdown", +   "metadata": { +    "id": "WJx_WMx2wioP" +   }, "source": [ "### Prepare your model tuning dataset using Document AI, Gemini API, and LangChain on Vertex AI\n", "\n", @@ -702,172 +698,174 @@ "\n", "- `query` file is a JSONL file where each line has the fields _id, and text of each relevant query.\n", "\n", "- `labels` files are TSV files (train, test and val) with three columns: `query-id`,`corpus-id`, and `score`. `query-id` represents the query id in the query file, `corpus-id` represents the corpus id in the corpus file, and `score` indicates relevance with higher scores meaning greater relevance. A default score of 1 is used if none is specified. The `train` file is required while `test` and `val` are optional.\n", "\n" -   ], -   "metadata": { -    "id": "WJx_WMx2wioP" -   } +    "- `labels` files are TSV files (train, test and val) with three columns: `query-id`,`corpus-id`, and `score`. `query-id` represents the query id in the query file, `corpus-id` represents the corpus id in the corpus file, and `score` indicates relevance with higher scores meaning greater relevance. A default score of 1 is used if none is specified. The `train` file is required while `test` and `val` are optional.\n" ] }, { "cell_type": "markdown", +   "metadata": { +    "id": "0eDQ8BSFiOH9" +   }, "source": [ "#### Create a Document AI preprocessor\n", "\n", "Create the OCR processor to identify and extract text in the PDF document." -   ], -   "metadata": { -    "id": "0eDQ8BSFiOH9" -   } +   ] }, { "cell_type": "code", -   "source": [ -    "processor = create_processor(PROJECT_ID, LOCATION, PROCESSOR_ID)" -   ], +   "execution_count": null, "metadata": { "id": "ZIJXJt6ciNdj" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "processor = create_processor(PROJECT_ID, LOCATION, PROCESSOR_ID)" +   ] }, { "cell_type": "markdown", -   "source": [ -    "#### Parse the document using DocAI Parser in LangChain" -   ], "metadata": { "id": "P8podNK-rxm6" -   } +   }, +   "source": [ +    "#### Parse the document using DocAI Parser in LangChain" +   ] }, { "cell_type": "markdown", -   "source": [ -    "Initialize a LangChain parser." -   ], "metadata": { "id": "jCUWtEq3oLqs" -   } +   }, +   "source": [ +    "Initialize a LangChain parser." 
+   ] }, { "cell_type": "code", +   "execution_count": null, +   "metadata": { +    "id": "D_PU-I_-teWQ" +   }, +   "outputs": [], "source": [ "blob = Blob(\n", "    path=f\"{RAW_DATA_URI}/goog-10-k-2023.pdf\",\n", ")\n", "\n", "parser = DocAIParser(\n", -    "    processor_name=processor.name, location=LOCATION, gcs_output_path=PROCESSED_DATA_OCR_URI\n", +    "    processor_name=processor.name,\n", +    "    location=LOCATION,\n", +    "    gcs_output_path=PROCESSED_DATA_OCR_URI,\n", ")" -   ], -   "metadata": { -    "id": "D_PU-I_-teWQ" -   }, -   "execution_count": null, -   "outputs": [] +   ] }, { "cell_type": "markdown", -   "source": [ -    "Run a Google Document AI PDF Batch Processing job.\n", -    "\n" -   ], "metadata": { "id": "hz254MGnoPgR" -   } +   }, +   "source": [ +    "Run a Google Document AI PDF Batch Processing job.\n" +   ] }, { "cell_type": "code", -   "source": [ -    "operations = parser.docai_parse([blob])" -   ], +   "execution_count": null, "metadata": { "id": "sjm3LSh7oKJk" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "operations = parser.docai_parse([blob])" +   ] }, { "cell_type": "code", -   "source": [ -    "while True:\n", -    "  if parser.is_running(operations):\n", -    "    print(\"Waiting for DocAI to finish...\")\n", -    "    time.sleep(10)\n", -    "  else:\n", -    "    print(\"DocAI successfully processed!\")\n", -    "    break" -   ], +   "execution_count": null, "metadata": { "id": "5_DpZodtvATw" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "while True:\n", +    "    if parser.is_running(operations):\n", +    "        print(\"Waiting for DocAI to finish...\")\n", +    "        time.sleep(10)\n", +    "    else:\n", +    "        print(\"DocAI successfully processed!\")\n", +    "        break" +   ] }, { "cell_type": "markdown", -   "source": [ -    "Get the resulting LangChain Documents containing the extracted text and metadata." -   ], "metadata": { "id": "6MjwFF6loZfF" -   } +   }, +   "source": [ +    "Get the resulting LangChain Documents containing the extracted text and metadata." +   ] }, { "cell_type": "code", -   "source": [ -    "results = parser.get_results(operations)" -   ], +   "execution_count": null, "metadata": { "id": "oBh2AKdIvX8g" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "results = parser.get_results(operations)" +   ] }, { "cell_type": "code", -   "source": [ -    "docs = list(parser.parse_from_results(results))" -   ], +   "execution_count": null, "metadata": { "id": "ZGfde504wIcF" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "docs = list(parser.parse_from_results(results))" +   ] }, { "cell_type": "code", -   "source": [ -    "docs[0]" -   ], +   "execution_count": null, "metadata": { "id": "rFsBLHpwWRCy" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "docs[0]" +   ] }, { "cell_type": "markdown", +   "metadata": { +    "id": "MXGdZYB_x9R6" +   }, "source": [ "#### Create document chunks using `RecursiveCharacterTextSplitter`\n", "\n", "You can create chunks using `RecursiveCharacterTextSplitter` in LangChain. The splitter divides text into smaller chunks of a chosen size based on a set of specified characters." -   ], -   "metadata": { -    "id": "MXGdZYB_x9R6" -   } +   ] }, { "cell_type": "markdown", -   "source": [ -    "Initialize the splitter." -   ], "metadata": { "id": "y_JIYFd7qePe" -   } +   }, +   "source": [ +    "Initialize the splitter." 
+   ] }, { "cell_type": "code", +   "execution_count": null, +   "metadata": { +    "id": "TxBSDPGPyA4l" +   }, +   "outputs": [], "source": [ "text_splitter = RecursiveCharacterTextSplitter(\n", "    chunk_size=2500,\n", @@ -875,394 +873,409 @@ "    length_function=len,\n", "    is_separator_regex=False,\n", ")" -   ], -   "metadata": { -    "id": "TxBSDPGPyA4l" -   }, -   "execution_count": null, -   "outputs": [] +   ] }, { "cell_type": "markdown", -   "source": [ -    "Create text chunks." -   ], "metadata": { "id": "Le_WIyGCqg7O" -   } +   }, +   "source": [ +    "Create text chunks." +   ] }, { "cell_type": "code", -   "source": [ -    "document_content = [doc.page_content for doc in docs]\n", -    "document_metadata = [{'page': idx} for idx, doc in enumerate(docs, 1)]\n", -    "chunks = text_splitter.create_documents(document_content, metadatas=document_metadata)" -   ], +   "execution_count": null, "metadata": { "id": "M8BxdLjUycm4" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "document_content = [doc.page_content for doc in docs]\n", +    "document_metadata = [{\"page\": idx} for idx, doc in enumerate(docs, 1)]\n", +    "chunks = text_splitter.create_documents(document_content, metadatas=document_metadata)" +   ] }, { "cell_type": "markdown", +   "metadata": { +    "id": "3wm91wQnB-Xd" +   }, "source": [ "#### Create queries\n", "\n", "You can utilize Gemini on Vertex AI to produce hypothetical questions that are relevant to a given piece of context (chunk). This approach enables the generation of synthetic positive pairs of (query, relevant documents) in a scalable manner." -   ], -   "metadata": { -    "id": "3wm91wQnB-Xd" -   } +   ] }, { "cell_type": "code", -   "source": [ -    "generated_queries = [generate_queries(chuck = chuck,\n", -    "                  num_questions=3) for chuck in chunks]" -   ], +   "execution_count": null, "metadata": { "id": "LsnSHzZBCCij" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "generated_queries = [generate_queries(chuck=chuck, num_questions=3) for chuck in chunks]" +   ] }, { "cell_type": "markdown", -   "source": [ -    "#### Create the tuning training and test dataset files." -   ], "metadata": { "id": "x2XYblPb8Uvy" -   } +   }, +   "source": [ +    "#### Create the tuning training and test dataset files." +   ] }, { "cell_type": "markdown", -   "source": [ -    "Create the `corpus` file." -   ], "metadata": { "id": "b78Kh1K0vP3s" -   } +   }, +   "source": [ +    "Create the `corpus` file." +   ] }, { "cell_type": "code", -   "source": [ -    "corpus_df = pd.DataFrame()\n", -    "corpus_df[\"_id\"] = ['text_' + str(idx) for idx in range(len(generated_queries))]\n", -    "corpus_df[\"text\"] = [chuck.page_content for chuck in chunks]\n", -    "corpus_df[\"doc_id\"] = [chuck.metadata['page'] for chuck in chunks]" -   ], +   "execution_count": null, "metadata": { "id": "Zbl6sB3-8YdP" }, -   "execution_count": null, -   "outputs": [] -  }, -  { -   "cell_type": "code", +   "outputs": [], "source": [ -    "corpus_df.head(10)" -   ], +    "corpus_df = pd.DataFrame()\n", +    "corpus_df[\"_id\"] = [\"text_\" + str(idx) for idx in range(len(generated_queries))]\n", +    "corpus_df[\"text\"] = [chuck.page_content for chuck in chunks]\n", +    "corpus_df[\"doc_id\"] = [chuck.metadata[\"page\"] for chuck in chunks]" +   ] }, { "cell_type": "code", +   "execution_count": null, "metadata": { "id": "dDCRCw-buq8U" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "corpus_df.head(10)" +   ] }, { "cell_type": "markdown", -   "source": [ -    "Create the `query` file." -   ], "metadata": { "id": "_tu0Lx7FvVJe" -   } +   }, +   "source": [ +    "Create the `query` file." 
+ ] }, { "cell_type": "code", - "source": [ - "query_df = pd.DataFrame()\n", - "query_df[\"_id\"] = ['query_' + str(idx) for idx in range(len(generated_queries))]\n", - "query_df[\"text\"] = [query.page_content for query in generated_queries]\n", - "query_df[\"doc_id\"] = [query.metadata['page'] for query in generated_queries]" - ], + "execution_count": null, "metadata": { "id": "Fu1fFhdkrCoq" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "query_df = pd.DataFrame()\n", + "query_df[\"_id\"] = [\"query_\" + str(idx) for idx in range(len(generated_queries))]\n", + "query_df[\"text\"] = [query.page_content for query in generated_queries]\n", + "query_df[\"doc_id\"] = [query.metadata[\"page\"] for query in generated_queries]" + ] }, { "cell_type": "code", - "source": [ - "query_df.head(10)" - ], + "execution_count": null, "metadata": { "id": "Uo4yaw4Xu8wJ" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "query_df.head(10)" + ] }, { "cell_type": "markdown", - "source": [ - "Create the `score` file." - ], "metadata": { "id": "Iv9dcRqFvYN-" - } + }, + "source": [ + "Create the `score` file." + ] }, { "cell_type": "code", - "source": [ - "score_df = corpus_df.merge(query_df, on='doc_id')\n", - "score_df = score_df.rename(columns={'_id_x': 'corpus-id', '_id_y': 'query-id'})\n", - "score_df = score_df.drop(columns=['doc_id', 'text_x', 'text_y'])\n", - "score_df['score'] = 1\n", - "train_df = score_df.sample(frac=0.8)\n", - "test_df = score_df.drop(train_df.index)" - ], + "execution_count": null, "metadata": { "id": "NMDpCp06wAcX" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "score_df = corpus_df.merge(query_df, on=\"doc_id\")\n", + "score_df = score_df.rename(columns={\"_id_x\": \"corpus-id\", \"_id_y\": \"query-id\"})\n", + "score_df = score_df.drop(columns=[\"doc_id\", \"text_x\", \"text_y\"])\n", + "score_df[\"score\"] = 1\n", + "train_df = score_df.sample(frac=0.8)\n", + "test_df = score_df.drop(train_df.index)" + ] }, { "cell_type": "code", - "source": [ - "train_df.head(10)" - ], + "execution_count": null, "metadata": { "id": "Tc-JAxKwxoar" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "train_df.head(10)" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "YGgY3L-CyhAQ" + }, "source": [ "#### Save the tuning dataset\n", "\n", "Upload the model tuning datasets to a Cloud Storage bucket." 
- ], - "metadata": { - "id": "YGgY3L-CyhAQ" - } + ] }, { "cell_type": "code", - "source": [ - "corpus_df.to_json(f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/corpus.jsonl\", orient='records', lines=True)\n", - "query_df.to_json(f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/query.jsonl\", orient='records', lines=True)\n", - "train_df.to_csv(f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/train.tsv\", sep='\\t', header=True, index=False)\n", - "test_df.to_csv(f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/test.tsv\", sep='\\t', header=True, index=False)" - ], + "execution_count": null, "metadata": { "id": "HHiC-Qg8yorH" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "corpus_df.to_json(\n", + " f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/corpus.jsonl\",\n", + " orient=\"records\",\n", + " lines=True,\n", + ")\n", + "query_df.to_json(\n", + " f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/query.jsonl\", orient=\"records\", lines=True\n", + ")\n", + "train_df.to_csv(\n", + " f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/train.tsv\",\n", + " sep=\"\\t\",\n", + " header=True,\n", + " index=False,\n", + ")\n", + "test_df.to_csv(\n", + " f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/test.tsv\",\n", + " sep=\"\\t\",\n", + " header=True,\n", + " index=False,\n", + ")" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "6p9NR9I31fo6" + }, "source": [ "### Run an embedding tuning job on Vertex AI Pipelines\n", "\n", "Next, set the tuning pipeline parameters including the Cloud Storage bucket paths with train and test datasets, the training batch size and the number of steps to perform model tuning. For more information about pipeline parameters, [check](https://cloud.google.com/vertex-ai/generative-ai/docs/models/tune-embeddings#create-embedding-tuning-job) the official tuning documentation." 
- ], - "metadata": { - "id": "6p9NR9I31fo6" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YePRoZg31iSJ" + }, + "outputs": [], "source": [ "ITERATIONS = len(train_df) // BATCH_SIZE\n", "\n", "params = {\n", - " 'batch_size': BATCH_SIZE,\n", - " 'iterations': ITERATIONS,\n", - " 'accelerator_type': TRAINING_ACCELERATOR_TYPE,\n", - " 'machine_type': TRAINING_MACHINE_TYPE,\n", - " 'base_model_version_id': 'textembedding-gecko@003',\n", - " 'queries_path': f'{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/query.jsonl',\n", - " 'corpus_path': f'{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/corpus.jsonl',\n", - " 'train_label_path': f'{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/train.tsv',\n", - " 'test_label_path': f'{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/test.tsv',\n", - " 'project': PROJECT_ID,\n", - " 'location': REGION,\n", + " \"batch_size\": BATCH_SIZE,\n", + " \"iterations\": ITERATIONS,\n", + " \"accelerator_type\": TRAINING_ACCELERATOR_TYPE,\n", + " \"machine_type\": TRAINING_MACHINE_TYPE,\n", + " \"base_model_version_id\": \"textembedding-gecko@003\",\n", + " \"queries_path\": f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/query.jsonl\",\n", + " \"corpus_path\": f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/corpus.jsonl\",\n", + " \"train_label_path\": f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/train.tsv\",\n", + " \"test_label_path\": f\"{PROCESSED_DATA_TUNING_URI}/{TIMESTAMP}/test.tsv\",\n", + " \"project\": PROJECT_ID,\n", + " \"location\": REGION,\n", "}\n", "\n", - "template_uri = 'https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.1'" - ], - "metadata": { - "id": "YePRoZg31iSJ" - }, - "execution_count": null, - "outputs": [] + "template_uri = \"https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.1\"" + ] }, { "cell_type": "markdown", - "source": [ - "Run the model tuning pipeline job." - ], "metadata": { "id": "0rN_XWFjxWZn" - } + }, + "source": [ + "Run the model tuning pipeline job." + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "m7JEoMT-1mAC" + }, + "outputs": [], "source": [ "job = aiplatform.PipelineJob(\n", - " display_name='tune-text-embedding',\n", + " display_name=\"tune-text-embedding\",\n", " parameter_values=params,\n", " template_path=template_uri,\n", " pipeline_root=PIPELINE_ROOT,\n", " project=PROJECT_ID,\n", " location=REGION,\n", ")" - ], - "metadata": { - "id": "m7JEoMT-1mAC" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "job.run()" - ], + "execution_count": null, "metadata": { "id": "ExF5xlj0uBjK" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "job.run()" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "UJCufZmS7SVM" + }, "source": [ "### Evaluate the tuned model\n", "\n", "Evaluate the tuned embedding model. The Vertex AI Pipeline automatically produces NDCG (Normalized Discounted Cumulative Gain) for both training and test datasets. 
NDCG measures ranking effectiveness, taking into account the position of relevant items in the ranked list.\n" -   ], -   "metadata": { -    "id": "UJCufZmS7SVM" -   } +   ] }, { "cell_type": "code", -   "source": [ -    "metric_df = get_metrics(job)" -   ], +   "execution_count": null, "metadata": { "id": "eldc535Y7xmD" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "metric_df = get_metrics(job)" +   ] }, { "cell_type": "code", -   "source": [ -    "metric_df.to_dict()" -   ], +   "execution_count": null, "metadata": { "id": "W-s_AEuoaKSd" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "metric_df.to_dict()" +   ] }, { "cell_type": "code", -   "source": [ -    "metric_df" -   ], +   "execution_count": null, "metadata": { "id": "qMBUmPFgBRWi" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "metric_df" +   ] }, { "cell_type": "markdown", +   "metadata": { +    "id": "ZP6nCnJ2_6wU" +   }, "source": [ "### Deploy the embedding tuned model on Vertex AI Prediction\n", "\n", "To deploy the embedding tuned model, you need to create a Vertex AI Endpoint.\n", "\n", "Then you deploy the tuned embedding model to the endpoint." -   ], -   "metadata": { -    "id": "ZP6nCnJ2_6wU" -   } +   ] }, { "cell_type": "markdown", -   "source": [ -    "#### Create the endpoint" -   ], "metadata": { "id": "d5LtEGEbAHPd" -   } +   }, +   "source": [ +    "#### Create the endpoint" +   ] }, { "cell_type": "code", -   "source": [ -    "endpoint = aiplatform.Endpoint.create(\n", -    "    display_name='tuned_custom_embedding_endpoint',\n", -    "    description='Endpoint for tuned model embeddings.',\n", -    "    project=PROJECT_ID,\n", -    "    location=REGION,\n", -    "    )" -   ], +   "execution_count": null, "metadata": { "id": "Q2KRaRzHAF8f" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "endpoint = aiplatform.Endpoint.create(\n", +    "    display_name=\"tuned_custom_embedding_endpoint\",\n", +    "    description=\"Endpoint for tuned model embeddings.\",\n", +    "    project=PROJECT_ID,\n", +    "    location=REGION,\n", +    ")" +   ] }, { "cell_type": "markdown", -   "source": [ -    "#### Deploy the tuned model" -   ], "metadata": { "id": "xGLjbFY-AYi3" -   } +   }, +   "source": [ +    "#### Deploy the tuned model" +   ] }, { "cell_type": "markdown", -   "source": [ -    "Get the tuned model." -   ], "metadata": { "id": "mdfedMEj1ZNy" -   } +   }, +   "source": [ +    "Get the tuned model." +   ] }, { "cell_type": "code", -   "source": [ -    "model = get_uploaded_model(job)" -   ], +   "execution_count": null, "metadata": { "id": "ndE5WGVkBQA2" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "model = get_uploaded_model(job)" +   ] }, { "cell_type": "markdown", -   "source": [ -    "Deploy the tuned model to the endpoint." -   ], "metadata": { "id": "CqkAAzvV1ewQ" -   } +   }, +   "source": [ +    "Deploy the tuned model to the endpoint." +   ] }, { "cell_type": "code", +   "execution_count": null, +   "metadata": { +    "id": "nOZyyPALDcSP" +   }, +   "outputs": [], "source": [ "endpoint.deploy(\n", "    model,\n", @@ -1270,101 +1283,94 @@ "    accelerator_count=PREDICTION_ACCELERATOR_COUNT,\n", "    machine_type=PREDICTION_MACHINE_TYPE,\n", ")" -   ], -   "metadata": { -    "id": "nOZyyPALDcSP" -   }, -   "execution_count": null, -   "outputs": [] +   ] }, { "cell_type": "markdown", +   "metadata": { +    "id": "8VVac8ah2DzJ" +   }, "source": [ "### Retrieve similar items using the tuned embedding model\n", "\n", "To retrieve similar items using the tuned embedding model, you need both the corpus text and the generated embeddings. 
Given a query, you calculate its embedding with the tuned model and then apply a similarity function to find the most relevant documents with respect to the query. " -   ], -   "metadata": { -    "id": "8VVac8ah2DzJ" -   } +   ] }, { "cell_type": "markdown", -   "source": [ -    "Read the corpus text and the generated embeddings." -   ], "metadata": { "id": "L2ohcM4a6mOx" -   } +   }, +   "source": [ +    "Read the corpus text and the generated embeddings." +   ] }, { "cell_type": "code", -   "source": [ -    "training_output_dir = get_training_output_dir(job)" -   ], +   "execution_count": null, "metadata": { "id": "C5O2WP_i1z3I" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "training_output_dir = get_training_output_dir(job)" +   ] }, { "cell_type": "code", -   "source": [ -    "corpus_text = get_df_from_jsonl(\n", -    "    epath.Path(training_output_dir) / 'corpus_text.jsonl'\n", -    ")\n", -    "\n", -    "corpus_text.head()" -   ], +   "execution_count": null, "metadata": { "id": "TJcTLpV263tK" }, -   "execution_count": null, -   "outputs": [] +   "outputs": [], +   "source": [ +    "corpus_text = get_df_from_jsonl(epath.Path(training_output_dir) / \"corpus_text.jsonl\")\n", +    "\n", +    "corpus_text.head()" +   ] }, { "cell_type": "code", +   "execution_count": null, +   "metadata": { +    "id": "3g-8ECt9VU_P" +   }, +   "outputs": [], "source": [ "corpus_embeddings = get_df_from_jsonl(\n", -    "    epath.Path(training_output_dir) / 'corpus_custom.jsonl'\n", +    "    epath.Path(training_output_dir) / \"corpus_custom.jsonl\"\n", ")\n", "\n", "corpus_embeddings.head()" -   ], -   "metadata": { -    "id": "3g-8ECt9VU_P" -   }, -   "execution_count": null, -   "outputs": [] +   ] }, { "cell_type": "markdown", -   "source": [ -    "Find the most relevant documents for each query." -   ], "metadata": { "id": "6Dwbak5t670Y" -   } +   }, +   "source": [ +    "Find the most relevant documents for each query." +   ] }, { "cell_type": "code", +   "execution_count": null, +   "metadata": { +    "id": "IV9d2afP11em" +   }, +   "outputs": [], "source": [ "queries = [\n", -    "    '''What about the revenues?''',\n", -    "    '''Who is Alphabet?''',\n", -    "    '''What about the costs?''',\n", +    "    \"\"\"What about the revenues?\"\"\",\n", +    "    \"\"\"Who is Alphabet?\"\"\",\n", +    "    \"\"\"What about the costs?\"\"\",\n", "]\n", "output = get_topk_documents(queries, corpus_text, corpus_embeddings, k=10)\n", "\n", -    "with pd.option_context('display.max_colwidth', 200):\n", -    "  display(output)" -   ], "metadata": { "id": "IV9d2afP11em" }, -   "execution_count": null, -   "outputs": [] +    "with pd.option_context(\"display.max_colwidth\", 200):\n", +    "    display(output)" +   ] }, { "cell_type": "markdown", @@ -1397,15 +1403,15 @@ "\n", "# Delete endpoint resource\n", "if delete_endpoint or os.getenv(\"IS_TESTING\"):\n", -    "  endpoint.delete()\n", +    "    endpoint.delete()\n", "\n", "# Delete model resource\n", "if delete_model or os.getenv(\"IS_TESTING\"):\n", -    "  model.delete()\n", +    "    model.delete()\n", "\n", "# Delete pipeline job\n", "if delete_job or os.getenv(\"IS_TESTING\"):\n", -    "  job.delete()\n", +    "    job.delete()\n", "\n", "# Delete Cloud Storage objects that were created\n", "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", @@ -1415,7 +1421,7 @@ ], "metadata": { "colab": { -   "provenance": [], +   "name": "get_started_with_embedding_tuning.ipynb", "toc_visible": true }, "kernelspec": { @@ -1425,4 +1431,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +}