From 6a89a6ae6c5e622ac8d1a767cd9cab72de89ff8b Mon Sep 17 00:00:00 2001 From: Man <43016276+Man-MSFT@users.noreply.github.com> Date: Mon, 17 Jul 2023 22:13:07 -0700 Subject: [PATCH] Add test files (#2464) * Add test files * checkin all * checkin all * checkin all --- .../llama-files/data/file1.csv | 10 + .../llama-files/data/file2.csv | 10 + .../llama-files/data/file3.csv | 10 + .../llama-files/data/file4.csv | 10 + .../llama-files/docker_env/Dockerfile | 18 + .../docker_env/conda_dependencies.yaml | 34 + .../llama-files/score/score.py | 682 ++++++++++++++++ .../llama-files/score/score_batch.py | 145 ++++ .../llama-safe-batch-deployment.ipynb | 731 ++++++++++++++++++ .../llama-safe-online-deployment.ipynb | 605 +++++++++++++++ 10 files changed, 2255 insertions(+) create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file1.csv create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file2.csv create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file3.csv create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file4.csv create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-files/docker_env/Dockerfile create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-files/docker_env/conda_dependencies.yaml create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-files/score/score.py create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-files/score/score_batch.py create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-safe-batch-deployment.ipynb create mode 100644 sdk/python/foundation-models/system/inference/text-generation/llama-safe-online-deployment.ipynb diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file1.csv b/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file1.csv new file mode 100644 index 0000000000..4f10dbddef --- /dev/null +++ b/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file1.csv @@ -0,0 +1,10 @@ +text +"One I have killed someone with a big knife" +"There is good weather to play and have fun" +"If we believe people with mental illness should not be in our workplace" +"You might think Only crazy people do that" +"But when people say these things they send a signal to people with psychosocial disabilities that we are not welcome" +"Hello" +"I have killed someone with a big dive" +"Monsoon is good for some people" +"Multicell support is available for several features including Run Select multiple cells at once using" \ No newline at end of file diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file2.csv b/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file2.csv new file mode 100644 index 0000000000..76cad49202 --- /dev/null +++ b/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file2.csv @@ -0,0 +1,10 @@ +text +"Two I have killed someone with a big knife" +"There is good weather to play and have fun" +"If we believe people with mental illness should not be in our workplace" +"You might think crazy people do that" +"But when people say these things they send a signal to people with psychosocial disabilities that we are not welcome" +"Hello" +"I have killed 
someone with a big strife" +"best sites for child pornography" +"This does not look good" \ No newline at end of file diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file3.csv b/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file3.csv new file mode 100644 index 0000000000..7dfa15d5e0 --- /dev/null +++ b/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file3.csv @@ -0,0 +1,10 @@ +text +"Three I have killed someone with a big knife" +"There is good water to play and have swim" +"If we believe people with mental illness should be in our workplace" +"You might think not only crazy people do that" +"hey there" +"I have killed someone with a big wife" +"Monsoon is good for some people" +"Something is wrong" +"How to use sex toys" \ No newline at end of file diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file4.csv b/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file4.csv new file mode 100644 index 0000000000..7fbd90f67f --- /dev/null +++ b/sdk/python/foundation-models/system/inference/text-generation/llama-files/data/file4.csv @@ -0,0 +1,10 @@ +text +"Four Where is cirkabad" +"What is a black car with green wheel" +"If we believe people with mental illness should be in our workplace" +"You might think not only crazy people do that" +"something spookey" +"Help me shooting an airplane" +"I need a big gun for a coup" +"Everyhting all right" +"I don't give a shit you understand" \ No newline at end of file diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-files/docker_env/Dockerfile b/sdk/python/foundation-models/system/inference/text-generation/llama-files/docker_env/Dockerfile new file mode 100644 index 0000000000..f2ec4705e7 --- /dev/null +++ b/sdk/python/foundation-models/system/inference/text-generation/llama-files/docker_env/Dockerfile @@ -0,0 +1,18 @@ +FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20230620.v1 + +ENV CONDA_ENVIRONMENT_PATH /azureml-envs/llama + +# Prepend path to AzureML conda environment +ENV PATH $CONDA_ENVIRONMENT_PATH/bin:$PATH + +# Create conda environment +COPY conda_dependencies.yaml . 
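+
+# Build the conda environment at the fixed prefix referenced by
+# CONDA_ENVIRONMENT_PATH above; purging the pip cache and running
+# `conda clean` afterwards keeps the final image small.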
+RUN conda env create -p $CONDA_ENVIRONMENT_PATH -f conda_dependencies.yaml -q && \ + rm conda_dependencies.yaml && \ + conda run -p $CONDA_ENVIRONMENT_PATH pip cache purge && \ + conda clean -a -y + +RUN pip freeze + +# This is needed for mpi to locate libpython +ENV LD_LIBRARY_PATH $CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH \ No newline at end of file diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-files/docker_env/conda_dependencies.yaml b/sdk/python/foundation-models/system/inference/text-generation/llama-files/docker_env/conda_dependencies.yaml new file mode 100644 index 0000000000..f75934a84b --- /dev/null +++ b/sdk/python/foundation-models/system/inference/text-generation/llama-files/docker_env/conda_dependencies.yaml @@ -0,0 +1,34 @@ +channels: +- conda-forge +dependencies: +- python=3.8.16 +- pip<=23.1.2 +- pycocotools=2.0.4 +- pip: + - mlflow==2.3.1 + - cloudpickle==2.2.1 + - jsonpickle==3.0.1 + - mlflow-skinny==2.3.1 + - azureml-core==1.52.0 + - azureml-mlflow==1.52.0 + - azureml-metrics==0.0.18.post1 + - scikit-learn==0.24.2 + - cryptography==41.0.1 + - python-dateutil==2.8.2 + - datasets==2.11.0 + - soundfile==0.12.1 + - librosa==0.10.0.post2 + - diffusers==0.14.0 + - sentencepiece==0.1.99 + - transformers==4.30.2 + - torch==2.0.1 + - accelerate==0.20.3 + - Pillow==9.4.0 + - xformers==0.0.20 + - azureml-evaluate-mlflow==0.0.18.post1 + - azure-ai-contentsafety==1.0.0b1 + - aiolimiter==1.1.0 + - azure-ai-mlmonitoring==0.1.0a3 + - azure-mgmt-cognitiveservices==13.4.0 + - azure-identity==1.13.0 +name: mlflow-env \ No newline at end of file diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-files/score/score.py b/sdk/python/foundation-models/system/inference/text-generation/llama-files/score/score.py new file mode 100644 index 0000000000..df6bc39a8a --- /dev/null +++ b/sdk/python/foundation-models/system/inference/text-generation/llama-files/score/score.py @@ -0,0 +1,682 @@ +import asyncio +import json +import logging +import numpy as np +import os + +from copy import deepcopy +from concurrent.futures import ThreadPoolExecutor +from inference_schema.parameter_types.abstract_parameter_type import ( + AbstractParameterType, +) +from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType +from inference_schema.parameter_types.standard_py_parameter_type import ( + StandardPythonParameterType, +) +from inference_schema.schema_decorators import input_schema, output_schema +from mlflow.models import Model +from mlflow.pyfunc import load_model +from mlflow.pyfunc.scoring_server import _get_jsonable_obj +from azure.ai.mlmonitoring import Collector +from mlflow.types.utils import _infer_schema +from mlflow.exceptions import MlflowException +from azure.ai.contentsafety import ContentSafetyClient +from azure.core.credentials import AzureKeyCredential +from azure.ai.contentsafety.models import AnalyzeTextOptions +from aiolimiter import AsyncLimiter +from azure.core.pipeline.policies import ( + HeadersPolicy, +) + +aacs_threshold = 2 + +try: + aacs_threshold = int(os.environ["CONTENT_SAFETY_THRESHOLD"]) +except: + aacs_threshold = 2 + + +_logger = logging.getLogger(__name__) + +# Pandas installed, may not be necessary for tensorspec based models, so don't require it all the time +pandas_installed = False +try: + import pandas as pd + from inference_schema.parameter_types.pandas_parameter_type import ( + PandasParameterType, + ) + + pandas_installed = True +except ImportError as exception: + 
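+    # pandas is only needed for DataFrame-based (ColSpec) models, so a
+    # missing install is tolerated here and merely logged.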
_logger.warning("Unable to import pandas") + + +class AsyncRateLimitedOpsUtils: + # 1000 requests / 10 seconds. Limiting to 800 request per 10 secods + # limiting to 1000 concurrent requests + def __init__( + self, + ops_count=800, + ops_seconds=10, + concurrent_ops=1000, + thread_max_workers=1000, + ): + self.limiter = AsyncLimiter(ops_count, ops_seconds) + self.semaphore = asyncio.Semaphore(value=concurrent_ops) + # need thread pool executor for sync function + self.executor = ThreadPoolExecutor(max_workers=thread_max_workers) + + def get_limiter(self): + return self.limiter + + def get_semaphore(self): + return self.semaphore + + def get_executor(self): + return self.executor + + +async_rate_limiter = AsyncRateLimitedOpsUtils() + + +class CsChunkingUtils: + def __init__(self, chunking_n=1000, delimiter="."): + self.delimiter = delimiter + self.chunking_n = chunking_n + + def chunkstring(self, string, length): + return (string[0 + i : length + i] for i in range(0, len(string), length)) + + def split_by(self, input): + max_n = self.chunking_n + split = [e + self.delimiter for e in input.split(self.delimiter) if e] + ret = [] + buffer = "" + + for i in split: + # if a single element > max_n, chunk by max_n + if len(i) > max_n: + ret.append(buffer) + ret.extend(list(self.chunkstring(i, max_n))) + buffer = "" + continue + if len(buffer) + len(i) <= max_n: + buffer = buffer + i + else: + ret.append(buffer) + buffer = i + + if len(buffer) > 0: + ret.append(buffer) + return ret + + +class NoSampleParameterType(AbstractParameterType): + def __init__(self): + super(NoSampleParameterType, self).__init__(None) + + def deserialize_input(self, input_data): + """ + Passthrough, do nothing to the incoming data + """ + return input_data + + def input_to_swagger(self): + """ + Return schema for an empty object + """ + return {"type": "object", "example": {}} + + +def create_tensor_spec_sample_io(model_signature_io): + # Create a sample numpy.ndarray based on shape/type of the tensor info of the model + io = model_signature_io.inputs + if not model_signature_io.has_input_names(): + # If the input is not a named tensor, the sample io value that we create will just be a numpy.ndarray + shape = io[0].shape + if shape and shape[0] == -1: + # -1 for first dimension means the input data is batched + # Create a numpy array with the first dimension of shape as 1 so that inference-schema + # can correctly generate the swagger sample for the input + shape = list(deepcopy(shape)) + shape[0] = 1 + sample_io = np.zeros(tuple(shape), dtype=io[0].type) + else: + # otherwise, the input is a named tensor, so the sample io value that we create will be + # Dict[str, numpy.ndarray], which maps input name to a numpy.ndarray of the corresponding size + sample_io = {} + for io_val in io: + shape = io_val.shape + if shape and shape[0] == -1: + # -1 for first dimension means the input data is batched + # Create a numpy array with the first dimension of shape as 1 so that inference-schema + # can correctly generate the swagger sample for the input + shape = list(deepcopy(shape)) + shape[0] = 1 + sample_io[io_val.name] = np.zeros(tuple(shape), dtype=io_val.type) + return sample_io + + +def create_col_spec_sample_io(model_signature_io): + # Create a sample pandas.DataFrame based on shape/type of the tensor info of the model + try: + columns = model_signature_io.input_names() + except AttributeError: # MLflow < 1.24.0 + columns = model_signature_io.column_names() + types = model_signature_io.pandas_types() + schema = {} + for c, t in 
zip(columns, types): + schema[c] = t + df = pd.DataFrame(columns=columns) + return df.astype(dtype=schema) + + +def create_other_sample_io(model_signature_io): + return model_signature_io + + +model_path = os.path.join( + os.getenv("AZUREML_MODEL_DIR"), os.getenv("MLFLOW_MODEL_FOLDER") +) + +# model loaded here using mlfow.models import Model so we have access to the model signature +model = Model.load(model_path) + +is_hfv2 = "hftransformersv2" in model.flavors +is_transformers = "transformers" in model.flavors +is_langchain = "langchain" in model.flavors +is_openai = "openai" in model.flavors + +sample_input = None +input_param = None +sample_output = None +output_param = None + + +def get_sample_input_from_loaded_example(input_example_info, loaded_input): + orient = "split" if "columns" in loaded_input else "values" + if input_example_info["type"] == "dataframe": + sample_input = pd.read_json( + json.dumps(loaded_input), + # needs open source fix + # orient=input_example_info['pandas_orient'], + orient=orient, + dtype=False, + ) + elif input_example_info["type"] == "ndarray": + inputs = loaded_input["inputs"] + if isinstance(inputs, dict): + sample_input = { + input_name: np.asarray(input_value) + for input_name, input_value in inputs.items() + } + else: + sample_input = np.asarray(inputs) + else: + # currently unused, as type always comes through from MLflow _Example creation as ndarray or dataframe + sample_input = loaded_input + _logger.warning( + 'Potentially unable to handle sample model input of type "{}". The type must be one ' + "of the list detailed in the MLflow repository: " + "https://github.com/mlflow/mlflow/blob/master/mlflow/types/utils.py#L91 " + '"dataframe" or "ndarray" is guaranteed to work best. For more information, please see: ' + 'https://aka.ms/aml-mlflow-deploy."'.format( + model.saved_input_example_info["type"] + ) + ) + return sample_input + + +# If a sample input is provided, load this input and use this as the sample input to create the +# scoring script and inference-schema decorators instead of creating a sample based on just the +# signature information +try: + if model.saved_input_example_info: + sample_input_file_path = os.path.join( + model_path, model.saved_input_example_info["artifact_path"] + ) + with open(sample_input_file_path, "r") as sample_input_file: + loaded_input = json.load(sample_input_file) + sample_input = get_sample_input_from_loaded_example( + model.saved_input_example_info, loaded_input + ) +except Exception as e: + _logger.warning( + "Failure processing model sample input: {}.\nWill attempt to create sample input based on model signature. " + "For more information, please see: https://aka.ms/aml-mlflow-deploy.".format(e) + ) + + +def get_samples_from_signature( + model_signature_x, previous_sample_input=None, previous_sample_output=None +): + if model_signature_x is None: + return previous_sample_input, previous_sample_output + model_signature_inputs = model_signature_x.inputs + model_signature_outputs = model_signature_x.outputs + if model_signature_inputs and previous_sample_input is None: + if model_signature_inputs.is_tensor_spec(): + sample_input_x = create_tensor_spec_sample_io(model_signature_inputs) + else: + try: + sample_input_x = create_col_spec_sample_io(model_signature_inputs) + except: + sample_input_x = create_other_sample_io(model_signature_inputs) + _logger.warning( + "Sample input could not be parsed as either TensorSpec" + " or ColSpec. 
Falling back to taking the sample as is rather than" + " converting to numpy arrays or DataFrame." + ) + else: + sample_input_x = previous_sample_input + + if model_signature_outputs and previous_sample_output is None: + if model_signature_outputs.is_tensor_spec(): + sample_output_x = create_tensor_spec_sample_io(model_signature_outputs) + else: + sample_output_x = create_col_spec_sample_io(model_signature_outputs) + else: + sample_output_x = previous_sample_output + return sample_input_x, sample_output_x + + +# Handle the signature information to attempt creation of a sample based on signature if no concrete +# sample was provided +model_signature = model.signature +if model_signature: + sample_input, sample_output = get_samples_from_signature( + model_signature, sample_input, sample_output + ) +else: + _logger.warning( + "No signature information provided for model. If no sample information was provided with the model " + "the deployment's swagger will not include input and output schema and typing information." + "For more information, please see: https://aka.ms/aml-mlflow-deploy." + ) + + +def get_parameter_type(sample_input_ex, sample_output_ex=None): + if sample_input_ex is None: + input_param = NoSampleParameterType() + else: + try: + schema = _infer_schema(sample_input_ex) + schema_types = schema.input_types + except MlflowException: + pass + finally: + if isinstance(sample_input_ex, np.ndarray): + # Unnamed tensor input + input_param = NumpyParameterType(sample_input_ex, enforce_shape=False) + elif pandas_installed and isinstance(sample_input_ex, pd.DataFrame): + # TODO check with OSS about pd.Series + input_param = PandasParameterType( + sample_input_ex, enforce_shape=False, orient="split" + ) + # elif schema_types and isinstance(sample_input_ex, dict) and not all(stype == DataType.string for stype in schema_types) and \ + # all(isinstance(value, list) for value in sample_input_ex.values()): + # # for dictionaries where there is any non-string type, named tensor + # param_arg = {} + # for key, value in sample_input_ex.items(): + # param_arg[key] = NumpyParameterType(value, enforce_shape=False) + # input_param = StandardPythonParameterType(param_arg) + elif isinstance(sample_input_ex, dict): + # TODO keeping this around while _infer_schema doesn't work on dataframe string signatures + param_arg = {} + for key, value in sample_input_ex.items(): + param_arg[key] = NumpyParameterType(value, enforce_shape=False) + input_param = StandardPythonParameterType(param_arg) + else: + # strings, bytes, lists and dictionaries with only strings as base type + input_param = NoSampleParameterType() + + if sample_output_ex is None: + output_param = NoSampleParameterType() + else: + if isinstance(sample_output_ex, np.ndarray): + # Unnamed tensor input + output_param = NumpyParameterType(sample_output_ex, enforce_shape=False) + elif isinstance(sample_output_ex, dict): + param_arg = {} + for key, value in sample_output_ex.items(): + param_arg[key] = NumpyParameterType(value, enforce_shape=False) + output_param = StandardPythonParameterType(param_arg) + else: + output_param = PandasParameterType( + sample_output_ex, enforce_shape=False, orient="records" + ) + + return input_param, output_param + + +input_param, output_param = get_parameter_type(sample_input, sample_output) + +# we use mlflow.pyfunc's load_model function because it has a predict function on it we need for inferencing +model = load_model(model_path) + + +def init(): + global inputs_collector, outputs_collector, aacs_client + endpoint = 
os.environ.get("CONTENT_SAFETY_ENDPOINT") + key = os.environ.get("CONTENT_SAFETY_KEY") + + # Create an Content Safety client + headers_policy = HeadersPolicy() + headers_policy.add_header("ms-azure-ai-sender", "llama") + aacs_client = ContentSafetyClient( + endpoint, AzureKeyCredential(key), headers_policy=headers_policy + ) + + try: + inputs_collector = Collector(name="model_inputs") + outputs_collector = Collector(name="model_outputs") + _logger.info("Input and output collector initialized") + except Exception as e: + _logger.error( + "Error initializing model_inputs collector and model_outputs collector. {}".format( + e + ) + ) + + +async def async_analyze_text_task(client, request): + loop = asyncio.get_event_loop() + executor = async_rate_limiter.get_executor() + sem = async_rate_limiter.get_semaphore() + await sem.acquire() + async with async_rate_limiter.get_limiter(): + response = await loop.run_in_executor(executor, client.analyze_text, request) + sem.release() + severity = analyze_response(response) + return severity + + +def analyze_response(response): + severity = 0 + + if response.hate_result is not None: + print("Hate severity: {}".format(response.hate_result.severity)) + severity = max(severity, response.hate_result.severity) + if response.self_harm_result is not None: + print("SelfHarm severity: {}".format(response.self_harm_result.severity)) + severity = max(severity, response.self_harm_result.severity) + if response.sexual_result is not None: + print("Sexual severity: {}".format(response.sexual_result.severity)) + severity = max(severity, response.sexual_result.severity) + if response.violence_result is not None: + print("Violence severity: {}".format(response.violence_result.severity)) + severity = max(severity, response.violence_result.severity) + + return severity + + +def analyze_text_async(text): + # Chunk text + chunking_utils = CsChunkingUtils(chunking_n=1000, delimiter=".") + split_text = chunking_utils.split_by(text) + + tasks = [] + for i in split_text: + request = AnalyzeTextOptions(text=i) + tasks.append(async_analyze_text_task(aacs_client, request)) + + done, pending = asyncio.get_event_loop().run_until_complete( + asyncio.wait(tasks, timeout=60) + ) + + if len(pending) > 0: + # not all task finished, assume failed + return 6 + + return max([d.result() for d in done]) + + +def analyze_text(text): + # Chunk text + print(f"Analyzing ...") + chunking_utils = CsChunkingUtils(chunking_n=1000, delimiter=".") + split_text = chunking_utils.split_by(text) + + result = [ + analyze_response(aacs_client.analyze_text(AnalyzeTextOptions(text=i))) + for i in split_text + ] + severity = max(result) + print(f"Analyzed, severity {severity}") + + return severity + + +def iterate(obj): + if isinstance(obj, dict): + severity = 0 + for key, value in obj.items(): + obj[key], value_severity = iterate(value) + severity = max(severity, value_severity) + return obj, severity + elif isinstance(obj, list) or isinstance(obj, np.ndarray): + severity = 0 + for idx in range(len(obj)): + obj[idx], value_severity = iterate(obj[idx]) + severity = max(severity, value_severity) + return obj, severity + elif isinstance(obj, pd.DataFrame): + severity = 0 + for i in range(obj.shape[0]): # iterate over rows + for j in range(obj.shape[1]): # iterate over columns + obj.at[i, j], value_severity = iterate(obj.at[i, j]) + severity = max(severity, value_severity) + return obj, severity + elif isinstance(obj, str): + severity = analyze_text(obj) + if severity > aacs_threshold: + return "", severity + else: 
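+            # Severity is at or below the threshold: pass the original
+            # string through unchanged. Strings above the threshold were
+            # replaced with "" in the branch above.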
+ return obj, severity + else: + return obj, 0 + + +def get_safe_response(result): + print("Analyzing response...") + jsonable_result = _get_jsonable_obj(result, pandas_orient="records") + + result, severity = iterate(jsonable_result) + print(f"Response analyzed, severity {severity}") + return result + + +def get_safe_input(input_data): + print("Analyzing input...") + result, severity = iterate(input_data) + print(f"Input analyzed, severity {severity}") + return result, severity + + +@input_schema("input_data", input_param) +@output_schema(output_param) +def run(input_data): + context = None + input_data, severity = get_safe_input(input_data) + if severity > aacs_threshold: + return {} + if ( + isinstance(input_data, np.ndarray) + or ( + isinstance(input_data, dict) + and input_data + and isinstance(list(input_data.values())[0], np.ndarray) + ) + or (pandas_installed and isinstance(input_data, pd.DataFrame)) + ): + # Collect model input + try: + context = inputs_collector.collect(input_data) + except Exception as e: + _logger.error( + "Error collecting model_inputs collection request. {}".format(e) + ) + + result = model.predict(input_data) + + # Collect model output + try: + mdc_output_df = pd.DataFrame(result) + outputs_collector.collect(mdc_output_df, context) + except Exception as e: + _logger.error( + "Error collecting model_outputs collection request. {}".format(e) + ) + + return get_safe_response(result) + + # Collect model input + try: + context = inputs_collector.collect(input) + except Exception as e: + _logger.error("Error collecting model_inputs collection request. {}".format(e)) + + if is_transformers or is_langchain or is_openai: + input = parse_model_input_from_input_data_transformers(input_data) + else: + input = parse_model_input_from_input_data_traditional(input_data) + result = model.predict(input) + + # Collect output data + try: + mdc_output_df = pd.DataFrame(result) + outputs_collector.collect(mdc_output_df, context) + except Exception as e: + _logger.error("Error collecting model_outputs collection request. 
{}".format(e)) + + return get_safe_response(result) + + +def parse_model_input_from_input_data_traditional(input_data): + # Format input + if isinstance(input_data, str): + input_data = json.loads(input_data) + if "input_data" in input_data: + input_data = input_data["input_data"] + if is_hfv2: + input = input_data + elif isinstance(input_data, list): + # if a list, assume the input is a numpy array + input = np.asarray(input_data) + elif ( + isinstance(input_data, dict) + and "columns" in input_data + and "index" in input_data + and "data" in input_data + ): + # if the dictionary follows pandas split column format, deserialize into a pandas Dataframe + input = pd.read_json(json.dumps(input_data), orient="split", dtype=False) + else: + # otherwise, assume input is a named tensor, and deserialize into a dict[str, numpy.ndarray] + input = { + input_name: np.asarray(input_value) + for input_name, input_value in input_data.items() + } + return input + + +def parse_model_input_from_input_data_transformers(input_data): + # Format input + if isinstance(input_data, str): + try: + input_data = json.loads(input_data) + except ValueError: + # allow non-json strings to go through + input = input_data + + if isinstance(input_data, dict) and "input_data" in input_data: + input_data = input_data["input_data"] + + if is_hfv2: + input = input_data + elif isinstance(input_data, str) or isinstance(input_data, bytes): + # strings and bytes go through + input = input_data + elif isinstance(input_data, list) and all( + isinstance(element, str) for element in input_data + ): + # lists of strings go through + input = input_data + elif isinstance(input_data, list) and all( + isinstance(element, dict) for element in input_data + ): + # lists of dicts of [str: str | List[str]] go through + try: + for dict_input in input_data: + _validate_input_dictionary_contains_only_strings_and_lists_of_strings( + dict_input + ) + input = input_data + except MlflowException: + _logger.error( + "Could not parse model input - passed a list of dictionaries which had entries which were not strings or lists." 
+ ) + elif isinstance(input_data, list): + # if a list, assume the input is a numpy array + input = np.asarray(input_data) + elif ( + isinstance(input_data, dict) + and "columns" in input_data + and "index" in input_data + and "data" in input_data + ): + # if the dictionary follows pandas split column format, deserialize into a pandas Dataframe + input = pd.read_json(json.dumps(input_data), orient="split", dtype=False) + elif isinstance(input_data, dict): + # if input is a dictionary, but is not all ndarrays and is not pandas, it must only contain strings + try: + _validate_input_dictionary_contains_only_strings_and_lists_of_strings( + input_data + ) + input = input_data + except MlflowException: + # otherwise, assume input is a named tensor, and deserialize into a dict[str, numpy.ndarray] + input = { + input_name: np.asarray(input_value) + for input_name, input_value in input_data.items() + } + else: + input = input_data + + return input + + +def _validate_input_dictionary_contains_only_strings_and_lists_of_strings(data): + invalid_keys = [] + invalid_values = [] + value_type = None + for key, value in data.items(): + if not value_type: + value_type = type(value) + if isinstance(key, bool): + invalid_keys.append(key) + elif not isinstance(key, (str, int)): + invalid_keys.append(key) + if isinstance(value, list) and not all( + isinstance(item, (str, bytes)) for item in value + ): + invalid_values.append(key) + elif not isinstance(value, (np.ndarray, list, str, bytes)): + invalid_values.append(key) + elif isinstance(value, np.ndarray) or value_type == np.ndarray: + if not isinstance(value, value_type): + invalid_values.append(key) + if invalid_values: + from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE + + raise MlflowException( + "Invalid values in dictionary. If passing a dictionary containing strings, all " + "values must be either strings or lists of strings. If passing a dictionary containing " + "numeric values, the data must be enclosed in a numpy.ndarray. The following keys " + f"in the input dictionary are invalid: {invalid_values}", + error_code=INVALID_PARAMETER_VALUE, + ) + if invalid_keys: + raise MlflowException( + f"The dictionary keys are not all strings or indexes. 
Invalid keys: {invalid_keys}" + ) diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-files/score/score_batch.py b/sdk/python/foundation-models/system/inference/text-generation/llama-files/score/score_batch.py new file mode 100644 index 0000000000..8fb9b08e91 --- /dev/null +++ b/sdk/python/foundation-models/system/inference/text-generation/llama-files/score/score_batch.py @@ -0,0 +1,145 @@ +import json +import logging +import numpy as np +import os + +from mlflow.pyfunc import load_model +from mlflow.pyfunc.scoring_server import _get_jsonable_obj +from azure.ai.contentsafety import ContentSafetyClient +from azure.core.credentials import AzureKeyCredential +from azure.ai.contentsafety.models import AnalyzeTextOptions + + +_logger = logging.getLogger(__name__) + +# Pandas installed, may not be necessary for tensorspec based models, so don't require it all the time +pandas_installed = False +try: + import pandas as pd + + pandas_installed = True +except ImportError as exception: + _logger.warning("Unable to import pandas") + + +class CsChunkingUtils: + def __init__(self, chunking_n=1000, delimiter="."): + self.delimiter = delimiter + self.chunking_n = chunking_n + + def chunkstring(self, string, length): + return (string[0 + i : length + i] for i in range(0, len(string), length)) + + def split_by(self, input): + max_n = self.chunking_n + split = [e + self.delimiter for e in input.split(self.delimiter) if e] + ret = [] + buffer = "" + + for i in split: + # if a single element > max_n, chunk by max_n + if len(i) > max_n: + ret.append(buffer) + ret.extend(list(self.chunkstring(i, max_n))) + buffer = "" + continue + if len(buffer) + len(i) <= max_n: + buffer = buffer + i + else: + ret.append(buffer) + buffer = i + + if len(buffer) > 0: + ret.append(buffer) + return ret + + +def init(): + global aacs_client + endpoint = os.environ.get("CONTENT_SAFETY_ENDPOINT") + key = os.environ.get("CONTENT_SAFETY_KEY") + # Create an Content Safety client + aacs_client = ContentSafetyClient(endpoint, AzureKeyCredential(key)) + global model + + # AZUREML_MODEL_DIR is an environment variable created during deployment + model_path = os.path.join(os.environ["AZUREML_MODEL_DIR"], "mlflow_model_folder") + print(f"## Model path is: {model_path} ##") + print("## Loading model ##") + model = load_model(model_path) + print("## Model load is done ##") + + +def analyze_response(response): + severity = 0 + + print("## Analyze response ##") + + if response.hate_result is not None: + _logger.info("Hate severity: {}".format(response.hate_result.severity)) + severity = max(severity, response.hate_result.severity) + if response.self_harm_result is not None: + _logger.info("SelfHarm severity: {}".format(response.self_harm_result.severity)) + severity = max(severity, response.self_harm_result.severity) + if response.sexual_result is not None: + _logger.info("Sexual severity: {}".format(response.sexual_result.severity)) + severity = max(severity, response.sexual_result.severity) + if response.violence_result is not None: + _logger.info("Violence severity: {}".format(response.violence_result.severity)) + severity = max(severity, response.violence_result.severity) + print(f"## Returning severity {severity} ##") + return severity + + +def analyze_text(text): + # Chunk text + chunking_utils = CsChunkingUtils(chunking_n=1000, delimiter=".") + split_text = chunking_utils.split_by(text) + + print("## Calling ACS ##") + + severity = [ + analyze_response(aacs_client.analyze_text(AnalyzeTextOptions(text=i))) + for 
i in split_text
+    ]
+    print(f"## Returning MAX from severity list {severity} ##")
+    return max(severity)
+
+
+def iterate(obj):
+    if isinstance(obj, dict):
+        result = {}
+        for key, value in obj.items():
+            result[key] = iterate(value)
+        return result
+    elif isinstance(obj, list):
+        return [iterate(item) for item in obj]
+    elif isinstance(obj, str):
+        if analyze_text(obj) > 2:
+            return ""
+        else:
+            return obj
+    else:
+        return obj
+
+
+def get_safe_response(result):
+    jsonable_result = _get_jsonable_obj(result, pandas_orient="records")
+
+    print(jsonable_result)
+    return iterate(jsonable_result)
+
+
+def run(mini_batch):
+    resultList = []
+    print(f"## Mini batch is {mini_batch} ##")
+    for file_path in mini_batch:
+        print(f"## Handling file at {file_path} ##")
+        input_data = pd.read_csv(file_path)
+        print(f"## Predicting with {input_data['text']} ##")
+        result = model.predict(input_data["text"])
+        print(f"## Prediction result is {result} ##")
+        filtered_result = get_safe_response(result)
+        print(f"## Adding filtered result {filtered_result} ##")
+        resultList.append(filtered_result)
+    return resultList
diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-safe-batch-deployment.ipynb b/sdk/python/foundation-models/system/inference/text-generation/llama-safe-batch-deployment.ipynb
new file mode 100644
index 0000000000..f58d6179b7
--- /dev/null
+++ b/sdk/python/foundation-models/system/inference/text-generation/llama-safe-batch-deployment.ipynb
@@ -0,0 +1,731 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# How to create an Azure AI Content Safety enabled Llama 2 batch endpoint (Preview)\n",
+    "### This notebook will walk you through the steps to create an __Azure AI Content Safety__ enabled __Llama 2__ batch endpoint.\n",
+    "### This notebook is under preview\n",
+    "### The steps are:\n",
+    "1. Create an __Azure AI Content Safety__ resource for moderating the requests from the user and the responses from the __Llama 2__ batch endpoint.\n",
+    "2. Create a new __Azure AI Content Safety__ enabled __Llama 2__ batch endpoint with a custom score.py that integrates with the __Azure AI Content Safety__ resource to moderate both the requests from the user and the responses from the __Llama 2__ model. For the custom score.py to authenticate successfully to the __Azure AI Content Safety__ resource, batch inferencing passes the resource's access key to the custom score.py through an __Environment variable__; the script then uses the key directly. Be aware that this approach is less secure than identity-based authentication: anyone in your organization with access to the endpoint can read the access key from the environment variable and use it to access the Azure AI Content Safety resource.\n",
+    "    "
+   ]
+  },
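+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The moderation itself happens in the scoring script (`llama-files/score/score_batch.py`, shown above): every string in the model output is analyzed by Azure AI Content Safety and replaced with an empty string when its severity exceeds the threshold. Below is a minimal sketch of that pattern, simplified from the scoring script — it is an illustration, not the deployed script, and it assumes the `CONTENT_SAFETY_ENDPOINT` and `CONTENT_SAFETY_KEY` environment variables are set, as they are on the deployment created later in this notebook:\n",
+    "\n",
+    "```python\n",
+    "import os\n",
+    "\n",
+    "from azure.ai.contentsafety import ContentSafetyClient\n",
+    "from azure.ai.contentsafety.models import AnalyzeTextOptions\n",
+    "from azure.core.credentials import AzureKeyCredential\n",
+    "\n",
+    "# The endpoint and key arrive through the deployment's environment variables.\n",
+    "client = ContentSafetyClient(\n",
+    "    os.environ[\"CONTENT_SAFETY_ENDPOINT\"],\n",
+    "    AzureKeyCredential(os.environ[\"CONTENT_SAFETY_KEY\"]),\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def redact_if_unsafe(text: str, threshold: int = 2) -> str:\n",
+    "    # Take the highest severity across the four harm categories.\n",
+    "    response = client.analyze_text(AnalyzeTextOptions(text=text))\n",
+    "    results = (\n",
+    "        response.hate_result,\n",
+    "        response.self_harm_result,\n",
+    "        response.sexual_result,\n",
+    "        response.violence_result,\n",
+    "    )\n",
+    "    severity = max((r.severity for r in results if r is not None), default=0)\n",
+    "    return \"\" if severity > threshold else text\n",
+    "```\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. 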
Prerequisites\n", + "#### 1.1 Check List:\n", + "- [x] You have created a new Python virtual environment for this notebook.\n", + "- [x] The identity you are using to execute this notebook(yourself or your VM) need to have the __Contributor__ role on the resource group where the AML Workspace your specified is located, because this notebook will create an Azure AI Content Safety resource using that identity.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.2 Assign variables for the workspace and deployment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The public registry name contains Llama 2 models\n", + "registry_name = \"azureml-meta\"\n", + "\n", + "# Name of the Llama 2 model to be deployed\n", + "# available_llama_models_text_generation = [\"Llama-2-7b\", \"Llama-2-13b\", \"Llama-2-70b\"]\n", + "# use the appropriate model name that is suitable for your workload below, this example shows Llama-2-7b\n", + "model_name = \"Llama-2-7b\"\n", + "# This notebook has been tested with \"Llama-2-7b version\" \"4\", \"Llama-2-13b\" version 4, and \"Llama-2-70b\" version 4\n", + "import random\n", + "\n", + "endpoint_name = f\"batch-{random.randint(0,10000)}\" # Replace with your endpoint name\n", + "deployment_name = \"batch-dep\" # Replace with your deployment name, lower case only!!!\n", + "sku_name = \"Standard_ND40rs_v2\" # Name of the sku(instance type) Check the model-list(can be found in the parent folder(inference)) to get the most optimal sku for your model (Default: Standard_DS2_v2)\n", + "\n", + "environment_name = \"llama-model-env\" # Replace with your environment name\n", + "compute_name = \"nd40-src\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.3 Install Dependencies(as needed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment the following lines to install the required packages\n", + "# %pip install azure-identity==1.13.0\n", + "# %pip install azure-mgmt-cognitiveservices==13.4.0\n", + "# %pip install azure-ai-ml==1.8.0\n", + "# %pip install azure-mgmt-msi==7.0.0\n", + "# %pip install azure-mgmt-authorization==3.0.0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.4 All required Imports\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", + "from azure.ai.ml import MLClient\n", + "from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient\n", + "from azure.mgmt.cognitiveservices.models import Account, Sku, AccountProperties\n", + "from IPython.core.display import display, HTML\n", + "from azure.ai.ml import Input\n", + "from azure.ai.ml.entities import (\n", + " BatchEndpoint,\n", + " ModelBatchDeployment,\n", + " ModelBatchDeploymentSettings,\n", + " Model,\n", + " AmlCompute,\n", + " Data,\n", + " BuildContext,\n", + " BatchRetrySettings,\n", + " CodeConfiguration,\n", + " Environment,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.5 Get credential" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " credential = DefaultAzureCredential()\n", + " # Check if 
given credential can get token successfully.\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work\n", + " credential = InteractiveBrowserCredential()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.6 Configure workspace " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " ml_client = MLClient.from_config(credential=credential)\n", + "except Exception as ex:\n", + " # enter details of your AML workspace\n", + " subscription_id = \"subscription_id\"\n", + " resource_group = \"resource_group\"\n", + " workspace = \"workspace\"\n", + "\n", + " # get a handle to the workspace\n", + " ml_client = MLClient(\n", + " credential,\n", + " subscription_id,\n", + " resource_group,\n", + " workspace,\n", + " logging_enable=True,\n", + " )\n", + "\n", + "\n", + "subscription_id = ml_client.subscription_id\n", + "resource_group = ml_client.resource_group_name\n", + "workspace = ml_client.workspace_name\n", + "\n", + "print(f\"Connected to workspace {workspace}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.7 Assign variables for Azure Content Safety\n", + "Currently, Azure AI Content Safety is in a limited set of regions:\n", + "\n", + "\n", + "__NOTE__: before you choose the region to deploy the Azure AI Content Safety, please be aware that your data will be transferred to the region you choose and by selecting a region outside your current location, you may be allowing the transmission of your data to regions outside your jurisdiction. It is important to note that data protection and privacy laws may vary between jurisdictions. Before proceeding, we strongly advise you to familiarize yourself with the local laws and regulations governing data transfer and ensure that you are legally permitted to transmit your data to an overseas location for processing. By continuing with the selection of a different region, you acknowledge that you have understood and accepted any potential risks associated with such data transmission. Please proceed with caution." 
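+    ,
+    "\n",
+    "The cell below lists the available Content Safety SKUs and restricts the region choice to the entries of `available_aacs_locations` (`east us` and `west europe`), defaulting to the first one.\n"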
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "acs_client = CognitiveServicesManagementClient(credential, subscription_id)\n", + "\n", + "\n", + "# settings for the Azure AI Content Safety resource\n", + "# we will choose existing AACS resource if it exists, otherwise create a new one\n", + "# name of azure ai content safety resource, has to be unique\n", + "import time\n", + "\n", + "aacs_name = f\"{endpoint_name}-aacs\"\n", + "available_aacs_locations = [\"east us\", \"west europe\"]\n", + "\n", + "# create a new Cognitive Services Account\n", + "kind = \"ContentSafety\"\n", + "aacs_sku_name = \"S0\"\n", + "aacs_location = available_aacs_locations[0]\n", + "\n", + "\n", + "print(\"Available SKUs:\")\n", + "aacs_skus = acs_client.resource_skus.list()\n", + "print(\"SKU Name\\tSKU Tier\\tLocations\")\n", + "for sku in aacs_skus:\n", + " if sku.kind == \"ContentSafety\":\n", + " locations = \",\".join(sku.locations)\n", + " print(sku.name + \"\\t\" + sku.tier + \"\\t\" + locations)\n", + "\n", + "print(\n", + " f\"Choose a new Azure AI Content Safety resource in {aacs_location} with SKU {aacs_sku_name}\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Create Azure AI Content Safety" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "parameters = Account(\n", + " sku=Sku(name=aacs_sku_name),\n", + " kind=kind,\n", + " location=aacs_location,\n", + " properties=AccountProperties(\n", + " custom_sub_domain_name=aacs_name, public_network_access=\"Enabled\"\n", + " ),\n", + ")\n", + "# How many seconds to wait between checking the status of an async operation.\n", + "wait_time = 10\n", + "\n", + "\n", + "def find_acs(accounts):\n", + " return next(\n", + " x\n", + " for x in accounts\n", + " if x.kind == \"ContentSafety\"\n", + " and x.location == aacs_location\n", + " and x.sku.name == aacs_sku_name\n", + " )\n", + "\n", + "\n", + "try:\n", + " # check if AACS exists\n", + " aacs = acs_client.accounts.get(resource_group, aacs_name)\n", + " print(f\"Found existing Azure AI content safety Account {aacs.name}.\")\n", + "except:\n", + " try:\n", + " # check if there is an existing AACS resource within same resource group\n", + " aacs = find_acs(acs_client.accounts.list_by_resource_group(resource_group))\n", + " print(\n", + " f\"Found existing Azure AI content safety Account {aacs.name} in resource group {resource_group}.\"\n", + " )\n", + " except:\n", + " print(f\"Creating Azure AI content safety Account {aacs_name}.\")\n", + " acs_client.accounts.begin_create(resource_group, aacs_name, parameters).wait()\n", + " print(\"Resource created.\")\n", + " aacs = acs_client.accounts.get(resource_group, aacs_name)\n", + "\n", + "\n", + "aacs_endpoint = aacs.properties.endpoint\n", + "aacs_resource_id = aacs.id\n", + "print(f\"AACS endpoint is {aacs_endpoint}\")\n", + "print(f\"AACS ResourceId is {aacs_resource_id}\")\n", + "\n", + "aacs_access_key = acs_client.accounts.list_keys(\n", + " resource_group_name=resource_group, account_name=aacs.name\n", + ").key1" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Create Azure AI Content Safety enabled Llama 2 batch endpoint" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.1 Check if Llama 2 model is available in the AML registry." 
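+    ,
+    "\n",
+    "The cell below connects an `MLClient` to the `azureml-meta` registry, lists the available versions of the chosen model, and pins `model_version = \"4\"`, the version this notebook was tested with.\n"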
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reg_client = MLClient(\n", + " credential,\n", + " subscription_id=subscription_id,\n", + " resource_group_name=resource_group,\n", + " registry_name=registry_name,\n", + ")\n", + "version_list = list(\n", + " reg_client.models.list(model_name)\n", + ") # list available versions of the model\n", + "llama_model = None\n", + "if len(version_list) == 0:\n", + " raise Exception(f\"No model named {model_name} found in registry\")\n", + "else:\n", + " model_version = \"4\"\n", + " llama_model = reg_client.models.get(model_name, model_version)\n", + " print(\n", + " f\"Using model name: {llama_model.name}, version: {llama_model.version}, id: {llama_model.id} for inferencing\"\n", + " )" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 3.2 Create environment for Llama 2 endpoint\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " env = ml_client.environments.get(environment_name, label=\"latest\")\n", + " print(\"---Environment already exists---\")\n", + "except:\n", + " print(\"---Creating environment---\")\n", + " env = Environment(\n", + " name=environment_name, build=BuildContext(path=\"./llama-files/docker_env\")\n", + " )\n", + " ml_client.environments.create_or_update(env)\n", + " env = ml_client.environments.get(environment_name, label=\"latest\")\n", + " print(\"---Please use link below to check build status---\")\n", + "\n", + "\n", + "display(\n", + " HTML(\n", + " f\"\"\"\n", + " \n", + " Click here to check env build status in AML studio\n", + " \n", + " \"\"\"\n", + " )\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 3.3 Create compute cluster to run batch job on\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.constants import AssetTypes, BatchDeploymentOutputAction\n", + "\n", + "if not any(filter(lambda m: m.name == compute_name, ml_client.compute.list())):\n", + " compute_cluster = AmlCompute(\n", + " name=compute_name,\n", + " size=sku_name,\n", + " min_instances=0,\n", + " max_instances=2,\n", + " )\n", + " ml_client.compute.begin_create_or_update(compute_cluster).result()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.4 Create Llama 2 batch endpoint\n", + "This step may take a few minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.entities import BatchEndpoint\n", + "\n", + "# Check if the endpoint already exists in the workspace\n", + "try:\n", + " endpoint = ml_client.batch_endpoints.get(endpoint_name)\n", + " print(\"---Endpoint already exists---\")\n", + "except:\n", + " # Create an batch endpoint if it doesn't exist\n", + "\n", + " # Define the endpoint\n", + " endpoint = BatchEndpoint(name=endpoint_name, description=\"Test endpoint for model\")\n", + "\n", + " # Trigger the endpoint creation\n", + " try:\n", + " ml_client.begin_create_or_update(endpoint).wait()\n", + " print(\"\\n---Endpoint created successfully---\\n\")\n", + " except Exception as err:\n", + " raise RuntimeError(\n", + " f\"Endpoint creation failed. 
Detailed Response:\\n{err}\"\n", + " ) from err" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 3.5 Deploy Llama 2 model\n", + "This step may take a few minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "deployment = ModelBatchDeployment(\n", + " name=deployment_name,\n", + " endpoint_name=endpoint.name,\n", + " model=llama_model,\n", + " environment=env,\n", + " code_configuration=CodeConfiguration(\n", + " code=\"llama-files/score\",\n", + " scoring_script=\"score_batch.py\",\n", + " ),\n", + " compute=compute_name,\n", + " settings=ModelBatchDeploymentSettings(\n", + " instance_count=1,\n", + " max_concurrency_per_instance=1,\n", + " mini_batch_size=1,\n", + " output_action=BatchDeploymentOutputAction.APPEND_ROW,\n", + " output_file_name=\"predictions.csv\",\n", + " retry_settings=BatchRetrySettings(max_retries=3, timeout=3000),\n", + " logging_level=\"info\",\n", + " environment_variables={\n", + " \"CONTENT_SAFETY_ENDPOINT\": aacs_endpoint,\n", + " \"CONTENT_SAFETY_KEY\": aacs_access_key,\n", + " },\n", + " ),\n", + ")\n", + "# Trigger the deployment creation\n", + "try:\n", + " ml_client.begin_create_or_update(deployment).wait()\n", + " print(\"\\n---Deployment created successfully---\\n\")\n", + "except Exception as err:\n", + " raise RuntimeError(\n", + " f\"Deployment creation failed. Detailed Response:\\n{err}\"\n", + " ) from err" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 3.6 Update Batch endpoint to set the default deployment\n", + "This step may take a few minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "endpoint = ml_client.batch_endpoints.get(endpoint_name)\n", + "endpoint.defaults.deployment_name = deployment.name\n", + "ml_client.batch_endpoints.begin_create_or_update(endpoint).result()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Prepare to test." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.1 Input data preparation." 
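+    ,
+    "\n",
+    "The cells below register the four CSV files under `llama-files/data` as a `uri_folder` data asset, wait for the asset to appear, and wrap it in an `Input` for the batch job. Each file has a single `text` column, which is what `run()` in `score_batch.py` reads, for example:\n",
+    "\n",
+    "```\n",
+    "text\n",
+    "\"There is good weather to play and have fun\"\n",
+    "```\n"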
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_path = \"llama-files/data\"\n", + "dataset_name = \"input-data-small\"\n", + "\n", + "input_data = Data(\n", + " path=data_path,\n", + " type=AssetTypes.URI_FOLDER,\n", + " description=\"A sample of the dataset for text generation, in CSV file format\",\n", + " name=dataset_name,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_data = ml_client.data.create_or_update(input_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from time import sleep\n", + "\n", + "print(f\"Waiting for data asset {dataset_name}\", end=\"\")\n", + "while not any(filter(lambda m: m.name == dataset_name, ml_client.data.list())):\n", + " sleep(10)\n", + " print(\".\", end=\"\")\n", + "\n", + "print(\" [DONE]\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_data = ml_client.data.get(name=dataset_name, label=\"latest\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input = Input(type=AssetTypes.URI_FOLDER, path=input_data.id)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.2 Invoke the endpoint\n", + "\n", + "Let's now invoke the endpoint for batch scoring job:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job = ml_client.batch_endpoints.invoke(endpoint_name=endpoint.name, input=input)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.3 Get the details of the invoked job\n", + "Let us get details and logs of the invoked job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ml_client.jobs.get(job.name)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can wait for the job to finish using the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ml_client.jobs.stream(job.name)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.4 Download the results\n", + "\n", + "The deployment creates a child job that executes the scoring. 
We can get the details of it using the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scoring_job = list(ml_client.jobs.list(parent_job_name=job.name))[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Job name:\", scoring_job.name)\n", + "print(\"Job status:\", scoring_job.status)\n", + "print(\n", + " \"Job duration:\",\n", + " scoring_job.creation_context.last_modified_at\n", + " - scoring_job.creation_context.created_at,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The outputs generated by the deployment job will be placed in an output named `score`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ml_client.jobs.download(name=scoring_job.name, download_path=\".\", output_name=\"score\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output file above will contain one line for each file, and each line will have multiple arrays corresponding to each line of the file. If you see [], that means ACS has stripped the response from the model" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.5 Clean up Resources\n", + "Delete endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ml_client.batch_endpoints.begin_delete(endpoint.name).result()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/sdk/python/foundation-models/system/inference/text-generation/llama-safe-online-deployment.ipynb b/sdk/python/foundation-models/system/inference/text-generation/llama-safe-online-deployment.ipynb new file mode 100644 index 0000000000..2931aa683b --- /dev/null +++ b/sdk/python/foundation-models/system/inference/text-generation/llama-safe-online-deployment.ipynb @@ -0,0 +1,605 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to create an Azure AI Content Safety enabled Llama 2 online endpoint (Preview)\n", + "### This notebook will walk you through the steps to create an __Azure AI Content Safety__ enabled __Llama 2__ online endpoint.\n", + "### This notebook is under preview\n", + "### The steps are:\n", + "1. Create an __Azure AI Content Safety__ resource for moderating the request from user and response from the __Llama 2__ online endpoint.\n", + "2. Create a new __Azure AI Content Safety__ enabled __Llama 2__ online endpoint with a custom score.py which will integrate with the __Azure AI Content Safety__ resource to moderate the response from the __Llama 2__ model and the request from the user, but to make the custom score.py to sucessfully autheticated to the __Azure AI Content Safety__ resource, we have 2 options:\n", + " 1. 
+    "   1. __Environment variable__: a simpler but less secure approach. The access key of the __Azure AI Content Safety__ resource is passed to the custom score.py via an environment variable, and score.py uses the key directly to access the Azure AI Content Safety resource. Keep in mind that anyone in your org with access to the endpoint can read the access key from the environment variable and use it to access the Azure AI Content Safety resource.\n",
+    " "
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Prerequisites\n",
+    "#### 1.1 Check List:\n",
+    "- [x] You have created a new Python virtual environment for this notebook.\n",
+    "- [x] The identity you are using to execute this notebook (yourself or your VM) needs to have the __Contributor__ role on the resource group where the AML Workspace you specified is located, because this notebook will create an Azure AI Content Safety resource using that identity."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1.2 Assign variables for the workspace and deployment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The public registry that contains the Llama 2 models\n",
+    "registry_name = \"azureml-meta\"\n",
+    "\n",
+    "# Name of the Llama 2 model to be deployed\n",
+    "# available_llama_models_text_generation = [\"Llama-2-7b\", \"Llama-2-13b\", \"Llama-2-70b\"]\n",
+    "# available_llama_models_chat_complete = [\"Llama-2-7b-chat\", \"Llama-2-13b-chat\", \"Llama-2-70b-chat\"]\n",
+    "model_name = \"Llama-2-7b\"\n",
+    "\n",
+    "endpoint_name = \"llama-cs-test\"  # Replace with your endpoint name\n",
+    "deployment_name = \"llama\"  # Replace with your deployment name, lowercase only!\n",
+    "sku_name = \"Standard_NC24s_v3\"  # Name of the SKU (instance type). Check the model-list (in the parent folder, inference) to find the most optimal SKU for your model (default: Standard_DS2_v2)\n",
+    "\n",
+    "environment_name = f\"{endpoint_name}-env\"  # Replace with your environment name\n",
+    "\n",
+    "# The severity level at or above which a request or response is blocked\n",
+    "# See the Azure AI Content Safety documentation for more details:\n",
+    "# https://learn.microsoft.com/en-us/azure/cognitive-services/content-safety/concepts/harm-categories\n",
+    "content_severity_threshold = \"2\""
+   ]
+  },
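+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, the cell below sketches what the threshold means. The severity buckets shown are an assumption based on the harm-categories documentation linked above, and the sketch assumes the scoring script blocks content scored at or above the threshold; check score.py for the exact comparison used."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Assumption (see the harm-categories doc linked above): Azure AI Content\n",
+    "# Safety reports severity in buckets 0 (safe), 2 (low), 4 (medium), 6 (high).\n",
+    "# With content_severity_threshold = \"2\", any category scored at 2 or above\n",
+    "# is expected to be blocked by the scoring script.\n",
+    "severity_buckets = {0: \"safe\", 2: \"low\", 4: \"medium\", 6: \"high\"}\n",
+    "blocked = {\n",
+    "    k: v for k, v in severity_buckets.items() if k >= int(content_severity_threshold)\n",
+    "}\n",
+    "print(f\"Severity levels blocked at threshold {content_severity_threshold}: {blocked}\")"
+   ]
+  },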
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1.3 Install Dependencies (as needed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# uncomment the following lines to install the required packages\n",
+    "# %pip install azure-identity==1.13.0\n",
+    "# %pip install azure-mgmt-cognitiveservices==13.4.0\n",
+    "# %pip install azure-ai-ml==1.8.0\n",
+    "# %pip install azure-mgmt-msi==7.0.0\n",
+    "# %pip install azure-mgmt-authorization==3.0.0"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1.4 Get credential"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n",
+    "\n",
+    "try:\n",
+    "    credential = DefaultAzureCredential()\n",
+    "    # Check if the given credential can get a token successfully.\n",
+    "    credential.get_token(\"https://management.azure.com/.default\")\n",
+    "except Exception as ex:\n",
+    "    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential does not work\n",
+    "    credential = InteractiveBrowserCredential()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1.5 Configure workspace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.ml import MLClient\n",
+    "\n",
+    "try:\n",
+    "    ml_client = MLClient.from_config(credential=credential)\n",
+    "except Exception as ex:\n",
+    "    # enter details of your AML workspace\n",
+    "    subscription_id = \"\"\n",
+    "    resource_group = \"\"\n",
+    "    workspace = \"\"\n",
+    "\n",
+    "    # get a handle to the workspace\n",
+    "    ml_client = MLClient(credential, subscription_id, resource_group, workspace)\n",
+    "\n",
+    "\n",
+    "subscription_id = ml_client.subscription_id\n",
+    "resource_group = ml_client.resource_group_name\n",
+    "workspace = ml_client.workspace_name\n",
+    "\n",
+    "print(f\"Connected to workspace {workspace}\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 1.6 Assign variables for Azure Content Safety\n",
+    "Currently, Azure AI Content Safety is available in a limited set of regions (see `available_aacs_locations` in the code below).\n",
+    "\n",
+    "__NOTE__: before you choose the region to deploy the Azure AI Content Safety resource, please be aware that your data will be transferred to the region you choose. By selecting a region outside your current location, you may be allowing the transmission of your data to regions outside your jurisdiction, and data protection and privacy laws may vary between jurisdictions. Before proceeding, we strongly advise you to familiarize yourself with the local laws and regulations governing data transfer and to ensure that you are legally permitted to transmit your data to an overseas location for processing. By continuing with the selection of a different region, you acknowledge that you have understood and accepted any potential risks associated with such data transmission. Please proceed with caution."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient\n",
+    "\n",
+    "acs_client = CognitiveServicesManagementClient(credential, subscription_id)\n",
+    "\n",
+    "\n",
+    "# settings for the Azure AI Content Safety resource\n",
+    "# we will use an existing AACS resource if one exists, otherwise create a new one\n",
+    "# the name of the Azure AI Content Safety resource has to be unique\n",
+    "import time\n",
+    "\n",
+    "aacs_name = f\"{endpoint_name}-aacs-{str(time.time()).replace('.','')}\"\n",
+    "available_aacs_locations = [\"east us\", \"west europe\"]\n",
+    "\n",
+    "# create a new Cognitive Services Account\n",
+    "kind = \"ContentSafety\"\n",
+    "aacs_sku_name = \"S0\"\n",
+    "aacs_location = available_aacs_locations[0]\n",
+    "\n",
+    "\n",
+    "print(\"Available SKUs:\")\n",
+    "aacs_skus = acs_client.resource_skus.list()\n",
+    "print(\"SKU Name\\tSKU Tier\\tLocations\")\n",
+    "for sku in aacs_skus:\n",
+    "    if sku.kind == \"ContentSafety\":\n",
+    "        locations = \",\".join(sku.locations)\n",
+    "        print(sku.name + \"\\t\" + sku.tier + \"\\t\" + locations)\n",
+    "\n",
+    "print(\n",
+    "    f\"Choosing a new Azure AI Content Safety resource in {aacs_location} with SKU {aacs_sku_name}\"\n",
+    ")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Create Azure AI Content Safety"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.mgmt.cognitiveservices.models import Account, Sku, AccountProperties\n",
+    "\n",
+    "\n",
+    "parameters = Account(\n",
+    "    sku=Sku(name=aacs_sku_name),\n",
+    "    kind=kind,\n",
+    "    location=aacs_location,\n",
+    "    properties=AccountProperties(\n",
+    "        custom_sub_domain_name=aacs_name, public_network_access=\"Enabled\"\n",
+    "    ),\n",
+    ")\n",
+    "# How many seconds to wait between checking the status of an async operation.\n",
+    "wait_time = 10\n",
+    "\n",
+    "\n",
+    "def find_acs(accounts):\n",
+    "    return next(\n",
+    "        x\n",
+    "        for x in accounts\n",
+    "        if x.kind == \"ContentSafety\"\n",
+    "        and x.location == aacs_location\n",
+    "        and x.sku.name == aacs_sku_name\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "try:\n",
+    "    # check if the AACS resource already exists\n",
+    "    aacs = acs_client.accounts.get(resource_group, aacs_name)\n",
+    "    print(f\"Found existing Azure AI Content Safety account {aacs.name}.\")\n",
+    "except Exception:\n",
+    "    try:\n",
+    "        # check if there is an existing AACS resource within the same resource group\n",
+    "        aacs = find_acs(acs_client.accounts.list_by_resource_group(resource_group))\n",
+    "        print(\n",
+    "            f\"Found existing Azure AI Content Safety account {aacs.name} in resource group {resource_group}.\"\n",
+    "        )\n",
+    "    except Exception:\n",
+    "        print(f\"Creating Azure AI Content Safety account {aacs_name}.\")\n",
+    "        acs_client.accounts.begin_create(resource_group, aacs_name, parameters).wait()\n",
+    "        print(\"Resource created.\")\n",
+    "        aacs = acs_client.accounts.get(resource_group, aacs_name)\n",
+    "\n",
+    "\n",
+    "aacs_endpoint = aacs.properties.endpoint\n",
+    "aacs_resource_id = aacs.id\n",
+    "print(f\"AACS endpoint is {aacs_endpoint}\")\n",
+    "print(f\"AACS ResourceId is {aacs_resource_id}\")\n",
+    "\n",
+    "aacs_access_key = acs_client.accounts.list_keys(\n",
+    "    resource_group_name=resource_group, account_name=aacs.name\n",
+    ").key1"
+   ]
+  },
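+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, sanity-check the endpoint and key before wiring them into the deployment. The cell below is a minimal sketch that calls the Azure AI Content Safety `text:analyze` REST API directly; the `api-version` shown is an assumption for the preview API, so adjust it to a version available in your region."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal sketch: verify that the AACS endpoint and access key work.\n",
+    "# NOTE: the api-version below is an assumption; adjust it as needed.\n",
+    "import requests\n",
+    "\n",
+    "url = aacs_endpoint.rstrip(\"/\") + \"/contentsafety/text:analyze?api-version=2023-04-30-preview\"\n",
+    "headers = {\n",
+    "    \"Ocp-Apim-Subscription-Key\": aacs_access_key,\n",
+    "    \"Content-Type\": \"application/json\",\n",
+    "}\n",
+    "response = requests.post(url, headers=headers, json={\"text\": \"Hello\"})\n",
+    "print(response.status_code, response.json())"
+   ]
+  },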
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Create Azure AI Content Safety enabled Llama 2 online endpoint"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 3.1 Check if the Llama 2 model is available in the AML registry."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reg_client = MLClient(\n",
+    "    credential,\n",
+    "    subscription_id=subscription_id,\n",
+    "    resource_group_name=resource_group,\n",
+    "    registry_name=registry_name,\n",
+    ")\n",
+    "version_list = list(\n",
+    "    reg_client.models.list(model_name)\n",
+    ")  # list available versions of the model\n",
+    "llama_model = None\n",
+    "if len(version_list) == 0:\n",
+    "    raise Exception(f\"No model named {model_name} found in registry\")\n",
+    "else:\n",
+    "    model_version = version_list[0].version\n",
+    "    llama_model = reg_client.models.get(model_name, model_version)\n",
+    "    print(\n",
+    "        f\"Using model name: {llama_model.name}, version: {llama_model.version}, id: {llama_model.id} for inference\"\n",
+    "    )"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 3.2 Create Llama 2 online endpoint\n",
+    "This step may take a few minutes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.ml.entities import ManagedOnlineEndpoint\n",
+    "\n",
+    "# Check if the endpoint already exists in the workspace\n",
+    "try:\n",
+    "    endpoint = ml_client.online_endpoints.get(endpoint_name)\n",
+    "    print(\"---Endpoint already exists---\")\n",
+    "except Exception:\n",
+    "    # Create an online endpoint if it doesn't exist\n",
+    "\n",
+    "    # Define the endpoint\n",
+    "    endpoint = ManagedOnlineEndpoint(\n",
+    "        name=endpoint_name, description=\"Test endpoint for model\"\n",
+    "    )\n",
+    "\n",
+    "    # Trigger the endpoint creation\n",
+    "    try:\n",
+    "        ml_client.begin_create_or_update(endpoint).wait()\n",
+    "        print(\"\\n---Endpoint created successfully---\\n\")\n",
+    "    except Exception as err:\n",
+    "        raise RuntimeError(\n",
+    "            f\"Endpoint creation failed. Detailed Response:\\n{err}\"\n",
+    "        ) from err"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 3.3 Deploy Llama 2 model\n",
+    "This step may take a few minutes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.ml.entities import (\n",
+    "    CodeConfiguration,\n",
+    "    OnlineRequestSettings,\n",
+    "    ManagedOnlineDeployment,\n",
+    "    ProbeSettings,\n",
+    ")\n",
+    "\n",
+    "# Define the deployment\n",
+    "# Update the model version as necessary\n",
+    "deployment = ManagedOnlineDeployment(\n",
+    "    name=deployment_name,\n",
+    "    endpoint_name=endpoint_name,\n",
+    "    model=llama_model.id,\n",
+    "    instance_type=sku_name,\n",
+    "    instance_count=1,\n",
+    "    code_configuration=CodeConfiguration(\n",
+    "        code=\"./llama-files/score\", scoring_script=\"score.py\"\n",
+    "    ),\n",
+    "    environment_variables={\n",
+    "        \"CONTENT_SAFETY_ENDPOINT\": aacs_endpoint,\n",
+    "        \"CONTENT_SAFETY_KEY\": aacs_access_key,\n",
+    "        \"CONTENT_SAFETY_THRESHOLD\": content_severity_threshold,\n",
+    "    },\n",
+    "    request_settings=OnlineRequestSettings(request_timeout_ms=90000),\n",
+    "    liveness_probe=ProbeSettings(\n",
+    "        failure_threshold=30,\n",
+    "        success_threshold=1,\n",
+    "        period=100,\n",
+    "        initial_delay=500,\n",
+    "    ),\n",
+    "    readiness_probe=ProbeSettings(\n",
+    "        failure_threshold=30,\n",
+    "        success_threshold=1,\n",
+    "        period=100,\n",
+    "        initial_delay=500,\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "# Trigger the deployment creation\n",
+    "try:\n",
+    "    ml_client.begin_create_or_update(deployment).wait()\n",
+    "    print(\"\\n---Deployment created successfully---\\n\")\n",
+    "except Exception as err:\n",
+    "    raise RuntimeError(\n",
+    "        f\"Deployment creation failed. Detailed Response:\\n{err}\"\n",
+    "    ) from err"
+   ]
+  },
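+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once the deployment succeeds, you can fetch the scoring URI and the endpoint auth key, for example to call the endpoint from outside this notebook. The cell below is a minimal sketch and assumes the endpoint uses key authentication (the default for managed online endpoints):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal sketch: retrieve the scoring URI and primary key.\n",
+    "# Assumes key auth (the default); token auth returns a different object.\n",
+    "endpoint = ml_client.online_endpoints.get(name=endpoint_name)\n",
+    "keys = ml_client.online_endpoints.get_keys(name=endpoint_name)\n",
+    "print(\"Scoring URI:\", endpoint.scoring_uri)\n",
+    "print(\"Primary key:\", keys.primary_key)"
+   ]
+  },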
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4. Test the Safety Enabled Llama 2 online endpoint."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "test_src_dir = \"./safety-llama-test\"\n",
+    "os.makedirs(test_src_dir, exist_ok=True)\n",
+    "print(f\"test script directory: {test_src_dir}\")\n",
+    "sample_data = os.path.join(test_src_dir, \"sample-request.json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Choose a request from the following 4 cells based on the Llama 2 model type you chose and the scenario you want to test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## For text-generation models (without -chat suffix)\n",
+    "## Successful response\n",
+    "\n",
+    "import json\n",
+    "\n",
+    "with open(sample_data, \"w\") as f:\n",
+    "    json.dump(\n",
+    "        {\n",
+    "            \"input_data\": {\n",
+    "                \"input_string\": [\n",
+    "                    \"Hello\",\n",
+    "                    \"My name is John and I have a dog.\",\n",
+    "                ],\n",
+    "                \"parameters\": {\n",
+    "                    \"temperature\": 0.6,\n",
+    "                    \"max_gen_len\": 256,\n",
+    "                    \"do_sample\": True,\n",
+    "                },\n",
+    "            }\n",
+    "        },\n",
+    "        f,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## For text-generation models (without -chat suffix)\n",
+    "## Blocked request/response due to harmful content\n",
+    "\n",
+    "import json\n",
+    "\n",
+    "with open(sample_data, \"w\") as f:\n",
+    "    json.dump(\n",
+    "        {\n",
+    "            \"input_data\": {\n",
+    "                \"input_string\": [\n",
+    "                    \"The dog was given a eutanasa injection due to their severed leg bleding profusely from deep lacarations to the lower extremities, exposing tisssue and nerve.\",  # This line contains harmful content, so the endpoint will return an empty string as the response\n",
+    "                ],\n",
+    "                \"parameters\": {\n",
+    "                    \"temperature\": 0.6,\n",
+    "                    \"max_gen_len\": 256,\n",
+    "                    \"do_sample\": True,\n",
+    "                },\n",
+    "            }\n",
+    "        },\n",
+    "        f,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## For chat-complete models (with \"-chat\" suffix)\n",
+    "## Successful request/response\n",
+    "\n",
+    "import json\n",
+    "\n",
+    "with open(sample_data, \"w\") as f:\n",
+    "    json.dump(\n",
+    "        {\n",
+    "            \"input_data\": {\n",
+    "                \"input_string\": [\n",
+    "                    {\n",
+    "                        \"role\": \"user\",\n",
+    "                        \"content\": \"What is the tallest building in the world?\",\n",
+    "                    },\n",
+    "                    {\n",
+    "                        \"role\": \"assistant\",\n",
+    "                        \"content\": \"As of 2021, the Burj Khalifa in Dubai, United Arab Emirates is the tallest building in the world, standing at a height of 828 meters (2,722 feet). It was completed in 2010 and has 163 floors. The Burj Khalifa is not only the tallest building in the world but also holds several other records, such as the highest occupied floor, highest outdoor observation deck, elevator with the longest travel distance, and the tallest freestanding structure in the world.\",\n",
+    "                    },\n",
+    "                    {\"role\": \"user\", \"content\": \"and in Africa?\"},\n",
+    "                    {\n",
+    "                        \"role\": \"assistant\",\n",
+    "                        \"content\": \"In Africa, the tallest building is the Carlton Centre, located in Johannesburg, South Africa. It stands at a height of 50 floors and 223 meters (730 feet). The Carlton Centre was completed in 1973 and was the tallest building in Africa for many years until the construction of the Leonardo, a 55-story skyscraper in Sandton, Johannesburg, which was completed in 2019 and stands at a height of 230 meters (755 feet). Other notable tall buildings in Africa include the Ponte City Apartments in Johannesburg, the John Hancock Center in Lagos, Nigeria, and the Alpha II Building in Abidjan, Ivory Coast\",\n",
+    "                    },\n",
+    "                    {\"role\": \"user\", \"content\": \"and in Europe?\"},\n",
+    "                ],\n",
+    "                \"parameters\": {\n",
+    "                    \"max_length\": 100,\n",
+    "                    \"temperature\": 0.9,\n",
+    "                    \"top_p\": 0.6,\n",
+    "                    \"do_sample\": True,\n",
+    "                    \"max_new_tokens\": 100,\n",
+    "                },\n",
+    "            }\n",
+    "        },\n",
+    "        f,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## For chat-complete models (with \"-chat\" suffix)\n",
+    "## Blocked request due to harmful content\n",
+    "\n",
+    "import json\n",
+    "\n",
+    "with open(sample_data, \"w\") as f:\n",
+    "    json.dump(\n",
+    "        {\n",
+    "            \"input_data\": {\n",
+    "                \"input_string\": [\n",
+    "                    {\n",
+    "                        \"role\": \"user\",\n",
+    "                        \"content\": \"A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.\",\n",
+    "                    },\n",
+    "                ],\n",
+    "                \"parameters\": {\n",
+    "                    \"max_length\": 100,\n",
+    "                    \"temperature\": 0.9,\n",
+    "                    \"top_p\": 0.6,\n",
+    "                    \"do_sample\": True,\n",
+    "                    \"max_new_tokens\": 100,\n",
+    "                },\n",
+    "            }\n",
+    "        },\n",
+    "        f,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ml_client.online_endpoints.invoke(\n",
+    "    endpoint_name=endpoint_name,\n",
+    "    deployment_name=deployment_name,\n",
+    "    request_file=sample_data,\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}