From 9cf62545ef07745d9577b02056ba47ed71c23a79 Mon Sep 17 00:00:00 2001 From: Devin Ha <33089471+DevinTDHa@users.noreply.github.com> Date: Fri, 24 May 2024 10:26:52 +0200 Subject: [PATCH 1/3] SPARKNLP-1036: Onnx Example notebooks (#14234) * SPARKNLP-1036: Fix dev python kernel names * SPARKNLP-1036: Bump transformers version * SPARKNLP-1036: Fix Colab buttons * SPARKNLP-1036: Pin onnx version for compatibility * SPARKNLP-1036: Upgrade Spark version * SPARKNLP-1036: Minor Fixes * SPARKNLP-1036: Clean Metadata * SPARKNLP-1036: Add/Adjust Documentation - Note for supported Spark Version of Annotators - added missing Documentation for BGEEmbeddings --- docs/en/annotator_entries/BGEEmbeddings.md | 160 + docs/en/annotators.md | 1 + docs/en/transformer_entries/E5Embeddings.md | 2 + .../en/transformer_entries/MPNetEmbeddings.md | 2 + ...HuggingFace_ONNX_in_Spark_NLP_ALBERT.ipynb | 1180 ++-- ...Spark_NLP_AlbertForQuestionAnswering.ipynb | 4 +- ..._NLP_AlbertForSequenceClassification.ipynb | 4 +- ...ark_NLP_AlbertForTokenClassification.ipynb | 4 +- .../HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb | 7 +- .../HuggingFace_ONNX_in_Spark_NLP_BGE.ipynb | 5484 +++++++-------- ...n_Spark_NLP_BertForQuestionAnswering.ipynb | 4 +- ...rk_NLP_BertForSequenceClassification.ipynb | 4 +- ...Spark_NLP_BertForTokenClassification.ipynb | 4 +- ...rk_NLP_BertForZeroShotClassification.ipynb | 4791 ++++++------- ..._in_Spark_NLP_BertSentenceEmbeddings.ipynb | 5 +- .../HuggingFace_ONNX_in_Spark_NLP_CLIP.ipynb | 17 +- ...gingFace_ONNX_in_Spark_NLP_CamemBERT.ipynb | 4607 ++++++------ ...rk_NLP_CamemBertForQuestionAnswering.ipynb | 245 +- ...P_CamemBertForSequenceClassification.ipynb | 277 +- ..._NLP_CamemBertForTokenClassification.ipynb | 275 +- ...uggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb | 4 +- ...park_NLP_DeBertaForQuestionAnswering.ipynb | 6038 ++++++++-------- ...NLP_DeBertaForSequenceClassification.ipynb | 4 +- ...rk_NLP_DeBertaForTokenClassification.ipynb | 6232 ++++++++--------- ...ingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb | 6 +- ...k_NLP_DistilBertForQuestionAnswering.ipynb | 4 +- ..._DistilBertForSequenceClassification.ipynb | 4 +- ...NLP_DistilBertForTokenClassification.ipynb | 6 +- .../HuggingFace_ONNX_in_Spark_NLP_E5.ipynb | 916 ++- .../HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb | 1466 ++-- ..._Spark_NLP_MPNetForQuestionAnswering.ipynb | 11 +- ...k_NLP_MPNetForSequenceClassification.ipynb | 2 +- ...HuggingFace_ONNX_in_Spark_NLP_Marian.ipynb | 2 +- ...uggingFace_ONNX_in_Spark_NLP_RoBERTa.ipynb | 4 +- ...park_NLP_RoBertaForQuestionAnswering.ipynb | 4 +- ...NLP_RoBertaForSequenceClassification.ipynb | 5570 +++++++-------- ...rk_NLP_RoBertaForTokenClassification.ipynb | 6222 ++++++++-------- .../HuggingFace_ONNX_in_Spark_NLP_T5.ipynb | 2 +- ...uggingFace_ONNX_in_Spark_NLP_Whisper.ipynb | 1243 ++-- ...ngFace_ONNX_in_Spark_NLP_XLM_RoBERTa.ipynb | 4601 ++++++------ ...k_NLP_XlmRoBertaForQuestionAnswering.ipynb | 4616 ++++++------ ..._XlmRoBertaForSequenceClassification.ipynb | 4093 ++++++----- ...NLP_XlmRoBertaForTokenClassification.ipynb | 4036 ++++++----- ...ark_NLP_XlmRoBertaSentenceEmbeddings.ipynb | 9 +- ...Spark_NLP_AlbertForQuestionAnswering.ipynb | 4 +- .../annotator/embeddings/bge_embeddings.py | 2 + .../annotator/embeddings/e5_embeddings.py | 2 + .../annotator/embeddings/mpnet_embeddings.py | 2 + .../nlp/embeddings/BGEEmbeddings.scala | 4 +- .../nlp/embeddings/E5Embeddings.scala | 2 + .../nlp/embeddings/MPNetEmbeddings.scala | 2 + 51 files changed, 30360 insertions(+), 31830 deletions(-) create mode 100644 docs/en/annotator_entries/BGEEmbeddings.md diff --git a/docs/en/annotator_entries/BGEEmbeddings.md b/docs/en/annotator_entries/BGEEmbeddings.md new file mode 100644 index 00000000000000..7e006a5e7789a8 --- /dev/null +++ b/docs/en/annotator_entries/BGEEmbeddings.md @@ -0,0 +1,160 @@ +{%- capture title -%} +BGEEmbeddings +{%- endcapture -%} + +{%- capture description -%} +Sentence embeddings using BGE. + +BGE, or BAAI General Embeddings, a model that can map any text to a low-dimensional dense +vector which can be used for tasks like retrieval, classification, clustering, or semantic +search. + +Note that this annotator is only supported for Spark Versions 3.4 and up. + +Pretrained models can be loaded with `pretrained` of the companion object: + +```scala +val embeddings = BGEEmbeddings.pretrained() + .setInputCols("document") + .setOutputCol("embeddings") +``` + +The default model is `"bge_base"`, if no name is provided. + +For available pretrained models please see the +[Models Hub](https://sparknlp.org/models?q=BGE). + +For extended examples of usage, see +[BGEEmbeddingsTestSpec](https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddingsTestSpec.scala). + +**Sources** : + +[C-Pack: Packaged Resources To Advance General Chinese Embedding](https://arxiv.org/pdf/2309.07597) + +[BGE Github Repository](https://github.com/FlagOpen/FlagEmbedding) + +**Paper abstract** + +*We introduce C-Pack, a package of resources that significantly advance the field of general +Chinese embeddings. C-Pack includes three critical resources. 1) C-MTEB is a comprehensive +benchmark for Chinese text embeddings covering 6 tasks and 35 datasets. 2) C-MTP is a massive +text embedding dataset curated from labeled and unlabeled Chinese corpora for training +embedding models. 3) C-TEM is a family of embedding models covering multiple sizes. Our models +outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the time of the +release. We also integrate and optimize the entire suite of training methods for C-TEM. Along +with our resources on general Chinese embedding, we release our data and models for English +text embeddings. The English models achieve stateof-the-art performance on the MTEB benchmark; +meanwhile, our released English data is 2 times larger than the Chinese data. All these +resources are made publicly available at https://github.com/FlagOpen/FlagEmbedding.* +{%- endcapture -%} + +{%- capture input_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture output_anno -%} +SENTENCE_EMBEDDINGS +{%- endcapture -%} + +{%- capture python_example -%} +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") +embeddings = BGEEmbeddings.pretrained() \ + .setInputCols(["document"]) \ + .setOutputCol("bge_embeddings") +embeddingsFinisher = EmbeddingsFinisher() \ + .setInputCols(["bge_embeddings"]) \ + .setOutputCols("finished_embeddings") \ + .setOutputAsVector(True) +pipeline = Pipeline().setStages([ + documentAssembler, + embeddings, + embeddingsFinisher +]) +data = spark.createDataFrame([["query: how much protein should a female eat", +"passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \ +"But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \ +"marathon. Check out the chart below to see how much protein you should be eating each day.", +]]).toDF("text") +result = pipeline.fit(data).transform(data) +result.selectExpr("explode(finished_embeddings) as result").show(5, 80) ++--------------------------------------------------------------------------------+ +| result| ++--------------------------------------------------------------------------------+ +|[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...| +|[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...| ++--------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture scala_example -%} +import spark.implicits._ +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.annotators.Tokenizer +import com.johnsnowlabs.nlp.embeddings.BGEEmbeddings +import com.johnsnowlabs.nlp.EmbeddingsFinisher +import org.apache.spark.ml.Pipeline + +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val embeddings = BGEEmbeddings.pretrained("bge_base", "en") + .setInputCols("document") + .setOutputCol("bge_embeddings") + +val embeddingsFinisher = new EmbeddingsFinisher() + .setInputCols("bge_embeddings") + .setOutputCols("finished_embeddings") + .setOutputAsVector(true) + +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + embeddings, + embeddingsFinisher +)) + +val data = Seq("query: how much protein should a female eat", +"passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + +But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + +marathon. Check out the chart below to see how much protein you should be eating each day." + +).toDF("text") +val result = pipeline.fit(data).transform(data) + +result.selectExpr("explode(finished_embeddings) as result").show(1, 80) ++--------------------------------------------------------------------------------+ +| result| ++--------------------------------------------------------------------------------+ +|[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...| +|[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...| ++--------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture api_link -%} +[BGEEmbeddings](/api/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings) +{%- endcapture -%} + +{%- capture python_api_link -%} +[BGEEmbeddings](/api/python/reference/autosummary/sparknlp/annotator/embeddings/bge_embeddings/index.html#sparknlp.annotator.embeddings.bge_embeddings.BGEEmbeddings) +{%- endcapture -%} + +{%- capture source_link -%} +[BGEEmbeddings](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.scala) +{%- endcapture -%} + +{% include templates/anno_template.md +title=title +description=description +input_anno=input_anno +output_anno=output_anno +python_example=python_example +scala_example=scala_example +api_link=api_link +python_api_link=python_api_link +source_link=source_link +%} \ No newline at end of file diff --git a/docs/en/annotators.md b/docs/en/annotators.md index 858a07d0a06336..b65eae52cc7f12 100644 --- a/docs/en/annotators.md +++ b/docs/en/annotators.md @@ -45,6 +45,7 @@ There are two types of Annotators: {:.table-model-big} |Annotator|Description|Version | |---|---|---| +{% include templates/anno_table_entry.md path="" name="BGEEmbeddings" summary="Sentence embeddings using BGE."%} {% include templates/anno_table_entry.md path="" name="BigTextMatcher" summary="Annotator to match exact phrases (by token) provided in a file against a Document."%} {% include templates/anno_table_entry.md path="" name="Chunk2Doc" summary="Converts a `CHUNK` type column back into `DOCUMENT`. Useful when trying to re-tokenize or do further analysis on a `CHUNK` result."%} {% include templates/anno_table_entry.md path="" name="ChunkEmbeddings" summary="This annotator utilizes WordEmbeddings, BertEmbeddings etc. to generate chunk embeddings from either Chunker, NGramGenerator, or NerConverter outputs."%} diff --git a/docs/en/transformer_entries/E5Embeddings.md b/docs/en/transformer_entries/E5Embeddings.md index 04fa9482ed52d3..d3f0ec14da9fcc 100644 --- a/docs/en/transformer_entries/E5Embeddings.md +++ b/docs/en/transformer_entries/E5Embeddings.md @@ -8,6 +8,8 @@ Sentence embeddings using E5. E5, an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) +Note that this annotator is only supported for Spark Versions 3.4 and up. + Pretrained models can be loaded with `pretrained` of the companion object: ```scala diff --git a/docs/en/transformer_entries/MPNetEmbeddings.md b/docs/en/transformer_entries/MPNetEmbeddings.md index a4c8b4df22c968..843078363277b2 100644 --- a/docs/en/transformer_entries/MPNetEmbeddings.md +++ b/docs/en/transformer_entries/MPNetEmbeddings.md @@ -10,6 +10,8 @@ Understanding by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. MPNet a pre-training method, named masked and permuted language modeling, to inherit the advantages of masked language modeling and permuted language modeling for natural language understanding. +Note that this annotator is only supported for Spark Versions 3.4 and up. + Pretrained models can be loaded with `pretrained` of the companion object: ```scala diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_ALBERT.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_ALBERT.ipynb index 10c920bfea02be..08b77ed29e3841 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_ALBERT.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_ALBERT.ipynb @@ -41,16 +41,16 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "bgAl5Z8JLROz", - "outputId": "0e3bf64c-a97d-465c-8357-47e6b1409135", "colab": { "base_uri": "https://localhost:8080/" - } + }, + "id": "bgAl5Z8JLROz", + "outputId": "0e3bf64c-a97d-465c-8357-47e6b1409135" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m69.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", @@ -119,7 +119,7 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow" ] }, { @@ -137,8 +137,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VHQU-UIHLRO3", - "outputId": "9cbb1d48-174a-4194-c475-dc616ec8076e", "colab": { "base_uri": "https://localhost:8080/", "height": 423, @@ -199,12 +197,14 @@ "a27e457fcb9746c7ad4a8a694f76e367", "220b5d00bae9489fb70ce469d4103d08" ] - } + }, + "id": "VHQU-UIHLRO3", + "outputId": "9cbb1d48-174a-4194-c475-dc616ec8076e" }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", @@ -215,43 +215,43 @@ ] }, { - "output_type": "display_data", "data": { - "text/plain": [ - "config.json: 0%| | 0.00/684 [00:00=4.26.0 in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (4.38.2)\n", + "Requirement already satisfied: torch>=1.11 in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (2.2.1+cu121)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (24.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (1.25.2)\n", + "Requirement already satisfied: huggingface-hub>=0.8.0 in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (0.20.3)\n", + "Collecting datasets (from optimum[onnxruntime])\n", + " Downloading datasets-2.18.0-py3-none-any.whl (510 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m25.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting onnx (from optimum[onnxruntime])\n", + " Downloading onnx-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m63.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting onnxruntime>=1.11.0 (from optimum[onnxruntime])\n", + " Downloading onnxruntime-1.17.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m68.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting evaluate (from optimum[onnxruntime])\n", + " Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: protobuf>=3.20.1 in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (3.20.3)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (3.13.4)\n", + "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (14.0.2)\n", + "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (0.6)\n", + "Collecting dill<0.3.9,>=0.3.0 (from datasets->optimum[onnxruntime])\n", + " Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (2.0.3)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (2.31.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (4.66.2)\n", + "Collecting xxhash (from datasets->optimum[onnxruntime])\n", + " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting multiprocess (from datasets->optimum[onnxruntime])\n", + " Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: fsspec[http]<=2024.2.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (2023.6.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (3.9.3)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.8.0->optimum[onnxruntime]) (4.11.0)\n", + "Requirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime>=1.11.0->optimum[onnxruntime]) (24.3.25)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.11->optimum[onnxruntime]) (3.3)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.11->optimum[onnxruntime]) (3.1.3)\n", + "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", + "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", + "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", + "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", + "Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", + "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", + "Collecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", + "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", + "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", + "Collecting nvidia-nccl-cu12==2.19.3 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n", + "Collecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", + "Requirement already satisfied: triton==2.2.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.11->optimum[onnxruntime]) (2.2.0)\n", + "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.11->optimum[onnxruntime])\n", + " Using cached nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<4.40.0,>=4.26.0->optimum[onnxruntime]) (2023.12.25)\n", + "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<4.40.0,>=4.26.0->optimum[onnxruntime]) (0.15.2)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<4.40.0,>=4.26.0->optimum[onnxruntime]) (0.4.2)\n", + "Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<4.40.0,>=4.26.0->optimum[onnxruntime]) (0.1.99)\n", + "Collecting humanfriendly>=9.1 (from coloredlogs->optimum[onnxruntime])\n", + " Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting responses<0.19 (from evaluate->optimum[onnxruntime])\n", + " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->optimum[onnxruntime]) (1.3.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (1.9.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (4.0.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->optimum[onnxruntime]) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->optimum[onnxruntime]) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->optimum[onnxruntime]) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->optimum[onnxruntime]) (2024.2.2)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.11->optimum[onnxruntime]) (2.1.5)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets->optimum[onnxruntime]) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets->optimum[onnxruntime]) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets->optimum[onnxruntime]) (2024.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets->optimum[onnxruntime]) (1.16.0)\n", + "Installing collected packages: xxhash, onnx, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, install, humanfriendly, dill, responses, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, coloredlogs, onnxruntime, nvidia-cusolver-cu12, datasets, evaluate, optimum\n", + "Successfully installed coloredlogs-15.0.1 datasets-2.18.0 dill-0.3.8 evaluate-0.4.1 humanfriendly-10.0 install-1.3.5 multiprocess-0.70.16 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.1.105 onnx-1.16.0 onnxruntime-1.17.1 optimum-1.18.1 responses-0.18.0 xxhash-3.4.1\n" + ] + } + ], + "source": [ + "!pip install --upgrade-strategy eager install optimum[onnxruntime]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [BAAI/bge-base-en](https://huggingface.co/BAAI/bge-base-en) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "6Kdh7FHM07h_" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cec3087741c14a54a25fa6228c28aae3", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Export and Save HuggingFace model" + "text/plain": [ + "config.json: 0%| | 0.00/719 [00:00=4.26.0 in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (4.35.2)\n", - "Requirement already satisfied: torch>=1.9 in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (2.1.0+cu121)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (23.2)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (1.23.5)\n", - "Requirement already satisfied: huggingface-hub>=0.8.0 in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (0.19.4)\n", - "Collecting datasets (from optimum[onnxruntime])\n", - " Downloading datasets-2.16.1-py3-none-any.whl (507 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting onnx (from optimum[onnxruntime])\n", - " Downloading onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.7 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m50.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting onnxruntime>=1.11.0 (from optimum[onnxruntime])\n", - " Downloading onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m70.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting evaluate (from optimum[onnxruntime])\n", - " Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: protobuf>=3.20.1 in /usr/local/lib/python3.10/dist-packages (from optimum[onnxruntime]) (3.20.3)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (3.13.1)\n", - "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (10.0.1)\n", - "Collecting pyarrow-hotfix (from datasets->optimum[onnxruntime])\n", - " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", - "Collecting dill<0.3.8,>=0.3.0 (from datasets->optimum[onnxruntime])\n", - " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (1.5.3)\n", - "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (2.31.0)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (4.66.1)\n", - "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (3.4.1)\n", - "Collecting multiprocess (from datasets->optimum[onnxruntime])\n", - " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (2023.6.0)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (3.9.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets->optimum[onnxruntime]) (6.0.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.8.0->optimum[onnxruntime]) (4.5.0)\n", - "Requirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime>=1.11.0->optimum[onnxruntime]) (23.5.26)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.9->optimum[onnxruntime]) (3.2.1)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.9->optimum[onnxruntime]) (3.1.2)\n", - "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.9->optimum[onnxruntime]) (2.1.0)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]>=4.26.0->optimum[onnxruntime]) (2023.6.3)\n", - "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]>=4.26.0->optimum[onnxruntime]) (0.15.0)\n", - "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]>=4.26.0->optimum[onnxruntime]) (0.4.1)\n", - "Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece]>=4.26.0->optimum[onnxruntime])\n", - " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m53.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting humanfriendly>=9.1 (from coloredlogs->optimum[onnxruntime])\n", - " Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting responses<0.19 (from evaluate->optimum[onnxruntime])\n", - " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", - "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->optimum[onnxruntime]) (1.3.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (23.1.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (6.0.4)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (1.9.4)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (1.4.1)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (1.3.1)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->optimum[onnxruntime]) (4.0.3)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->optimum[onnxruntime]) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->optimum[onnxruntime]) (3.6)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->optimum[onnxruntime]) (2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets->optimum[onnxruntime]) (2023.11.17)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.9->optimum[onnxruntime]) (2.1.3)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets->optimum[onnxruntime]) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets->optimum[onnxruntime]) (2023.3.post1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets->optimum[onnxruntime]) (1.16.0)\n", - "Installing collected packages: sentencepiece, pyarrow-hotfix, onnx, humanfriendly, dill, responses, multiprocess, coloredlogs, onnxruntime, datasets, evaluate, optimum\n", - "Successfully installed coloredlogs-15.0.1 datasets-2.16.1 dill-0.3.7 evaluate-0.4.1 humanfriendly-10.0 multiprocess-0.70.15 onnx-1.15.0 onnxruntime-1.16.3 optimum-1.16.1 pyarrow-hotfix-0.6 responses-0.18.0 sentencepiece-0.1.99\n" - ] - } - ], - "source": [ - "!pip install --upgrade-strategy eager install optimum[onnxruntime]" + "text/plain": [ + "model.safetensors: 0%| | 0.00/438M [00:00 False\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", - "\n", - "MODEL_NAME = \"BAAI/bge-base-en\"\n", - "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(EXPORT_PATH)\n", - "\n", - "# Create directory for assets and move the tokenizer files.\n", - "# A separate folder is needed for Spark NLP.\n", - "!mkdir {EXPORT_PATH}/assets\n", - "!mv {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets/" + "text/plain": [ + "vocab.txt: 0%| | 0.00/232k [00:00 False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", + "\n", + "MODEL_NAME = \"BAAI/bge-base-en\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets\n", + "!mv {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 426316\n", + "drwxr-xr-x 2 root root 4096 Apr 12 11:13 assets\n", + "-rw-r--r-- 1 root root 735 Apr 12 11:13 config.json\n", + "-rw-r--r-- 1 root root 435811541 Apr 12 11:13 model.onnx\n", + "-rw-r--r-- 1 root root 695 Apr 12 11:13 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 1242 Apr 12 11:13 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 711396 Apr 12 11:13 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 228\n", + "-rw-r--r-- 1 root root 231508 Apr 12 11:13 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save BGE in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script\n", + "- However, we need to upgrade Spark to a more recent version to use this annotator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.3.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.3.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m568.4/568.4 kB\u001b[0m \u001b[31m38.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting pyspark==3.4.1\n", + " Downloading pyspark-3.4.1.tar.gz (310.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.8/310.8 MB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting py4j==0.10.9.7 (from pyspark==3.4.1)\n", + " Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m200.5/200.5 kB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hBuilding wheels for collected packages: pyspark\n", + " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=62465da1460fcdc99650dde11bbf8f2ea59eed17293c05cc491293d1f701c682\n", + " Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834\n", + "Successfully built pyspark\n", + "Installing collected packages: py4j, pyspark\n", + " Attempting uninstall: py4j\n", + " Found existing installation: py4j 0.10.9.5\n", + " Uninstalling py4j-0.10.9.5:\n", + " Successfully uninstalled py4j-0.10.9.5\n", + " Attempting uninstall: pyspark\n", + " Found existing installation: pyspark 3.2.3\n", + " Uninstalling pyspark-3.2.3:\n", + " Successfully uninstalled pyspark-3.2.3\n", + "Successfully installed py4j-0.10.9.7 pyspark-3.4.1\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash\n", + "! pip install -U pyspark==3.4.1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/lib/python3.10/subprocess.py:1796: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", + " self.pid = _posixsubprocess.fork_exec(\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `E5Embeddings` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `E5Embeddings` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "BGE = BGEEmbeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"bge\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
sparknlp.annotator.embeddings.bge_embeddings.BGEEmbeddings
def __init__(classname='com.johnsnowlabs.nlp.embeddings.BGEEmbeddings', java_model=None)
/usr/local/lib/python3.10/dist-packages/sparknlp/annotator/embeddings/bge_embeddings.pySentence embeddings using BGE.\n",
+       "\n",
+       " BGE, or BAAI General Embeddings, a model that can map any text to a low-dimensional dense \n",
+       "vector which can be used for tasks like retrieval, classification, clustering, or semantic search.\n",
+       "\n",
+       "Pretrained models can be loaded with `pretrained` of the companion object:\n",
+       "\n",
+       "  >>> embeddings = BGEEmbeddings.pretrained() \\\n",
+       "  ...     .setInputCols(["document"]) \\\n",
+       "  ...     .setOutputCol("bge_embeddings")\n",
+       "\n",
+       "\n",
+       "  The default model is ``"bge_base"``, if no name is provided.\n",
+       "\n",
+       "  For available pretrained models please see the\n",
+       "  `Models Hub <https://sparknlp.org/models?q=BGE>`__.\n",
+       "\n",
+       "\n",
+       "  ====================== ======================\n",
+       "  Input Annotation types Output Annotation type\n",
+       "  ====================== ======================\n",
+       "  ``DOCUMENT``            ``SENTENCE_EMBEDDINGS``\n",
+       "  ====================== ======================\n",
+       "\n",
+       "  Parameters\n",
+       "  ----------\n",
+       "  batchSize\n",
+       "      Size of every batch , by default 8\n",
+       "  dimension\n",
+       "      Number of embedding dimensions, by default 768\n",
+       "  caseSensitive\n",
+       "      Whether to ignore case in tokens for embeddings matching, by default False\n",
+       "  maxSentenceLength\n",
+       "      Max sentence length to process, by default 512\n",
+       "  configProtoBytes\n",
+       "      ConfigProto from tensorflow, serialized into byte array.\n",
+       "\n",
+       "  References\n",
+       "  ----------\n",
+       "  `C-Pack: Packaged Resources To Advance General Chinese Embedding <https://arxiv.org/pdf/2309.07597>`__\n",
+       "  `BGE Github Repository <https://github.com/FlagOpen/FlagEmbedding>`__\n",
+       "\n",
+       "  **Paper abstract**\n",
+       "\n",
+       "  *We introduce C-Pack, a package of resources that significantly advance the field of general\n",
+       "  Chinese embeddings. C-Pack includes three critical resources. \n",
+       "  1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.\n",
+       "  2) C-MTP is a massive text embedding dataset curated from labeled and unlabeled Chinese corpora\n",
+       "  for training embedding models.\n",
+       "  3) C-TEM is a family of embedding models covering multiple sizes.\n",
+       "  Our models outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the \n",
+       "  time of the release. We also integrate and optimize the entire suite of training methods for\n",
+       "  C-TEM. Along with our resources on general Chinese embedding, we release our data and models for\n",
+       "  English text embeddings. The English models achieve stateof-the-art performance on the MTEB\n",
+       "  benchmark; meanwhile, our released English data is 2 times larger than the Chinese data. All\n",
+       "  these resources are made publicly available at https://github.com/FlagOpen/FlagEmbedding.*\n",
+       "\n",
+       "  Examples\n",
+       "  --------\n",
+       "  >>> import sparknlp\n",
+       "  >>> from sparknlp.base import *\n",
+       "  >>> from sparknlp.annotator import *\n",
+       "  >>> from pyspark.ml import Pipeline\n",
+       "  >>> documentAssembler = DocumentAssembler() \\\n",
+       "  ...     .setInputCol("text") \\\n",
+       "  ...     .setOutputCol("document")\n",
+       "  >>> embeddings = BGEEmbeddings.pretrained() \\\n",
+       "  ...     .setInputCols(["document"]) \\\n",
+       "  ...     .setOutputCol("bge_embeddings")\n",
+       "  >>> embeddingsFinisher = EmbeddingsFinisher() \\\n",
+       "  ...     .setInputCols(["bge_embeddings"]) \\\n",
+       "  ...     .setOutputCols("finished_embeddings") \\\n",
+       "  ...     .setOutputAsVector(True)\n",
+       "  >>> pipeline = Pipeline().setStages([\n",
+       "  ...     documentAssembler,\n",
+       "  ...     embeddings,\n",
+       "  ...     embeddingsFinisher\n",
+       "  ... ])\n",
+       "  >>> data = spark.createDataFrame([["query: how much protein should a female eat",\n",
+       "  ... "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." +     ... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" +     ... "marathon. Check out the chart below to see how much protein you should be eating each day.",\n",
+       "  ... ]]).toDF("text")\n",
+       "  >>> result = pipeline.fit(data).transform(data)\n",
+       "  >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)\n",
+       "  +--------------------------------------------------------------------------------+\n",
+       "  |                                                                          result|\n",
+       "  +--------------------------------------------------------------------------------+\n",
+       "  |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|\n",
+       "  |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|\n",
+       "  +--------------------------------------------------------------------------------+\n",
+       "  
\n", + " \n", + "
" ], - "source": [ - "!ls -l {EXPORT_PATH}/assets" + "text/plain": [ + "sparknlp.annotator.embeddings.bge_embeddings.BGEEmbeddings" ] - }, + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BGEEmbeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BGE.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX BGE model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "8U9fAdyP07iB" - }, - "source": [ - "## Import and Save BGE in Spark NLP\n", - "\n", - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 425676\n", + "-rw-r--r-- 1 root root 435878171 Apr 12 11:18 bge_onnx\n", + "drwxr-xr-x 3 root root 4096 Apr 12 11:18 fields\n", + "drwxr-xr-x 2 root root 4096 Apr 12 11:17 metadata\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny E5 model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "BGE_loaded = BGEEmbeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"bge\")\\\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " document_assembler,\n", + " BGE_loaded\n", + " ])\n", + "\n", + "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", + "model = pipeline.fit(data)\n", + "result = model.transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "source": [], - "metadata": { - "id": "lsEhkReSVKT1" - }, - "execution_count": null, - "outputs": [] + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| embeddings|\n", + "+--------------------+\n", + "|[-0.03762533, 0.0...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "result.selectExpr(\"explode(bge.embeddings) as embeddings\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of E5 models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "04fc855de92a4ac7bde136daffa9df93": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fbe3133d96334b3588ccdc747a4dc860", + "IPY_MODEL_78e7ad669cda4b0db40de79feec10453", + "IPY_MODEL_ee18a8efc7564117b7243dcf266b6930" + ], + "layout": "IPY_MODEL_e4fdba14d81f4709b4377885faacbc84" + } }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "Tqi7wiwq07iC", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "1c02ea38-a45e-443d-aa87-7911413026fa" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting pyspark\n", - " Downloading pyspark-3.5.0.tar.gz (316.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m316.9/316.9 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n", - "Building wheels for collected packages: pyspark\n", - " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=540db909df4d61fe0e14a45bce69a7e9f9ca01fb84b9d9eccb028c1238573047\n", - " Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc\n", - "Successfully built pyspark\n", - "Installing collected packages: pyspark\n", - "Successfully installed pyspark-3.5.0\n", - "Processing ./spark_nlp-5.2.1-py2.py3-none-any.whl\n", - "Installing collected packages: spark-nlp\n", - "Successfully installed spark-nlp-5.2.1\n" - ] - } + "05843a7fbe90487395d085273a862474": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_602e54ecffd94d599fa3bc0164c61b95", + "max": 125, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2f01819dfb6a487d9fd1e9c64eaa4a2f", + "value": 125 + } + }, + "1ba47727b2d34d2cb19c2eacc94df028": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c33fe0eff454bc69582ef43ac6d6cc5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f01819dfb6a487d9fd1e9c64eaa4a2f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "31a9b57e1773438dbeecf2ddf33ef99d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "38683dbe19364b19a005b184a924a72c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9ad4628cf6674e57955dc441145fda6d", + "max": 719, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7b1b0c121332424687393b0352878e3d", + "value": 719 + } + }, + "4be5b724e5044ed5a617e9a763bc252e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e8c9a619bf2a4e1a8ab88d04e8571f36", + "placeholder": "​", + "style": "IPY_MODEL_b7f88f02a2994b78b3896ff819d319ad", + "value": " 232k/232k [00:00<00:00, 10.4MB/s]" + } + }, + "4d9593b4c3f843b3bc142b3177d88ec2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8d563f1b8401418cb650099070bd9b89", + "IPY_MODEL_da3e4bc985454dfeac9072005ecc1476", + "IPY_MODEL_9e8ebe9169c64d95a732686184fed177" ], - "source": [ - "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] + "layout": "IPY_MODEL_641bde89ac824839868903e74e921989" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "SZhqrGJg07iC" - }, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] + "4dfad0c536844190889d5a09c0efde78": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1c33fe0eff454bc69582ef43ac6d6cc5", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8de47e3311e24ee6b6b54054852ae7e7", + "value": 231508 + } }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "Pe2IVIs407iC" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()" - ] + "508795d94bb648eb91c40fa786b7ae90": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "iJ0nwY_g07iC" - }, - "source": [ - "- Let's use `loadSavedModel` functon in `E5Embeddings` which allows us to load the ONNX model\n", - "- Most params will be set automatically. They can also be set later after loading the model in `E5Embeddings` during runtime, so don't worry about setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" - ] + "5127f6c9f27f45a281e3de25b798f442": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "2ArVWTRr07iD" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "\n", - "# All these params should be identical to the original ONNX model\n", - "BGE = BGEEmbeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", - " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"bge\")" - ] + "53590c0136a44efb94ed26e30594552e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee0fa667215148858fd66cc301d1ada1", + "placeholder": "​", + "style": "IPY_MODEL_faeec3bcda1b4682a9c5754394e7515c", + "value": " 438M/438M [00:04<00:00, 33.6MB/s]" + } }, - { - "cell_type": "code", - "source": [ - "BGEEmbeddings" + "561900c921f04ab0870bd3b32badfbe1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "574b1a52edf94c24b5bc9990a91a74da": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5b9976ca08014c5ebe13c299b280bdb1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5c9c70b7f00c4afb958815a69aee0c69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5d5dbec6000a4ed1b2dca640d685bace": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "602e54ecffd94d599fa3bc0164c61b95": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "607fb88de6f44dd682878653c85a539e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "641bde89ac824839868903e74e921989": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "648d3d6bc327494cb2ccf0bf3286eb36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_31a9b57e1773438dbeecf2ddf33ef99d", + "placeholder": "​", + "style": "IPY_MODEL_a181a880c31b4e6993a7a2fc5cc5927a", + "value": "vocab.txt: 100%" + } + }, + "6b37614be8144aaa8a370cfae6b4ab75": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6c1a99fe6ec84ca7afd06ed5be648eeb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6ebe12e78b624355a69d62c827d7f085": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7095df5dce6d4bb68fa0d87aab86f78f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "72ec65cd646f47b7b8787313ed308c4a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "78e7ad669cda4b0db40de79feec10453": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_508795d94bb648eb91c40fa786b7ae90", + "max": 711396, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5d5dbec6000a4ed1b2dca640d685bace", + "value": 711396 + } + }, + "7998f7bec2c342c8b9d0622db8b47397": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7b10f1b428904bc0905e54fc030a91e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7b1b0c121332424687393b0352878e3d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "87e79a2bd1814d05b72f339f9d28a9f7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "88ccf9763afd48019f621b8953211a92": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_574b1a52edf94c24b5bc9990a91a74da", + "placeholder": "​", + "style": "IPY_MODEL_7095df5dce6d4bb68fa0d87aab86f78f", + "value": "config.json: 100%" + } + }, + "8af20d759d014ca78d2328346ba9c57b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f0fc8b2e6ff840309c9a219ea217fcc0", + "placeholder": "​", + "style": "IPY_MODEL_95c146a715a641f6996ecd333a78881d", + "value": " 719/719 [00:00<00:00, 9.30kB/s]" + } + }, + "8d563f1b8401418cb650099070bd9b89": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c4c1fec0be994ac89622f441e4ff3354", + "placeholder": "​", + "style": "IPY_MODEL_7b10f1b428904bc0905e54fc030a91e8", + "value": "tokenizer_config.json: 100%" + } + }, + "8de47e3311e24ee6b6b54054852ae7e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "942546f9f9d34fc6b978d3efadeddb06": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "95c146a715a641f6996ecd333a78881d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "98ab41762ba1448a884693fa58dae307": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "98bdb7fefacf4f69ae6476621725637d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_648d3d6bc327494cb2ccf0bf3286eb36", + "IPY_MODEL_4dfad0c536844190889d5a09c0efde78", + "IPY_MODEL_4be5b724e5044ed5a617e9a763bc252e" ], - "metadata": { - "id": "zWw5xrO-53xk", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "58f812d5-e2a8-4ab0-cc37-1fa496213458" - }, - "execution_count": 8, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "sparknlp.annotator.embeddings.bge_embeddings.BGEEmbeddings" - ] - }, - "metadata": {}, - "execution_count": 8 - } - ] + "layout": "IPY_MODEL_e84a92e1d713426c8fe341a623904c1c" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "F9TmAQx_07iD" - }, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] + "9ad4628cf6674e57955dc441145fda6d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "KsM5Ect607iD" - }, - "outputs": [], - "source": [ - "BGE.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" - ] + "9e8ebe9169c64d95a732686184fed177": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1ba47727b2d34d2cb19c2eacc94df028", + "placeholder": "​", + "style": "IPY_MODEL_7998f7bec2c342c8b9d0622db8b47397", + "value": " 366/366 [00:00<00:00, 19.4kB/s]" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "-mSnWope07iD" - }, - "source": [ - "Let's clean up stuff we don't need anymore" - ] + "a181a880c31b4e6993a7a2fc5cc5927a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "AhmAF6KZ07iD" - }, - "outputs": [], - "source": [ - "!rm -rf {EXPORT_PATH}" - ] + "a483a2cc4c7449db85992600938d1fd4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_87e79a2bd1814d05b72f339f9d28a9f7", + "max": 437955512, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_98ab41762ba1448a884693fa58dae307", + "value": 437955512 + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "207B-CAx07iE" - }, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your ONNX BGE model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] + "a6bd88a906a74c2483fb5f1ef48ff445": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "vnWXwenx07iE", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "156ad10b-73c5-489d-e961-d9f3688b2705" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "total 425676\n", - "-rw-r--r-- 1 root root 435878170 Jan 1 09:41 bge_onnx\n", - "drwxr-xr-x 3 root root 4096 Jan 1 09:41 fields\n", - "drwxr-xr-x 2 root root 4096 Jan 1 09:41 metadata\n" - ] - } + "a87d5e5fa0a94fdc959435575a1f0b21": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a95909b2bc4841f4bd64dc9f4732d9c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6ebe12e78b624355a69d62c827d7f085", + "placeholder": "​", + "style": "IPY_MODEL_5c9c70b7f00c4afb958815a69aee0c69", + "value": " 125/125 [00:00<00:00, 9.31kB/s]" + } + }, + "b7f88f02a2994b78b3896ff819d319ad": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c4c1fec0be994ac89622f441e4ff3354": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "caf7c06fa3b04d33ad38ffcac64c7dfe": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cec3087741c14a54a25fa6228c28aae3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_88ccf9763afd48019f621b8953211a92", + "IPY_MODEL_38683dbe19364b19a005b184a924a72c", + "IPY_MODEL_8af20d759d014ca78d2328346ba9c57b" ], - "source": [ - "! ls -l {MODEL_NAME}_spark_nlp" - ] + "layout": "IPY_MODEL_dd80da9f70c041c186b2fe9a347e05ec" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "LUlYEn6c07iE" - }, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny E5 model 😊" - ] + "d6ed52dfbff341dba71ec72853407d80": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_561900c921f04ab0870bd3b32badfbe1", + "placeholder": "​", + "style": "IPY_MODEL_edf3057d83c84f71afbd77a839fb1a77", + "value": "special_tokens_map.json: 100%" + } }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "Ltaki7iL07iE" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "\n", - "from sparknlp.base import *\n", - "from sparknlp.annotator import *\n", - "\n", - "document_assembler = DocumentAssembler()\\\n", - " .setInputCol(\"text\")\\\n", - " .setOutputCol(\"document\")\n", - "\n", - "BGE_loaded = BGEEmbeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", - " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"bge\")\\\n", - "\n", - "pipeline = Pipeline(\n", - " stages = [\n", - " document_assembler,\n", - " BGE_loaded\n", - " ])\n", - "\n", - "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", - "model = pipeline.fit(data)\n", - "result = model.transform(data)" - ] + "da3e4bc985454dfeac9072005ecc1476": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_caf7c06fa3b04d33ad38ffcac64c7dfe", + "max": 366, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_607fb88de6f44dd682878653c85a539e", + "value": 366 + } }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "xwdX1Ler07iE", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "e26c6baa-2952-47af-e386-fdd9d103b5bf" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+--------------------+\n", - "| embeddings|\n", - "+--------------------+\n", - "|[-0.18189742, -0....|\n", - "+--------------------+\n", - "\n" - ] - } + "dd80da9f70c041c186b2fe9a347e05ec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4fdba14d81f4709b4377885faacbc84": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e84a92e1d713426c8fe341a623904c1c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e8c9a619bf2a4e1a8ab88d04e8571f36": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e8fa9fb8456a4a499ae3ab348a89f0e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6c1a99fe6ec84ca7afd06ed5be648eeb", + "placeholder": "​", + "style": "IPY_MODEL_a87d5e5fa0a94fdc959435575a1f0b21", + "value": "model.safetensors: 100%" + } + }, + "edf3057d83c84f71afbd77a839fb1a77": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ee0fa667215148858fd66cc301d1ada1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee18a8efc7564117b7243dcf266b6930": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_72ec65cd646f47b7b8787313ed308c4a", + "placeholder": "​", + "style": "IPY_MODEL_942546f9f9d34fc6b978d3efadeddb06", + "value": " 711k/711k [00:00<00:00, 26.5MB/s]" + } + }, + "ef655c7c3a08470fa85d00bc55b2bb51": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e8fa9fb8456a4a499ae3ab348a89f0e5", + "IPY_MODEL_a483a2cc4c7449db85992600938d1fd4", + "IPY_MODEL_53590c0136a44efb94ed26e30594552e" ], - "source": [ - "result.selectExpr(\"explode(bge.embeddings) as embeddings\").show()" - ] + "layout": "IPY_MODEL_6b37614be8144aaa8a370cfae6b4ab75" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "BQSavS4z07iF" - }, - "source": [ - "That's it! You can now go wild and use hundreds of E5 models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] + "f0fc8b2e6ff840309c9a219ea217fcc0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "F20-61Pd_rvB" - }, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - }, - "accelerator": "GPU", - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "b6aeecd601db44c4b8d92abef34156c2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_6f2b77719e3344b1954eaaf46e7d512e", - "IPY_MODEL_73b4a51894114681abdc74131668e43e", - "IPY_MODEL_2c75bce84ce749c797446623bfcfaf39" - ], - "layout": "IPY_MODEL_48cff192c62444a292343467b1eec46a" - } - }, - "6f2b77719e3344b1954eaaf46e7d512e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_6ba1b72f64d94cd6878bce77ca47172b", - "placeholder": "​", - "style": "IPY_MODEL_bf2876273c1b409f969cbee51441309b", - "value": "config.json: 100%" - } - }, - "73b4a51894114681abdc74131668e43e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a4ca89a620284318996aa3ef1e1b9d28", - "max": 719, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_26970bab519f4c2293bb2d88455909a7", - "value": 719 - } - }, - "2c75bce84ce749c797446623bfcfaf39": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_bc73afc3091d40ca9c61d9352611a7a7", - "placeholder": "​", - "style": "IPY_MODEL_0e2c74dfa0364df6a4eb7d0b5f41e694", - "value": " 719/719 [00:00<00:00, 40.7kB/s]" - } - }, - "48cff192c62444a292343467b1eec46a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6ba1b72f64d94cd6878bce77ca47172b": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bf2876273c1b409f969cbee51441309b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a4ca89a620284318996aa3ef1e1b9d28": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "26970bab519f4c2293bb2d88455909a7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "bc73afc3091d40ca9c61d9352611a7a7": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0e2c74dfa0364df6a4eb7d0b5f41e694": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a14c903e07e84eebbef3d4da3f9ffb53": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_563eba9b7c924ba9ad0e8abdb139a8e2", - "IPY_MODEL_82f71bd255b6416f9808f5805f017a26", - "IPY_MODEL_1c126c817ec941c58914112992764f24" - ], - "layout": "IPY_MODEL_394c30efbf924843869e9073a95af196" - } - }, - "563eba9b7c924ba9ad0e8abdb139a8e2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fdb324b3829f4a0791ebcfc11499062a", - "placeholder": "​", - "style": "IPY_MODEL_d0d4e25e98f941ad91481b1d3e0aeb7b", - "value": "model.safetensors: 100%" - } - }, - "82f71bd255b6416f9808f5805f017a26": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3e8ceaae9fe7468a9e14b2bbab3ece53", - "max": 437955512, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_34a28ead887e450b802ea8be16c0f592", - "value": 437955512 - } - }, - "1c126c817ec941c58914112992764f24": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_086dbc2868c0488b9d9dc5d3c8972133", - "placeholder": "​", - "style": "IPY_MODEL_9221a3d70fb644749e9828ea60b6ee3d", - "value": " 438M/438M [00:11<00:00, 39.8MB/s]" - } - }, - "394c30efbf924843869e9073a95af196": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fdb324b3829f4a0791ebcfc11499062a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d0d4e25e98f941ad91481b1d3e0aeb7b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3e8ceaae9fe7468a9e14b2bbab3ece53": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "34a28ead887e450b802ea8be16c0f592": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "086dbc2868c0488b9d9dc5d3c8972133": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9221a3d70fb644749e9828ea60b6ee3d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b94853f0f9c6437f82170bf67e9debc9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_0dac17a4e0304f33b1ad997590a663ad", - "IPY_MODEL_9f785d2b81c54695997e370210a61226", - "IPY_MODEL_26fa5822a4974257b8862581a60e3179" - ], - "layout": "IPY_MODEL_cf7cfc607b2041a993df35b525aa0d45" - } - }, - "0dac17a4e0304f33b1ad997590a663ad": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fa84fc0acea44fde89aa9ffd93a5aa9a", - "placeholder": "​", - "style": "IPY_MODEL_0b9da9243f9e49bdb6411c131a7e4682", - "value": "tokenizer_config.json: 100%" - } - }, - "9f785d2b81c54695997e370210a61226": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_cec343ddabf94dd0832e4ac7aec14ecc", - "max": 366, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_82e0ea7d46b5465ebc8faad3beab43b5", - "value": 366 - } - }, - "26fa5822a4974257b8862581a60e3179": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fb7485359aa74e82963d8255b41cf81f", - "placeholder": "​", - "style": "IPY_MODEL_aee26846c0634c9b8ae61b4dc14ed675", - "value": " 366/366 [00:00<00:00, 26.2kB/s]" - } - }, - "cf7cfc607b2041a993df35b525aa0d45": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fa84fc0acea44fde89aa9ffd93a5aa9a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0b9da9243f9e49bdb6411c131a7e4682": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "cec343ddabf94dd0832e4ac7aec14ecc": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "82e0ea7d46b5465ebc8faad3beab43b5": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "fb7485359aa74e82963d8255b41cf81f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "aee26846c0634c9b8ae61b4dc14ed675": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e1e339b3d7a14048b0fc35b7d05cb60a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_b0a1cd96c31848b78b150258b0637a82", - "IPY_MODEL_84ba32d3a129477591e78264ddf22cb7", - "IPY_MODEL_8bdeba24a5b0463abcaa0213674b388b" - ], - "layout": "IPY_MODEL_82052250508f40b483fd5d5a2e3bc316" - } - }, - "b0a1cd96c31848b78b150258b0637a82": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5bc65236ab3443eb89b761452f5f776c", - "placeholder": "​", - "style": "IPY_MODEL_30696b2ee77846149f4b0d88f9d1cc32", - "value": "vocab.txt: 100%" - } - }, - "84ba32d3a129477591e78264ddf22cb7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7c03e098f4aa4e61bc13da78d6b7b7d8", - "max": 231508, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_de6ef93762a047998bb0d888d2a2720a", - "value": 231508 - } - }, - "8bdeba24a5b0463abcaa0213674b388b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_291ce98198874d7ebac0affca544861c", - "placeholder": "​", - "style": "IPY_MODEL_749855c3e9ac4566a509d834204ab0e4", - "value": " 232k/232k [00:00<00:00, 1.42MB/s]" - } - }, - "82052250508f40b483fd5d5a2e3bc316": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5bc65236ab3443eb89b761452f5f776c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "30696b2ee77846149f4b0d88f9d1cc32": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7c03e098f4aa4e61bc13da78d6b7b7d8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "de6ef93762a047998bb0d888d2a2720a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "291ce98198874d7ebac0affca544861c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "749855c3e9ac4566a509d834204ab0e4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "82438bb56d4841eaa7e4ff609e79edd9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_2aff7e730cce4d468136bab9c735d2e9", - "IPY_MODEL_517eb17af21949908b597e76f1c835b4", - "IPY_MODEL_0600b51130bc4f5bbe68e38bb5662ce1" - ], - "layout": "IPY_MODEL_0b0f81687aac4432bdc06a75bcc53320" - } - }, - "2aff7e730cce4d468136bab9c735d2e9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f013cfd30cd948159d6a3216dc00ab94", - "placeholder": "​", - "style": "IPY_MODEL_21f7d6700ffe4ce09d67147c17d13f20", - "value": "tokenizer.json: 100%" - } - }, - "517eb17af21949908b597e76f1c835b4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_69ac10476cd5458a8b06333a559c9d28", - "max": 711396, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_2e3a3b76638c44d481569a7d16d71cb7", - "value": 711396 - } - }, - "0600b51130bc4f5bbe68e38bb5662ce1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e0112abb7d0643f981094fb8ed67390f", - "placeholder": "​", - "style": "IPY_MODEL_0d7d08ec18b04866ae4c9f0870d8d8f6", - "value": " 711k/711k [00:00<00:00, 2.18MB/s]" - } - }, - "0b0f81687aac4432bdc06a75bcc53320": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f013cfd30cd948159d6a3216dc00ab94": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "21f7d6700ffe4ce09d67147c17d13f20": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "69ac10476cd5458a8b06333a559c9d28": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2e3a3b76638c44d481569a7d16d71cb7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "e0112abb7d0643f981094fb8ed67390f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0d7d08ec18b04866ae4c9f0870d8d8f6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "85542508f8fa4545a8b48829d5b69ded": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_1338f4e57b554fafba3d0fb2acf59b02", - "IPY_MODEL_bfcf2b27861a4c7b9451d6dc6ef600b6", - "IPY_MODEL_fdf60696d8be49ae9f9d98200b7c35e8" - ], - "layout": "IPY_MODEL_3febcb4b4101454eafbe2be02b06d7b4" - } - }, - "1338f4e57b554fafba3d0fb2acf59b02": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fcaf78ec75af4b76a95de9d73194516d", - "placeholder": "​", - "style": "IPY_MODEL_4b21dc70cd214d64ad2da330f38e1638", - "value": "special_tokens_map.json: 100%" - } - }, - "bfcf2b27861a4c7b9451d6dc6ef600b6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_66652fb210ff46cea975b7f02c34df5d", - "max": 125, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_7fa5ffc59fb14dc39feac160a10bce66", - "value": 125 - } - }, - "fdf60696d8be49ae9f9d98200b7c35e8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_03a985766f57423991c4a416d487b451", - "placeholder": "​", - "style": "IPY_MODEL_a40c77d69fe641c298616c62381faaf1", - "value": " 125/125 [00:00<00:00, 5.69kB/s]" - } - }, - "3febcb4b4101454eafbe2be02b06d7b4": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fcaf78ec75af4b76a95de9d73194516d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4b21dc70cd214d64ad2da330f38e1638": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "66652fb210ff46cea975b7f02c34df5d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7fa5ffc59fb14dc39feac160a10bce66": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "03a985766f57423991c4a416d487b451": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a40c77d69fe641c298616c62381faaf1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } + "faeec3bcda1b4682a9c5754394e7515c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fbe3133d96334b3588ccdc747a4dc860": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5127f6c9f27f45a281e3de25b798f442", + "placeholder": "​", + "style": "IPY_MODEL_5b9976ca08014c5ebe13c299b280bdb1", + "value": "tokenizer.json: 100%" + } + }, + "ff2effbb2e10466e850209f56c354636": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d6ed52dfbff341dba71ec72853407d80", + "IPY_MODEL_05843a7fbe90487395d085273a862474", + "IPY_MODEL_a95909b2bc4841f4bd64dc9f4732d9c8" + ], + "layout": "IPY_MODEL_a6bd88a906a74c2483fb5f1ef48ff445" + } } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForQuestionAnswering.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForQuestionAnswering.ipynb index 7972181e127b7e..4827be2fa9373f 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForQuestionAnswering.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForQuestionAnswering.ipynb @@ -36,7 +36,7 @@ "metadata": {}, "source": [ "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", - "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", "- Albert uses SentencePiece, so we will have to install that as well" ] }, @@ -116,7 +116,7 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece tensorflow" + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum sentencepiece tensorflow" ] }, { diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForSequenceClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForSequenceClassification.ipynb index 7f27dd9ef33c49..9d29064eb44abc 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForSequenceClassification.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForSequenceClassification.ipynb @@ -36,7 +36,7 @@ "metadata": {}, "source": [ "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", - "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", "- Albert uses SentencePiece, so we will have to install that as well" ] }, @@ -121,7 +121,7 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece tensorflow" + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum sentencepiece tensorflow" ] }, { diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForTokenClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForTokenClassification.ipynb index 43b9511376ff72..a42aa763837c37 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForTokenClassification.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForTokenClassification.ipynb @@ -6,7 +6,7 @@ "source": [ "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace%20ONNX%20in%20Spark%20NLP%20-%20AlbertForTokenClassification.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace%20ONNX%20in%20Spark%20NLP%20-%20BertForTokenClassification.ipynb)" ] }, { @@ -116,7 +116,7 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece tensorflow" + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum sentencepiece tensorflow" ] }, { diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForZeroShotClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForZeroShotClassification.ipynb index 00bdadfdf0f31a..5fc57b584455a8 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForZeroShotClassification.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForZeroShotClassification.ipynb @@ -1,2532 +1,2383 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForZeroShotClassification.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import ONNX BertForZeroShotClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `BertForZeroShotClassification ` is only available since in `Spark NLP 5.2.4` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import BERT models trained/fine-tuned for zero shot classification via `BertForSequenceClassification` or `TFBertForSequenceClassification`. These models are usually under `Zero-Shot Classification` category and have `bert` in their labels\n", + "- Reference: [TFBertForSequenceClassification](https://huggingface.co/transformers/model_doc/bert.html#tfbertforsequenceclassification)\n", + "- Some [example models](https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=downloads&search=bert)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- Albert uses SentencePiece, so we will have to install that as well" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "tIhDjN37_WEc" - }, - "source": [ - "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForZeroShotClassification.ipynb)" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m402.5/402.5 kB\u001b[0m \u001b[31m36.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m81.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.8/455.8 kB\u001b[0m \u001b[31m41.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m95.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m23.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m40.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m79.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m92.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m72.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m88.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m81.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m72.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m98.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m79.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m104.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m87.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m80.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m104.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m77.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m104.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m47.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m26.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m42.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m85.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m102.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m41.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m78.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m81.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m37.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m110.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m50.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m105.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m73.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m27.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m59.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m97.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m98.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m40.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m39.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m40.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m37.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m60.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m64.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m61.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m71.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m91.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m465.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum sentencepiece tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [NbAiLab/nb-bert-base-mnli](https://huggingface.co/NbAiLab/nb-bert-base-mnli) model from HuggingFace as an example and load it as a `ORTModelForSequenceClassification`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "jtvJkUO7_WEf" - }, - "source": [ - "## Import ONNX BertForZeroShotClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", - "- `BertForZeroShotClassification ` is only available since in `Spark NLP 5.2.4` and after. So please make sure you have upgraded to the latest Spark NLP release\n", - "- You can import BERT models trained/fine-tuned for zero shot classification via `BertForSequenceClassification` or `TFBertForSequenceClassification`. These models are usually under `Zero-Shot Classification` category and have `bert` in their labels\n", - "- Reference: [TFBertForSequenceClassification](https://huggingface.co/transformers/model_doc/bert.html#tfbertforsequenceclassification)\n", - "- Some [example models](https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=downloads&search=bert)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "Biy6z0oM_WEg" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dc2010ab9c8b4ce2a61fd2f5bf584466", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Export and Save HuggingFace model" + "text/plain": [ + "config.json: 0%| | 0.00/639 [00:00=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", - "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece tensorflow" + "text/plain": [ + "model.safetensors: 0%| | 0.00/438M [00:00 False\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForSequenceClassification\n", - "import tensorflow as tf\n", - "\n", - "MODEL_NAME = 'aloxatel/bert-base-mnli'\n", - "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(EXPORT_PATH)" + "text/plain": [ + "vocab.txt: 0%| | 0.00/232k [00:00 False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForSequenceClassification\n", + "import tensorflow as tf\n", + "\n", + "MODEL_NAME = 'aloxatel/bert-base-mnli'\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(EXPORT_PATH)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "HKR22Lek_WEp" - }, - "source": [ - "- As you can see, we need to move `vocabs.txt` from the tokenizer to assets folder which Spark NLP will look for\n", - "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 428876\n", + "-rw-r--r-- 1 root root 813 Feb 1 10:15 config.json\n", + "-rw-r--r-- 1 root root 438204942 Feb 1 10:15 model.onnx\n", + "-rw-r--r-- 1 root root 125 Feb 1 10:15 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 366 Feb 1 10:15 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 711396 Feb 1 10:15 tokenizer.json\n", + "-rw-r--r-- 1 root root 231508 Feb 1 10:15 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- As you can see, we need to move `vocabs.txt` from the tokenizer to assets folder which Spark NLP will look for\n", + "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get label2id dictionary\n", + "labels = ort_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(EXPORT_PATH + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mv {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0Fz5W4s5_WEq" - }, - "outputs": [], - "source": [ - "!mkdir {EXPORT_PATH}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "onnx_models/aloxatel/bert-base-mnli:\n", + "total 428652\n", + "drwxr-xr-x 2 root root 4096 Feb 1 10:15 assets\n", + "-rw-r--r-- 1 root root 813 Feb 1 10:15 config.json\n", + "-rw-r--r-- 1 root root 438204942 Feb 1 10:15 model.onnx\n", + "-rw-r--r-- 1 root root 125 Feb 1 10:15 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 366 Feb 1 10:15 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 711396 Feb 1 10:15 tokenizer.json\n", + "\n", + "onnx_models/aloxatel/bert-base-mnli/assets:\n", + "total 232\n", + "-rw-r--r-- 1 root root 32 Feb 1 10:15 labels.txt\n", + "-rw-r--r-- 1 root root 231508 Feb 1 10:15 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -lR {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save BertForZeroShotClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFR7rLmp_WEq" - }, - "outputs": [], - "source": [ - "# get label2id dictionary\n", - "labels = ort_model.config.id2label\n", - "# sort the dictionary based on the id\n", - "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", - "\n", - "with open(EXPORT_PATH + '/assets/labels.txt', 'w') as f:\n", - " f.write('\\n'.join(labels))" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-09-29 19:41:03-- http://setup.johnsnowlabs.com/colab.sh\n", + "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", + "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", + "HTTP request sent, awaiting response... 302 Moved Temporarily\n", + "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", + "--2023-09-29 19:41:04-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1191 (1.2K) [text/plain]\n", + "Saving to: ‘STDOUT’\n", + "\n", + "- 100%[===================>] 1.16K --.-KB/s in 0s \n", + "\n", + "2023-09-29 19:41:04 (106 MB/s) - written to stdout [1191/1191]\n", + "\n", + "Installing PySpark 3.2.3 and Spark NLP 5.1.2\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.2\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m536.3/536.3 kB\u001b[0m \u001b[31m38.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m19.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4vOLnld4_WEq" - }, - "outputs": [], - "source": [ - "!mv {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `BertForZeroShotClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `BertForZeroShotClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "zero_shot_classifier = BertForZeroShotClassification.loadSavedModel(\n", + " EXPORT_PATH,\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document\", \"token\"]) \\\n", + " .setOutputCol(\"class\") \\\n", + " .setCandidateLabels([\"urgent\", \"mobile\", \"travel\", \"movie\", \"music\", \"sport\", \"weather\", \"technology\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zero_shot_classifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(MODEL_NAME))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your BertForZeroShotClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "hNPl3hqx_WEr" - }, - "source": [ - "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 429464\n", + "-rw-r--r-- 1 root root 439759046 Sep 29 19:42 bert_classification_onnx\n", + "drwxr-xr-x 4 root root 4096 Sep 29 19:42 fields\n", + "drwxr-xr-x 2 root root 4096 Sep 29 19:42 metadata\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BertForZeroShotClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zero_shot_classifier_loaded = BertForZeroShotClassification.load(\"./{}_spark_nlp_onnx\".format(MODEL_NAME))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uQNah_q7_WEr", - "outputId": "157c7f9e-9568-494c-d7c4-aa90d49942ee", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "onnx_models/aloxatel/bert-base-mnli:\n", - "total 428652\n", - "drwxr-xr-x 2 root root 4096 Feb 1 10:15 assets\n", - "-rw-r--r-- 1 root root 813 Feb 1 10:15 config.json\n", - "-rw-r--r-- 1 root root 438204942 Feb 1 10:15 model.onnx\n", - "-rw-r--r-- 1 root root 125 Feb 1 10:15 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 366 Feb 1 10:15 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 711396 Feb 1 10:15 tokenizer.json\n", - "\n", - "onnx_models/aloxatel/bert-base-mnli/assets:\n", - "total 232\n", - "-rw-r--r-- 1 root root 32 Feb 1 10:15 labels.txt\n", - "-rw-r--r-- 1 root root 231508 Feb 1 10:15 vocab.txt\n" - ] - } - ], - "source": [ - "!ls -lR {EXPORT_PATH}" + "data": { + "text/plain": [ + "['NEU', 'POS', 'NEG']" ] - }, + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "zero_shot_classifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "0s4GZ9SD_WEr" - }, - "source": [ - "## Import and Save BertForZeroShotClassification in Spark NLP\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------------+------+\n", + "| text|result|\n", + "+------------------+------+\n", + "|Te quiero. Te amo.| [POS]|\n", + "+------------------+------+\n", + "\n" + ] + } + ], + "source": [ + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline, PipelineModel\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "tokenizer = Tokenizer().setInputCols(\"document\").setOutputCol(\"token\")\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " zero_shot_classifier_loaded\n", + "])\n", + "\n", + "text = [[\"I have a problem with my iphone that needs to be resolved asap!!\"],\n", + " [\"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\"],\n", + " [\"I have a phone and I love it!\"],\n", + " [\"I really want to visit Germany and I am planning to go there next year.\"],\n", + " [\"Let's watch some movies tonight! I am in the mood for a horror movie.\"],\n", + " [\"Have you watched the match yesterday? It was a great game!\"],\n", + " [\"We need to harry up and get to the airport. We are going to miss our flight!\"]]\n", + "\n", + "# create a DataFrame in PySpark\n", + "inputDataset = spark.createDataFrame(text, [\"text\"])\n", + "model = pipeline.fit(inputDataset)\n", + "model.transform(inputDataset).select(\"class.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of `BertForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "05e2cc6a1b8d445b9f4c75b8eafc8569": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9ff55de49cfb4c5da242f79c29d765c0", + "placeholder": "​", + "style": "IPY_MODEL_2ca6b6198d1247a086ad6c77ef98745d", + "value": "vocab.txt: 100%" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "7JnUm4oH_WEr" - }, - "source": [ - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script" - ] + "081a21e0131647a4a934b80ce35f99fe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nkuttlrp_WEr", - "outputId": "b1b0d012-62ea-4567-e9e5-3f8f466ceda3" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-09-29 19:41:03-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2023-09-29 19:41:04-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2023-09-29 19:41:04 (106 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 5.1.2\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.2\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m536.3/536.3 kB\u001b[0m \u001b[31m38.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m19.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } + "110a9c4ba8144b23bec988991c50ea69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "18cba1a7ebf54e389b0512644fd4eb8d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1b35a6c060fb45bb8a7c50e4e58b3e70": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2650006b821e493abfb0005f680b4bbb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2b7c0e9b54a04512be2914256c92a8b0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_374ab98ad46c4518b0800c58d12702b5", + "placeholder": "​", + "style": "IPY_MODEL_fc725112860647558d1c3614ada8be40", + "value": " 112/112 [00:00<00:00, 7.23kB/s]" + } + }, + "2ca6b6198d1247a086ad6c77ef98745d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2e6d65b7f9e943c294b604769f714bdb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f9b2816622b4ea78d333b28d2e1d528": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5c9ee9d2bfff4dd9bf111523b15f28b8", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4fc274d64c994e19851775ff1f5b7bfd", + "value": 231508 + } + }, + "31926af1bcb341f1805e31c9f6a105a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "31e8635f8ed543fe96980f24c1977435": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "374ab98ad46c4518b0800c58d12702b5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3a713cbb11ee47ca8a1c5e09c5e2134c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f9a44a68c55546a6bd1425f649e84fce", + "IPY_MODEL_a65ebf8907284391bd098d8906997a65", + "IPY_MODEL_5a00dcdd1b3840538bbdffe4c9dfccac" ], - "source": [ - "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] + "layout": "IPY_MODEL_fea490f5d6ce4b0cb044aed1fcac40c6" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "7_MYXgjf_WEs" - }, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] + "46ce6bbe733144b493460694864e042f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_892540858af848e2989805c7facf9eaa", + "IPY_MODEL_8bc01972359142c6b20c98fee0608c26", + "IPY_MODEL_9fb30845b7e245829867afeaac10e8a1" + ], + "layout": "IPY_MODEL_a04cbc3089c24b8d9056ed56dcb383fe" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kOc3hrRI_WEs", - "outputId": "53287f60-6d7e-46aa-8845-d4789596b0a6" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apache Spark version: 3.2.3\n" - ] - } + "4a79b68ed33047edb71fcc08d4e098ad": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4bbc0cfc3e194e58b688d1ddad08e355": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_05e2cc6a1b8d445b9f4c75b8eafc8569", + "IPY_MODEL_2f9b2816622b4ea78d333b28d2e1d528", + "IPY_MODEL_eac29afa1787461d80b19843a322b35b" ], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()\n", - "\n", - "print(\"Apache Spark version: {}\".format(spark.version))" - ] + "layout": "IPY_MODEL_600ef6e0875047feb0edeb29f1e2cec4" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "FA3w5kEt_WEs" - }, - "source": [ - "- Let's use `loadSavedModel` functon in `BertForZeroShotClassification` which allows us to load TensorFlow model in SavedModel format\n", - "- Most params can be set later when you are loading this model in `BertForZeroShotClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" - ] + "4d4920f93b2c4f1c96670ac9396fb244": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ab2CrizU_WEs" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *\n", - "\n", - "zero_shot_classifier = BertForZeroShotClassification.loadSavedModel(\n", - " '{}/saved_model/1'.format(MODEL_NAME),\n", - " spark\n", - " )\\\n", - " .setInputCols([\"document\", \"token\"]) \\\n", - " .setOutputCol(\"class\") \\\n", - " .setCandidateLabels([\"urgent\", \"mobile\", \"travel\", \"movie\", \"music\", \"sport\", \"weather\", \"technology\"])" - ] + "4fc274d64c994e19851775ff1f5b7bfd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "paajUy-T_WEs" - }, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] + "557b68d5c6bd47c586571bb6d03fb75b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2650006b821e493abfb0005f680b4bbb", + "placeholder": "​", + "style": "IPY_MODEL_5ee1cbef2ffe4cc2b1af445726a25871", + "value": "config.json: 100%" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DiXFayTa_WEs" - }, - "outputs": [], - "source": [ - "zero_shot_classifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(MODEL_NAME))" - ] + "5a00dcdd1b3840538bbdffe4c9dfccac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8d554fc5ff7e4a149fccda1e5d197326", + "placeholder": "​", + "style": "IPY_MODEL_a581cd1016fb4d9083039b55162d03c8", + "value": " 438M/438M [00:27<00:00, 17.0MB/s]" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "5yTVmF8r_WEt" - }, - "source": [ - "Let's clean up stuff we don't need anymore" - ] + "5c9ee9d2bfff4dd9bf111523b15f28b8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1PPJw45m_WEt" - }, - "outputs": [], - "source": [ - "!rm -rf {EXPORT_PATH}" - ] + "5ee1cbef2ffe4cc2b1af445726a25871": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "F5uVLNjp_WEt" - }, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your BertForZeroShotClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] + "600ef6e0875047feb0edeb29f1e2cec4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XPs8R23U_WEt", - "outputId": "b302795a-74be-4859-96b8-dfefe9fe5b69" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 429464\n", - "-rw-r--r-- 1 root root 439759046 Sep 29 19:42 bert_classification_onnx\n", - "drwxr-xr-x 4 root root 4096 Sep 29 19:42 fields\n", - "drwxr-xr-x 2 root root 4096 Sep 29 19:42 metadata\n" - ] - } - ], - "source": [ - "! ls -l {MODEL_NAME}_spark_nlp_onnx" - ] + "608a47e426c44acea7e60f8b386e0677": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a273ab182aab40e79a1f3b8517452322", + "placeholder": "​", + "style": "IPY_MODEL_31926af1bcb341f1805e31c9f6a105a6", + "value": " 639/639 [00:00<00:00, 33.1kB/s]" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "AEsYTR2T_WEt" - }, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BertForZeroShotClassification model 😊" - ] + "698707314ce945198dbf494154d0d3d9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yozw4m76_WEt" - }, - "outputs": [], - "source": [ - "sequenceClassifier_loaded = BertForZeroShotClassification.load(\"./{}_spark_nlp_onnx\".format(MODEL_NAME))\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"class\")" - ] + "76891611ea974bed8ce9825197d6ede6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "rATPyeeR_WEu" - }, - "source": [ - "You can see what labels were used to train this model via `getClasses` function:" - ] + "7c20f9b4aa514244b413026b484773be": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fMNJ9mfr_WEu", - "outputId": "46d0ab80-fac2-4cb7-e091-fa8895e31217" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['NEU', 'POS', 'NEG']" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } + "7cfb8103d0914f8db95a98d0298241c1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "892540858af848e2989805c7facf9eaa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_698707314ce945198dbf494154d0d3d9", + "placeholder": "​", + "style": "IPY_MODEL_76891611ea974bed8ce9825197d6ede6", + "value": "tokenizer_config.json: 100%" + } + }, + "8bc01972359142c6b20c98fee0608c26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4a79b68ed33047edb71fcc08d4e098ad", + "max": 48, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_110a9c4ba8144b23bec988991c50ea69", + "value": 48 + } + }, + "8c3c8bc1c60c4be1b2ce6eb91fb3d80c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8d554fc5ff7e4a149fccda1e5d197326": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "98ccf59935c148af9266163fa8e12f36": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9fb30845b7e245829867afeaac10e8a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1b35a6c060fb45bb8a7c50e4e58b3e70", + "placeholder": "​", + "style": "IPY_MODEL_7cfb8103d0914f8db95a98d0298241c1", + "value": " 48.0/48.0 [00:00<00:00, 3.08kB/s]" + } + }, + "9ff55de49cfb4c5da242f79c29d765c0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a04cbc3089c24b8d9056ed56dcb383fe": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a273ab182aab40e79a1f3b8517452322": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a581cd1016fb4d9083039b55162d03c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a65ebf8907284391bd098d8906997a65": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b637bec487a949dbad7283eedc25cf51", + "max": 437961724, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_081a21e0131647a4a934b80ce35f99fe", + "value": 437961724 + } + }, + "a82f20d3bade4075a67de714763267d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7c20f9b4aa514244b413026b484773be", + "max": 112, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8c3c8bc1c60c4be1b2ce6eb91fb3d80c", + "value": 112 + } + }, + "b637bec487a949dbad7283eedc25cf51": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bd84a229a7e2423e9afff844346027a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cc1743bea22342ff9f81223b231ac387": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ce68fc9368cb4860a3c65fb33a3b9362": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2e6d65b7f9e943c294b604769f714bdb", + "placeholder": "​", + "style": "IPY_MODEL_e6ca865f09b440f0b646f74b85a930c2", + "value": "special_tokens_map.json: 100%" + } + }, + "d046f93abb3945c9beec40f5b5e7034d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dc2010ab9c8b4ce2a61fd2f5bf584466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_557b68d5c6bd47c586571bb6d03fb75b", + "IPY_MODEL_fb27d436ed9b41b3813afdabc7e7168e", + "IPY_MODEL_608a47e426c44acea7e60f8b386e0677" ], - "source": [ - "# .getClasses was introduced in spark-nlp==3.4.0\n", - "sequenceClassifier_loaded.getClasses()" - ] + "layout": "IPY_MODEL_d046f93abb3945c9beec40f5b5e7034d" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "wqNxl8_E_WEu" - }, - "source": [ - "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" - ] + "e6ca865f09b440f0b646f74b85a930c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SAulQjDX_WEu", - "outputId": "fef295e4-0b21-48fa-af0c-139579c50527" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------------+------+\n", - "| text|result|\n", - "+------------------+------+\n", - "|Te quiero. Te amo.| [POS]|\n", - "+------------------+------+\n", - "\n" - ] - } + "eac29afa1787461d80b19843a322b35b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_98ccf59935c148af9266163fa8e12f36", + "placeholder": "​", + "style": "IPY_MODEL_18cba1a7ebf54e389b0512644fd4eb8d", + "value": " 232k/232k [00:00<00:00, 952kB/s]" + } + }, + "ef611b4c4adf4cdfb9e118d93fbe346a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ce68fc9368cb4860a3c65fb33a3b9362", + "IPY_MODEL_a82f20d3bade4075a67de714763267d3", + "IPY_MODEL_2b7c0e9b54a04512be2914256c92a8b0" ], - "source": [ - "from sparknlp.base import *\n", - "from sparknlp.annotator import *\n", - "from pyspark.ml import Pipeline, PipelineModel\n", - "\n", - "document_assembler = DocumentAssembler() \\\n", - " .setInputCol(\"text\") \\\n", - " .setOutputCol(\"document\")\n", - "\n", - "tokenizer = Tokenizer().setInputCols(\"document\").setOutputCol(\"token\")\n", - "\n", - "pipeline = Pipeline(stages=[\n", - " document_assembler,\n", - " tokenizer,\n", - " zero_shot_classifier_loaded\n", - "])\n", - "\n", - "text = [[\"I have a problem with my iphone that needs to be resolved asap!!\"],\n", - " [\"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\"],\n", - " [\"I have a phone and I love it!\"],\n", - " [\"I really want to visit Germany and I am planning to go there next year.\"],\n", - " [\"Let's watch some movies tonight! I am in the mood for a horror movie.\"],\n", - " [\"Have you watched the match yesterday? It was a great game!\"],\n", - " [\"We need to harry up and get to the airport. We are going to miss our flight!\"]]\n", - "\n", - "# create a DataFrame in PySpark\n", - "inputDataset = spark.createDataFrame(text, [\"text\"])\n", - "model = pipeline.fit(inputDataset)\n", - "model.transform(inputDataset).select(\"class.result\").show()" - ] + "layout": "IPY_MODEL_f96f6adfbec643c5aed81d1d3df80daa" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "JzEwOk48_WEu" - }, - "source": [ - "That's it! You can now go wild and use hundreds of `BertForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4" + "f96f6adfbec643c5aed81d1d3df80daa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "f9a44a68c55546a6bd1425f649e84fce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_31e8635f8ed543fe96980f24c1977435", + "placeholder": "​", + "style": "IPY_MODEL_bd84a229a7e2423e9afff844346027a6", + "value": "model.safetensors: 100%" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "fb27d436ed9b41b3813afdabc7e7168e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cc1743bea22342ff9f81223b231ac387", + "max": 639, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4d4920f93b2c4f1c96670ac9396fb244", + "value": 639 + } }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "dc2010ab9c8b4ce2a61fd2f5bf584466": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_557b68d5c6bd47c586571bb6d03fb75b", - "IPY_MODEL_fb27d436ed9b41b3813afdabc7e7168e", - "IPY_MODEL_608a47e426c44acea7e60f8b386e0677" - ], - "layout": "IPY_MODEL_d046f93abb3945c9beec40f5b5e7034d" - } - }, - "557b68d5c6bd47c586571bb6d03fb75b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2650006b821e493abfb0005f680b4bbb", - "placeholder": "​", - "style": "IPY_MODEL_5ee1cbef2ffe4cc2b1af445726a25871", - "value": "config.json: 100%" - } - }, - "fb27d436ed9b41b3813afdabc7e7168e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_cc1743bea22342ff9f81223b231ac387", - "max": 639, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4d4920f93b2c4f1c96670ac9396fb244", - "value": 639 - } - }, - "608a47e426c44acea7e60f8b386e0677": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a273ab182aab40e79a1f3b8517452322", - "placeholder": "​", - "style": "IPY_MODEL_31926af1bcb341f1805e31c9f6a105a6", - "value": " 639/639 [00:00<00:00, 33.1kB/s]" - } - }, - "d046f93abb3945c9beec40f5b5e7034d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2650006b821e493abfb0005f680b4bbb": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5ee1cbef2ffe4cc2b1af445726a25871": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "cc1743bea22342ff9f81223b231ac387": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4d4920f93b2c4f1c96670ac9396fb244": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "a273ab182aab40e79a1f3b8517452322": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "31926af1bcb341f1805e31c9f6a105a6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3a713cbb11ee47ca8a1c5e09c5e2134c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_f9a44a68c55546a6bd1425f649e84fce", - "IPY_MODEL_a65ebf8907284391bd098d8906997a65", - "IPY_MODEL_5a00dcdd1b3840538bbdffe4c9dfccac" - ], - "layout": "IPY_MODEL_fea490f5d6ce4b0cb044aed1fcac40c6" - } - }, - "f9a44a68c55546a6bd1425f649e84fce": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_31e8635f8ed543fe96980f24c1977435", - "placeholder": "​", - "style": "IPY_MODEL_bd84a229a7e2423e9afff844346027a6", - "value": "model.safetensors: 100%" - } - }, - "a65ebf8907284391bd098d8906997a65": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b637bec487a949dbad7283eedc25cf51", - "max": 437961724, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_081a21e0131647a4a934b80ce35f99fe", - "value": 437961724 - } - }, - "5a00dcdd1b3840538bbdffe4c9dfccac": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8d554fc5ff7e4a149fccda1e5d197326", - "placeholder": "​", - "style": "IPY_MODEL_a581cd1016fb4d9083039b55162d03c8", - "value": " 438M/438M [00:27<00:00, 17.0MB/s]" - } - }, - "fea490f5d6ce4b0cb044aed1fcac40c6": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "31e8635f8ed543fe96980f24c1977435": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bd84a229a7e2423e9afff844346027a6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b637bec487a949dbad7283eedc25cf51": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "081a21e0131647a4a934b80ce35f99fe": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "8d554fc5ff7e4a149fccda1e5d197326": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a581cd1016fb4d9083039b55162d03c8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "46ce6bbe733144b493460694864e042f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_892540858af848e2989805c7facf9eaa", - "IPY_MODEL_8bc01972359142c6b20c98fee0608c26", - "IPY_MODEL_9fb30845b7e245829867afeaac10e8a1" - ], - "layout": "IPY_MODEL_a04cbc3089c24b8d9056ed56dcb383fe" - } - }, - "892540858af848e2989805c7facf9eaa": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_698707314ce945198dbf494154d0d3d9", - "placeholder": "​", - "style": "IPY_MODEL_76891611ea974bed8ce9825197d6ede6", - "value": "tokenizer_config.json: 100%" - } - }, - "8bc01972359142c6b20c98fee0608c26": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4a79b68ed33047edb71fcc08d4e098ad", - "max": 48, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_110a9c4ba8144b23bec988991c50ea69", - "value": 48 - } - }, - "9fb30845b7e245829867afeaac10e8a1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1b35a6c060fb45bb8a7c50e4e58b3e70", - "placeholder": "​", - "style": "IPY_MODEL_7cfb8103d0914f8db95a98d0298241c1", - "value": " 48.0/48.0 [00:00<00:00, 3.08kB/s]" - } - }, - "a04cbc3089c24b8d9056ed56dcb383fe": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "698707314ce945198dbf494154d0d3d9": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "76891611ea974bed8ce9825197d6ede6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4a79b68ed33047edb71fcc08d4e098ad": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "110a9c4ba8144b23bec988991c50ea69": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "1b35a6c060fb45bb8a7c50e4e58b3e70": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7cfb8103d0914f8db95a98d0298241c1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4bbc0cfc3e194e58b688d1ddad08e355": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_05e2cc6a1b8d445b9f4c75b8eafc8569", - "IPY_MODEL_2f9b2816622b4ea78d333b28d2e1d528", - "IPY_MODEL_eac29afa1787461d80b19843a322b35b" - ], - "layout": "IPY_MODEL_600ef6e0875047feb0edeb29f1e2cec4" - } - }, - "05e2cc6a1b8d445b9f4c75b8eafc8569": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9ff55de49cfb4c5da242f79c29d765c0", - "placeholder": "​", - "style": "IPY_MODEL_2ca6b6198d1247a086ad6c77ef98745d", - "value": "vocab.txt: 100%" - } - }, - "2f9b2816622b4ea78d333b28d2e1d528": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5c9ee9d2bfff4dd9bf111523b15f28b8", - "max": 231508, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4fc274d64c994e19851775ff1f5b7bfd", - "value": 231508 - } - }, - "eac29afa1787461d80b19843a322b35b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_98ccf59935c148af9266163fa8e12f36", - "placeholder": "​", - "style": "IPY_MODEL_18cba1a7ebf54e389b0512644fd4eb8d", - "value": " 232k/232k [00:00<00:00, 952kB/s]" - } - }, - "600ef6e0875047feb0edeb29f1e2cec4": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9ff55de49cfb4c5da242f79c29d765c0": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2ca6b6198d1247a086ad6c77ef98745d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5c9ee9d2bfff4dd9bf111523b15f28b8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4fc274d64c994e19851775ff1f5b7bfd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "98ccf59935c148af9266163fa8e12f36": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "18cba1a7ebf54e389b0512644fd4eb8d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ef611b4c4adf4cdfb9e118d93fbe346a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_ce68fc9368cb4860a3c65fb33a3b9362", - "IPY_MODEL_a82f20d3bade4075a67de714763267d3", - "IPY_MODEL_2b7c0e9b54a04512be2914256c92a8b0" - ], - "layout": "IPY_MODEL_f96f6adfbec643c5aed81d1d3df80daa" - } - }, - "ce68fc9368cb4860a3c65fb33a3b9362": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2e6d65b7f9e943c294b604769f714bdb", - "placeholder": "​", - "style": "IPY_MODEL_e6ca865f09b440f0b646f74b85a930c2", - "value": "special_tokens_map.json: 100%" - } - }, - "a82f20d3bade4075a67de714763267d3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7c20f9b4aa514244b413026b484773be", - "max": 112, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_8c3c8bc1c60c4be1b2ce6eb91fb3d80c", - "value": 112 - } - }, - "2b7c0e9b54a04512be2914256c92a8b0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_374ab98ad46c4518b0800c58d12702b5", - "placeholder": "​", - "style": "IPY_MODEL_fc725112860647558d1c3614ada8be40", - "value": " 112/112 [00:00<00:00, 7.23kB/s]" - } - }, - "f96f6adfbec643c5aed81d1d3df80daa": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2e6d65b7f9e943c294b604769f714bdb": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e6ca865f09b440f0b646f74b85a930c2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7c20f9b4aa514244b413026b484773be": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8c3c8bc1c60c4be1b2ce6eb91fb3d80c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "374ab98ad46c4518b0800c58d12702b5": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fc725112860647558d1c3614ada8be40": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } + "fc725112860647558d1c3614ada8be40": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "accelerator": "GPU" - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "fea490f5d6ce4b0cb044aed1fcac40c6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertSentenceEmbeddings.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertSentenceEmbeddings.ipynb index ce8caba7472d61..6716957245597a 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertSentenceEmbeddings.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertSentenceEmbeddings.ipynb @@ -414,9 +414,8 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python [conda env:sparknlp_dev]", - "language": "python", - "name": "conda-env-sparknlp_dev-py" + "display_name": "Python 3", + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CLIP.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CLIP.ipynb index 4310d452eba8af..f9d70656a7f95c 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CLIP.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CLIP.ipynb @@ -88,8 +88,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - ], + "outputs": [], "source": [ "! optimum-cli export onnx --model {MODEL_NAME} {EXPORT_PATH}" ] @@ -314,8 +313,8 @@ ], "source": [ "from PIL import Image\n", - "\n", - "Image.open(\"../../../../src/test/resources/image/egyptian_cat.jpeg\")" + "!wget https://github.com/JohnSnowLabs/spark-nlp/raw/master/src/test/resources/image/egyptian_cat.jpeg\n", + "Image.open(\"egyptian_cat.jpeg\")" ] }, { @@ -352,7 +351,7 @@ "imageDF = spark.read \\\n", " .format(\"image\") \\\n", " .option(\"dropInvalid\", value = True) \\\n", - " .load(\"../../../../src/test/resources/image/egyptian_cat.jpeg\")\n", + " .load(\"egyptian_cat.jpeg\")\n", "\n", "imageAssembler = ImageAssembler() \\\n", " .setInputCol(\"image\") \\\n", @@ -387,10 +386,10 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" -}, + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBERT.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBERT.ipynb index 5c04b3e61de50d..acf34540b2246a 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBERT.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBERT.ipynb @@ -1,2422 +1,2285 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBERT.ipynb)\n", + "\n", + "# Import ONNX CamemBERT models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for CamemBERT from HuggingFace and they have to be in `Fill Mask` category. Meaning, you cannot use CamemBERT models trained/fine-tuned on a specific task such as token/sequence classification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "7Ba7C4s3YxPm" - }, - "source": [ - "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBert.ipynb)\n", - "\n", - "# Import ONNX CamemBERT models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", - "- You can import models for CamemBERT from HuggingFace and they have to be in `Fill Mask` category. Meaning, you cannot use CamemBERT models trained/fine-tuned on a specific task such as token/sequence classification." - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m407.1/407.1 kB\u001b[0m \u001b[31m24.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m60.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.8/455.8 kB\u001b[0m \u001b[31m38.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m69.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m536.7/536.7 kB\u001b[0m \u001b[31m40.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m84.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m112.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m107.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m98.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m103.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m110.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m110.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m107.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m85.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m105.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m102.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m100.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m108.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m92.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m23.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m27.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m106.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m47.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m94.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m101.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m84.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m77.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m102.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m41.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m85.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m110.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m35.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m88.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m61.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m69.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m75.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.1/17.1 MB\u001b[0m \u001b[31m82.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m84.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m67.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m108.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m45.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m95.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m10.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m39.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m30.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m27.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m67.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m82.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m69.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m34.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m83.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [camembert-base](https://huggingface.co/camembert-base) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n", + "- In addition to the CamemBERT model, we also need to save the `CamembertTokenizer`. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "GksUloJ0YxPo" - }, - "source": [ - "## Export and Save HuggingFace model" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "vrxtF-pPYxPp" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "566c95aa32904d73923ed7e93d99d373", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", - "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + "text/plain": [ + "config.json: 0%| | 0.00/508 [00:00=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", - "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Framework not specified. Using pt to export the model.\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "RF5us7X6YxPr" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "20bb65ce09c440658155d6f1f2590a09", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", - "- We'll use [camembert-base](https://huggingface.co/camembert-base) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n", - "- In addition to the CamemBERT model, we also need to save the `CamembertTokenizer`. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + "text/plain": [ + "model.safetensors: 0%| | 0.00/445M [00:00 False\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", - "\n", - "MODEL_NAME = \"camembert-base\"\n", - "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(EXPORT_PATH)\n", - "\n", - "# Create directory for assets and move the tokenizer files.\n", - "# A separate folder is needed for Spark NLP.\n", - "!mkdir {EXPORT_PATH}/assets\n", - "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets/" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']\n", + "- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "oQqUoZMPYxPt" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4a8d4f7ef796473a91c5bb1f7b0db38c", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "Let's have a look inside these two directories and see what we are dealing with:" + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/25.0 [00:00 False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", + "\n", + "MODEL_NAME = \"camembert-base\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets\n", + "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "dYGONeQ_YxPu", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "a45cbf0b-f8c1-44e1-c5a3-b45ba6d8b110" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Installing PySpark 3.2.3 and Spark NLP 5.3.0\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 5.3.0\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m564.8/564.8 kB\u001b[0m \u001b[31m49.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m26.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } - ], - "source": [ - "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 432436\n", + "drwxr-xr-x 2 root root 4096 Mar 1 01:03 assets\n", + "-rw-r--r-- 1 root root 673 Mar 1 01:03 config.json\n", + "-rw-r--r-- 1 root root 440372299 Mar 1 01:03 model.onnx\n", + "-rw-r--r-- 1 root root 354 Mar 1 01:03 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 491 Mar 1 01:03 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2418800 Mar 1 01:03 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "AlqjcksvYxPv" - }, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 792\n", + "-rw-r--r-- 1 root root 810912 Mar 1 01:03 sentencepiece.bpe.model\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save CamemBERT in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "XyGjm38dYxPv" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.3.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.3.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m564.8/564.8 kB\u001b[0m \u001b[31m49.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m26.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `CamemBertEmbeddings` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `CamemBertEmbeddings` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- `setStorageRef` is very important. When you are training a task like NER or any Text Classification, we use this reference to bound the trained model to this specific embeddings so you won't load a different embeddings by mistake and see terrible results 😊\n", + "- It's up to you what you put in `setStorageRef` but it cannot be changed later on. We usually use the name of the model to be clear, but you can get creative if you want!\n", + "- The `dimension` param is is purely cosmetic and won't change anything. It's mostly for you to know later via `.getDimension` what is the dimension of your model. So set this accordingly.\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "camembert = CamemBertEmbeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"camembert\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setDimension(768)\\\n", + " .setStorageRef('camembert_base')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "camembert.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX CamemBERT model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "xVh0x6TiYxPv" - }, - "source": [ - "- Let's use `loadSavedModel` functon in `CamemBertEmbeddings` which allows us to load the ONNX model\n", - "- Most params will be set automatically. They can also be set later after loading the model in `CamemBertEmbeddings` during runtime, so don't worry about setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- `setStorageRef` is very important. When you are training a task like NER or any Text Classification, we use this reference to bound the trained model to this specific embeddings so you won't load a different embeddings by mistake and see terrible results 😊\n", - "- It's up to you what you put in `setStorageRef` but it cannot be changed later on. We usually use the name of the model to be clear, but you can get creative if you want!\n", - "- The `dimension` param is is purely cosmetic and won't change anything. It's mostly for you to know later via `.getDimension` what is the dimension of your model. So set this accordingly.\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 430920\n", + "-rw-r--r-- 1 root root 440439641 Mar 1 01:05 camembert_onnx\n", + "-rw-r--r-- 1 root root 810912 Mar 1 01:05 camembert_spp\n", + "drwxr-xr-x 2 root root 4096 Mar 1 01:05 metadata\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBERT model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "tokenizer = Tokenizer()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"token\")\n", + "\n", + "camembert_loaded = CamemBertEmbeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"camembert\")\\\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " document_assembler,\n", + " tokenizer,\n", + " camembert_loaded\n", + " ])\n", + "\n", + "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", + "model = pipeline.fit(data)\n", + "result = model.transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "NY5PvPq9YxPv" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "\n", - "# All these params should be identical to the original ONNX model\n", - "camembert = CamemBertEmbeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"camembert\")\\\n", - " .setCaseSensitive(True)\\\n", - " .setDimension(768)\\\n", - " .setStorageRef('camembert_base')" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| embeddings|\n", + "+--------------------+\n", + "|[-0.049330253, 0....|\n", + "|[0.003116008, 0.1...|\n", + "|[-0.021312904, -0...|\n", + "|[0.046165787, 0.0...|\n", + "|[0.09459148, 0.07...|\n", + "|[0.071022525, 0.2...|\n", + "|[0.08610784, -0.3...|\n", + "|[0.20012067, 0.49...|\n", + "|[0.10958594, -0.0...|\n", + "|[0.19859709, 0.09...|\n", + "|[0.09361851, 0.21...|\n", + "|[0.12071304, 0.41...|\n", + "|[0.12088075, 0.41...|\n", + "|[0.034318373, -0....|\n", + "|[0.02465238, 0.16...|\n", + "|[-0.019737713, 0....|\n", + "|[0.08724952, -0.0...|\n", + "|[-0.02866838, 0.2...|\n", + "|[-0.047727797, 0....|\n", + "|[0.07970655, -0.0...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "result.selectExpr(\"explode(camembert.embeddings) as embeddings\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of CamemBERT models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0ad83b11b2f94abb84c84a7029356683": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "QzK06tJbYxPw" - }, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] + "0b8eb9f25faa4993802ed33f05eaf91a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e1c692cb85b64ae6999e2febcb92e556", + "placeholder": "​", + "style": "IPY_MODEL_932203a42244430f903ef45fc3d70f76", + "value": " 811k/811k [00:00<00:00, 4.83MB/s]" + } }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "TkLhBJTaYxPw" - }, - "outputs": [], - "source": [ - "camembert.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" - ] + "0f08885250af4f419b14dab6df6932f7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "olWYb2EoYxPw" - }, - "source": [ - "Let's clean up stuff we don't need anymore" - ] + "11907ede30534508a2793241e515a818": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "kVUjfeKoYxPw" - }, - "outputs": [], - "source": [ - "!rm -rf {EXPORT_PATH}" - ] + "1696e83714ed4c5ca3231eabad56665a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "1-Yh1zoJYxPw" - }, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your ONNX CamemBERT model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] + "18b50db31cdf49b1ab28e045e71ed1d8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "qLlgnpPiYxPw", - "outputId": "74867d22-b90b-406e-8f5a-294de2a6063c", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "total 430920\n", - "-rw-r--r-- 1 root root 440439641 Mar 1 01:05 camembert_onnx\n", - "-rw-r--r-- 1 root root 810912 Mar 1 01:05 camembert_spp\n", - "drwxr-xr-x 2 root root 4096 Mar 1 01:05 metadata\n" - ] - } + "19403be31a184b3d96a4e92c2c00140e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1991b5c440194f9a8729a01d382af5c3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "1c02b6ac167543109981ef1908a2ce8f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "20bb65ce09c440658155d6f1f2590a09": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c2d1126afc674192a56f52668380ab94", + "IPY_MODEL_3ca16338e2c14772a57d0c1e5de2668f", + "IPY_MODEL_ebc68a01a2354c55953779f328523c1b" ], - "source": [ - "! ls -l {MODEL_NAME}_spark_nlp" - ] + "layout": "IPY_MODEL_e7a68b8fdfbd41999a3cec53c187f1ef" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "aJCkVxTxYxPx" - }, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBERT model 😊" - ] + "247ea5e1983b4c3885408e364a3ccdc2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "6DQmw6ySYxPx" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "\n", - "from sparknlp.base import *\n", - "from sparknlp.annotator import *\n", - "\n", - "document_assembler = DocumentAssembler()\\\n", - " .setInputCol(\"text\")\\\n", - " .setOutputCol(\"document\")\n", - "\n", - "tokenizer = Tokenizer()\\\n", - " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"token\")\n", - "\n", - "camembert_loaded = CamemBertEmbeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"camembert\")\\\n", - "\n", - "pipeline = Pipeline(\n", - " stages = [\n", - " document_assembler,\n", - " tokenizer,\n", - " camembert_loaded\n", - " ])\n", - "\n", - "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", - "model = pipeline.fit(data)\n", - "result = model.transform(data)" - ] + "26cba613ef664b97a445b206887f231b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0ad83b11b2f94abb84c84a7029356683", + "max": 810912, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_11907ede30534508a2793241e515a818", + "value": 810912 + } }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "X7cKqpDCYxPx", - "outputId": "54e87d8e-0ab4-4e7b-be30-e947f1e98992", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+--------------------+\n", - "| embeddings|\n", - "+--------------------+\n", - "|[-0.049330253, 0....|\n", - "|[0.003116008, 0.1...|\n", - "|[-0.021312904, -0...|\n", - "|[0.046165787, 0.0...|\n", - "|[0.09459148, 0.07...|\n", - "|[0.071022525, 0.2...|\n", - "|[0.08610784, -0.3...|\n", - "|[0.20012067, 0.49...|\n", - "|[0.10958594, -0.0...|\n", - "|[0.19859709, 0.09...|\n", - "|[0.09361851, 0.21...|\n", - "|[0.12071304, 0.41...|\n", - "|[0.12088075, 0.41...|\n", - "|[0.034318373, -0....|\n", - "|[0.02465238, 0.16...|\n", - "|[-0.019737713, 0....|\n", - "|[0.08724952, -0.0...|\n", - "|[-0.02866838, 0.2...|\n", - "|[-0.047727797, 0....|\n", - "|[0.07970655, -0.0...|\n", - "+--------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } + "2c16c911327041379eae3410825174ce": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f9b57a9d24848b4ac7d3d254a445251": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_be26390927f246348533f0704c18fe60", + "IPY_MODEL_47bd91449e67415cbbbca16e9f9de31f", + "IPY_MODEL_b57f460a026541c28ab83c06c0d047f8" ], - "source": [ - "result.selectExpr(\"explode(camembert.embeddings) as embeddings\").show()" - ] + "layout": "IPY_MODEL_ed1cc1cce2d84c10babf49a9239028f5" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "0HO06arnYxPx" - }, - "source": [ - "That's it! You can now go wild and use hundreds of CamemBERT models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4" + "377ca31e8f0c4cfe8d027e73f1908612": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7f4455090e7746f5b4be7b3ac8b3b589", + "max": 508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1696e83714ed4c5ca3231eabad56665a", + "value": 508 + } }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "3ca16338e2c14772a57d0c1e5de2668f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7984e7fac83e4001815c440c7b7919ad", + "max": 445008750, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1991b5c440194f9a8729a01d382af5c3", + "value": 445008750 + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" + "3d0b2b4f4dce4697811f861bc0ea2df1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "453d1e3c486f4b9aa7e2d9acecf4176b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ed985a8db8234b90917695713da4a13a", + "placeholder": "​", + "style": "IPY_MODEL_8816bd1505374989a4b8135ead3b5abf", + "value": "config.json: 100%" + } + }, + "477fd05f4c8f49fdb4002d47582f6313": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d78a70686e9348aa9c8a654c352915f4", + "IPY_MODEL_26cba613ef664b97a445b206887f231b", + "IPY_MODEL_0b8eb9f25faa4993802ed33f05eaf91a" + ], + "layout": "IPY_MODEL_97a5876e545b4c24b7498683bb5876b5" + } + }, + "47bd91449e67415cbbbca16e9f9de31f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_247ea5e1983b4c3885408e364a3ccdc2", + "max": 1395301, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_dddfc0e2eadc494aa0412e67f894730b", + "value": 1395301 + } + }, + "4a8d4f7ef796473a91c5bb1f7b0db38c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_bf6c1dbba58b41448a3350c8967fabfc", + "IPY_MODEL_8d52a81376024e90a0d4bfef55285ca9", + "IPY_MODEL_8c8a79b44687450f95884ea9a809d6e4" + ], + "layout": "IPY_MODEL_cd091521a65c47d19bd40d45b1bf4895" + } }, - "accelerator": "GPU", - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "566c95aa32904d73923ed7e93d99d373": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_453d1e3c486f4b9aa7e2d9acecf4176b", - "IPY_MODEL_377ca31e8f0c4cfe8d027e73f1908612", - "IPY_MODEL_a1f8086b1b4d40cbac31618adffb1b11" - ], - "layout": "IPY_MODEL_2c16c911327041379eae3410825174ce" - } - }, - "453d1e3c486f4b9aa7e2d9acecf4176b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ed985a8db8234b90917695713da4a13a", - "placeholder": "​", - "style": "IPY_MODEL_8816bd1505374989a4b8135ead3b5abf", - "value": "config.json: 100%" - } - }, - "377ca31e8f0c4cfe8d027e73f1908612": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7f4455090e7746f5b4be7b3ac8b3b589", - "max": 508, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1696e83714ed4c5ca3231eabad56665a", - "value": 508 - } - }, - "a1f8086b1b4d40cbac31618adffb1b11": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d95e1ea826cd4edbac988692aba618b1", - "placeholder": "​", - "style": "IPY_MODEL_6c33e478484d4866b6b4d44bc1af9c3b", - "value": " 508/508 [00:00<00:00, 34.9kB/s]" - } - }, - "2c16c911327041379eae3410825174ce": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ed985a8db8234b90917695713da4a13a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8816bd1505374989a4b8135ead3b5abf": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7f4455090e7746f5b4be7b3ac8b3b589": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1696e83714ed4c5ca3231eabad56665a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "d95e1ea826cd4edbac988692aba618b1": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6c33e478484d4866b6b4d44bc1af9c3b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "20bb65ce09c440658155d6f1f2590a09": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c2d1126afc674192a56f52668380ab94", - "IPY_MODEL_3ca16338e2c14772a57d0c1e5de2668f", - "IPY_MODEL_ebc68a01a2354c55953779f328523c1b" - ], - "layout": "IPY_MODEL_e7a68b8fdfbd41999a3cec53c187f1ef" - } - }, - "c2d1126afc674192a56f52668380ab94": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9d84925c75e546a683a580e58d8e763f", - "placeholder": "​", - "style": "IPY_MODEL_3d0b2b4f4dce4697811f861bc0ea2df1", - "value": "model.safetensors: 100%" - } - }, - "3ca16338e2c14772a57d0c1e5de2668f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7984e7fac83e4001815c440c7b7919ad", - "max": 445008750, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1991b5c440194f9a8729a01d382af5c3", - "value": 445008750 - } - }, - "ebc68a01a2354c55953779f328523c1b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_85ede09e39074fd3a9bf08c6bc68b753", - "placeholder": "​", - "style": "IPY_MODEL_d676d382842d4784add17dfca06348a2", - "value": " 445M/445M [00:02<00:00, 237MB/s]" - } - }, - "e7a68b8fdfbd41999a3cec53c187f1ef": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9d84925c75e546a683a580e58d8e763f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3d0b2b4f4dce4697811f861bc0ea2df1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7984e7fac83e4001815c440c7b7919ad": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1991b5c440194f9a8729a01d382af5c3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "85ede09e39074fd3a9bf08c6bc68b753": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d676d382842d4784add17dfca06348a2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4a8d4f7ef796473a91c5bb1f7b0db38c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_bf6c1dbba58b41448a3350c8967fabfc", - "IPY_MODEL_8d52a81376024e90a0d4bfef55285ca9", - "IPY_MODEL_8c8a79b44687450f95884ea9a809d6e4" - ], - "layout": "IPY_MODEL_cd091521a65c47d19bd40d45b1bf4895" - } - }, - "bf6c1dbba58b41448a3350c8967fabfc": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1c02b6ac167543109981ef1908a2ce8f", - "placeholder": "​", - "style": "IPY_MODEL_6dc758d8cb124f96873fac98d745a2f2", - "value": "tokenizer_config.json: 100%" - } - }, - "8d52a81376024e90a0d4bfef55285ca9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_18b50db31cdf49b1ab28e045e71ed1d8", - "max": 25, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_d0ce9537c09b4321a08d71164049e753", - "value": 25 - } - }, - "8c8a79b44687450f95884ea9a809d6e4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_6fc8723befd549b6aabe9a84da74b04a", - "placeholder": "​", - "style": "IPY_MODEL_ab012f044d614acba8d1b2330ecae31d", - "value": " 25.0/25.0 [00:00<00:00, 127B/s]" - } - }, - "cd091521a65c47d19bd40d45b1bf4895": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1c02b6ac167543109981ef1908a2ce8f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6dc758d8cb124f96873fac98d745a2f2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "18b50db31cdf49b1ab28e045e71ed1d8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d0ce9537c09b4321a08d71164049e753": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "6fc8723befd549b6aabe9a84da74b04a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ab012f044d614acba8d1b2330ecae31d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "477fd05f4c8f49fdb4002d47582f6313": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_d78a70686e9348aa9c8a654c352915f4", - "IPY_MODEL_26cba613ef664b97a445b206887f231b", - "IPY_MODEL_0b8eb9f25faa4993802ed33f05eaf91a" - ], - "layout": "IPY_MODEL_97a5876e545b4c24b7498683bb5876b5" - } - }, - "d78a70686e9348aa9c8a654c352915f4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_69610c818dfa442c86f9af8a04e6019f", - "placeholder": "​", - "style": "IPY_MODEL_c6ecfecbc45144c385ce89057e86e627", - "value": "sentencepiece.bpe.model: 100%" - } - }, - "26cba613ef664b97a445b206887f231b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0ad83b11b2f94abb84c84a7029356683", - "max": 810912, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_11907ede30534508a2793241e515a818", - "value": 810912 - } - }, - "0b8eb9f25faa4993802ed33f05eaf91a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e1c692cb85b64ae6999e2febcb92e556", - "placeholder": "​", - "style": "IPY_MODEL_932203a42244430f903ef45fc3d70f76", - "value": " 811k/811k [00:00<00:00, 4.83MB/s]" - } - }, - "97a5876e545b4c24b7498683bb5876b5": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "69610c818dfa442c86f9af8a04e6019f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c6ecfecbc45144c385ce89057e86e627": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0ad83b11b2f94abb84c84a7029356683": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "11907ede30534508a2793241e515a818": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "e1c692cb85b64ae6999e2febcb92e556": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "932203a42244430f903ef45fc3d70f76": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "2f9b57a9d24848b4ac7d3d254a445251": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_be26390927f246348533f0704c18fe60", - "IPY_MODEL_47bd91449e67415cbbbca16e9f9de31f", - "IPY_MODEL_b57f460a026541c28ab83c06c0d047f8" - ], - "layout": "IPY_MODEL_ed1cc1cce2d84c10babf49a9239028f5" - } - }, - "be26390927f246348533f0704c18fe60": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7ce7766a45494859a5cdcb668994c3f6", - "placeholder": "​", - "style": "IPY_MODEL_879960f4c4e34d07928275c812674dd6", - "value": "tokenizer.json: 100%" - } - }, - "47bd91449e67415cbbbca16e9f9de31f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_247ea5e1983b4c3885408e364a3ccdc2", - "max": 1395301, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_dddfc0e2eadc494aa0412e67f894730b", - "value": 1395301 - } - }, - "b57f460a026541c28ab83c06c0d047f8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_19403be31a184b3d96a4e92c2c00140e", - "placeholder": "​", - "style": "IPY_MODEL_0f08885250af4f419b14dab6df6932f7", - "value": " 1.40M/1.40M [00:00<00:00, 5.08MB/s]" - } - }, - "ed1cc1cce2d84c10babf49a9239028f5": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7ce7766a45494859a5cdcb668994c3f6": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "879960f4c4e34d07928275c812674dd6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "247ea5e1983b4c3885408e364a3ccdc2": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "dddfc0e2eadc494aa0412e67f894730b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "19403be31a184b3d96a4e92c2c00140e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0f08885250af4f419b14dab6df6932f7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } + "566c95aa32904d73923ed7e93d99d373": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_453d1e3c486f4b9aa7e2d9acecf4176b", + "IPY_MODEL_377ca31e8f0c4cfe8d027e73f1908612", + "IPY_MODEL_a1f8086b1b4d40cbac31618adffb1b11" + ], + "layout": "IPY_MODEL_2c16c911327041379eae3410825174ce" + } + }, + "69610c818dfa442c86f9af8a04e6019f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6c33e478484d4866b6b4d44bc1af9c3b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6dc758d8cb124f96873fac98d745a2f2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6fc8723befd549b6aabe9a84da74b04a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7984e7fac83e4001815c440c7b7919ad": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7ce7766a45494859a5cdcb668994c3f6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7f4455090e7746f5b4be7b3ac8b3b589": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "85ede09e39074fd3a9bf08c6bc68b753": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "879960f4c4e34d07928275c812674dd6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8816bd1505374989a4b8135ead3b5abf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8c8a79b44687450f95884ea9a809d6e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6fc8723befd549b6aabe9a84da74b04a", + "placeholder": "​", + "style": "IPY_MODEL_ab012f044d614acba8d1b2330ecae31d", + "value": " 25.0/25.0 [00:00<00:00, 127B/s]" + } + }, + "8d52a81376024e90a0d4bfef55285ca9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_18b50db31cdf49b1ab28e045e71ed1d8", + "max": 25, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d0ce9537c09b4321a08d71164049e753", + "value": 25 + } + }, + "932203a42244430f903ef45fc3d70f76": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "97a5876e545b4c24b7498683bb5876b5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9d84925c75e546a683a580e58d8e763f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a1f8086b1b4d40cbac31618adffb1b11": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d95e1ea826cd4edbac988692aba618b1", + "placeholder": "​", + "style": "IPY_MODEL_6c33e478484d4866b6b4d44bc1af9c3b", + "value": " 508/508 [00:00<00:00, 34.9kB/s]" + } + }, + "ab012f044d614acba8d1b2330ecae31d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b57f460a026541c28ab83c06c0d047f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_19403be31a184b3d96a4e92c2c00140e", + "placeholder": "​", + "style": "IPY_MODEL_0f08885250af4f419b14dab6df6932f7", + "value": " 1.40M/1.40M [00:00<00:00, 5.08MB/s]" + } + }, + "be26390927f246348533f0704c18fe60": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ce7766a45494859a5cdcb668994c3f6", + "placeholder": "​", + "style": "IPY_MODEL_879960f4c4e34d07928275c812674dd6", + "value": "tokenizer.json: 100%" + } + }, + "bf6c1dbba58b41448a3350c8967fabfc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1c02b6ac167543109981ef1908a2ce8f", + "placeholder": "​", + "style": "IPY_MODEL_6dc758d8cb124f96873fac98d745a2f2", + "value": "tokenizer_config.json: 100%" + } + }, + "c2d1126afc674192a56f52668380ab94": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9d84925c75e546a683a580e58d8e763f", + "placeholder": "​", + "style": "IPY_MODEL_3d0b2b4f4dce4697811f861bc0ea2df1", + "value": "model.safetensors: 100%" + } + }, + "c6ecfecbc45144c385ce89057e86e627": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cd091521a65c47d19bd40d45b1bf4895": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0ce9537c09b4321a08d71164049e753": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d676d382842d4784add17dfca06348a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d78a70686e9348aa9c8a654c352915f4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_69610c818dfa442c86f9af8a04e6019f", + "placeholder": "​", + "style": "IPY_MODEL_c6ecfecbc45144c385ce89057e86e627", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "d95e1ea826cd4edbac988692aba618b1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dddfc0e2eadc494aa0412e67f894730b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e1c692cb85b64ae6999e2febcb92e556": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e7a68b8fdfbd41999a3cec53c187f1ef": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ebc68a01a2354c55953779f328523c1b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_85ede09e39074fd3a9bf08c6bc68b753", + "placeholder": "​", + "style": "IPY_MODEL_d676d382842d4784add17dfca06348a2", + "value": " 445M/445M [00:02<00:00, 237MB/s]" + } + }, + "ed1cc1cce2d84c10babf49a9239028f5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ed985a8db8234b90917695713da4a13a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb index 0aa530ff44ed50..68b32b12391e99 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "vfU3Ee88cwGj" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", "\n", @@ -13,9 +11,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "fM_4ix0mcwGm" - }, + "metadata": {}, "source": [ "## Import ONNX CamemBertForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀\n", "\n", @@ -30,34 +26,24 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "EVzmVKX8cwGn" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "WDSalCHsd9-z" - }, + "metadata": {}, "source": [ "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", - "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", "- CamembertTokenizer requires the `SentencePiece` library, so we install that as well" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qSx09sNyegma", - "outputId": "c12f53f3-970e-40b8-c092-e67153a21a7a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -130,14 +116,12 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece tensorflow" + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum sentencepiece tensorflow" ] }, { "cell_type": "markdown", - "metadata": { - "id": "uFkFe1YUewJR" - }, + "metadata": {}, "source": [ "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", "- We'll use [illuin/camembert-base-fquad](https://huggingface.co/illuin/camembert-base-fquad) model from HuggingFace as an example and load it as a `ORTModelForQuestionAnswering`, representing an ONNX model." @@ -145,72 +129,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 281, - "referenced_widgets": [ - "60824b807eb34823970f0afe669fab77", - "579cc0e3f0b74666b269eba1095fd54a", - "cc4728272f11453ca0c03d2c3840ac0d", - "2e012432b76d46bca6dc05df6e1f78d1", - "1164fd1351b741dead98629581dbceea", - "bf5e3cf99e994011aea18983e0794770", - "b1c76170057e409f811eb164f74b991e", - "2cd9b87a2a4b4562b29cea3938d7b60a", - "3fa71888d6a84f7998ea340e0984aea4", - "ea092b510afa465192742a0baf2ed85e", - "c5dbc3bb275e419f8209451e521efad6", - "b394db14253a4f0c8d5611698cf427b7", - "83bebd8cf8f04a63a58d89fe8a09954e", - "2466e7029af64b1c9bedd7ce35163899", - "674cfd9ff77d4147bd56978528448e90", - "0e2fa741d4744549be91d54239a056db", - "9e09160e16ec4b5fbf57af47ecf30c01", - "76261617f60340a58bd7db1ee6889b54", - "f376a02795b343ffb2f1045b414d8bdb", - "04c11c2902274294b9bdf935912013a0", - "e2caa4055a114b00ac0136a5e52cc517", - "8b94d0f7b9404cb5ac7d37dd6989d1fe", - "0fd98d8510ab4a05bdddac8727f9a618", - "93e4f9fb19a34c95a189871d94f8b845", - "b9957383a64b46b7bff6762bb21cda34", - "f473d9d2086742b38e24daf86d839827", - "14bf9182226d4663b917ee435b8014d2", - "c168bda2ae104316b404bfc532265780", - "0d42197f53404e98b4cc1536842e3e04", - "4d34bf4e3e334d65b908c6cfc0e3b6c2", - "077e7918ec344c60b3041714077f7e76", - "eda670714b574ccb81f1614edd771cc4", - "b0535795047546de90a690a7467a35b3", - "5186a7123c064af58f948eeca9249972", - "e3230d96b9e1458eb4abcad151da533b", - "47ea1c464c7f4e088025238f9af6d462", - "0595b9a3e25b4c97a204c8024da0bf28", - "1f1f591e62f94390908fb4660a97de0a", - "7363cd559e2947dbba473f13fbb2e282", - "78a2a618f96b4b27886a969ed8749bcc", - "895b5df6613f4d7db780ba93307bf6fd", - "92170245c9984fa78a4c0be174345aec", - "f9dd34dd913742a2ac469207964d40a8", - "eb1a55e3c53749db853fd8465634766c", - "7224240d590a460b9e7396e783eb17bb", - "6c33510efcf949b4bd0f0b7c68901da9", - "c891ff3fd87e4c1ba700c234e76f239d", - "dabe0a4be0204c989b582a531c56c439", - "c8e7e8365da44f8fbe15057040b8d6f9", - "0db6c31436ff403d8a44e197ac435b38", - "937af17f98d24828a022d92106e4d901", - "a410f3289fb046a6a4d33422892bedf1", - "94b04d2fba604d6f9d09001778242f48", - "6ec5e976fb0d473ebe21ed6ea312714a", - "3982f5c5417e47259d51fb9cfa2d3224" - ] - }, - "id": "FtWcH9nycwGq", - "outputId": "f4156389-8a42-4617-c3c7-1a729a42d5a3" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -316,23 +236,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "zwiHGrMzcwGr" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DNv4wNUjcwGs", - "outputId": "d3a4bd83-efd1-4bc2-c360-b6ed527f8cc8" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -354,19 +266,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "A729i3lKcwGt" - }, + "metadata": {}, "source": [ "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "YeAxCPRucwGs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!mkdir {ONNX_MODEL}/assets" @@ -374,10 +282,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "iGYjyVVscwGt" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" @@ -385,23 +291,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "jDnAwQNzcwGt" - }, + "metadata": {}, "source": [ "Voila! We have our `spiece.model` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "I0bHNH4DcwGu", - "outputId": "c504743a-c117-4cbe-9592-d693647b3cba" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -428,18 +326,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "9YqU2MCUcwGu" - }, + "metadata": {}, "source": [ "## Import and Save CamemBertForQuestionAnswering in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "DY8O4gmNcwGu" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -447,14 +341,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sf3trd1AcwGu", - "outputId": "f052434f-b296-4fc8-82b7-460eae529c2c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -476,23 +364,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "c20mfw2XcwGv" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hpyY43a1cwGv", - "outputId": "267e6909-8cea-4008-9acd-e8e05f41089f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -512,9 +392,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MphL0pwXcwGv" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `CamemBertForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `CamemBertForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -524,10 +402,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "EYtqFJdicwGv" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -545,19 +421,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "onlY3f49cwGw" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "r8acHJGIcwGw" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spanClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" @@ -565,19 +437,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "zDktkvATcwGw" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "QcB2MXA_cwGx" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {ONNX_MODEL}" @@ -585,9 +453,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Z1onLYQQcwGx" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -596,14 +462,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lBGyJYtdcwGy", - "outputId": "0d75b5aa-32cc-4751-bca7-29573c8143fc" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -622,23 +482,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "dnrCzOhBcwGy" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForQuestionAnswering model in Spark NLP 🚀 pipeline!" ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6pRIq0DycwGz", - "outputId": "35b44310-ee12-4c28-8b2c-c7d4af52ca72" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -670,7 +522,7 @@ "context = \"Mon nom est Wolfgang et je vis à Berlin\"\n", "question = \"Où est-ce que je vis?\"\n", "\n", - "example = spark.createDataFrame([[question, contexilluin/camembert-base-fquadt]]).toDF(\"question\", \"context\")\n", + "example = spark.createDataFrame([[question, context]]).toDF(\"question\", \"context\")\n", "result = pipeline.fit(example).transform(example)\n", "\n", "result.select(\"question\", \"answer.result\").show(truncate=False)" @@ -678,9 +530,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "tpTcy12BcwGz" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `CamemBertForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀\n" ] @@ -706,8 +556,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForSequenceClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForSequenceClassification.ipynb index aebc1ead26991d..4fb7a0de126606 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForSequenceClassification.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForSequenceClassification.ipynb @@ -2,20 +2,16 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "Kz2JVabcBeK6" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForSequenceClassification.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForSequenceClassification.ipynb)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "LGmftFTVBeK_" - }, + "metadata": {}, "source": [ "## Import ONNX CamemBertForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", "\n", @@ -30,34 +26,24 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "C6h2JYiqBeLB" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "VG7MbbfwBeLB" - }, + "metadata": {}, "source": [ "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", - "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", "- CamembertTokenizer requires the `SentencePiece` library, so we install that as well" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "O-v9Ox3RBeLC", - "outputId": "5ab09375-e283-4110-98b8-d315adeabaed" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -126,14 +112,12 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece tensorflow" + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum sentencepiece tensorflow" ] }, { "cell_type": "markdown", - "metadata": { - "id": "vX94VKVqDBys" - }, + "metadata": {}, "source": [ "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", "- We'll use [tblard/tf-allocine](https://huggingface.co/tblard/tf-allocine) model from HuggingFace as an example and load it as a `ORTModelForSequenceClassification`, representing an ONNX model." @@ -141,72 +125,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 368, - "referenced_widgets": [ - "1fc8a82b2cc049eebf6cf548c4d8bc31", - "d38d929335154784a2f64c4c9a867bde", - "6baf6b23f8b340fdb485a1822d3fbb17", - "74779f1b332e4d87b4e74f54a45b061d", - "f4ebd094b530442e8f801e86ac62e266", - "f943daa5bd4c4477bc0b96ae7266d0ef", - "19f767de7d6441a78112982a4b2e5689", - "f8b33f71a72c4cdeafbb6dcb56fada3d", - "7e1ba263493a42e5898c90dcfc76ea9a", - "bf75a87261e548c0ab3a13ba713fcd3f", - "312e8c30d9264f0799b251ca9b5ddaec", - "8f61c2b8a66c4d57b232b75670077d16", - "03f9cff31e0849328fbc0bcdae68504a", - "b1febffea0fc4a29a1ddaa089b46f963", - "b9ce748b6bca4a4fb87ee78d123cec4f", - "fd60da5ac18443ba84fa1a3019d4ec17", - "670125413ddd416dba9691a0eecd5bf2", - "e1f215647df14777b76565f9e17b6b42", - "6ee0d336f8cf4e54bc9b16b102a6f6a6", - "d6b5087105694b1d9d31ca752d7a1a02", - "5a1688724bf541df9a663b4e3fbb9c4a", - "158a4e71807c4035898f571d090196c4", - "eedcb59087aa463787ccfb431491612e", - "03b2bf55225a4f3b90c58c6a2aebd8d6", - "e03bab197a814f979d1f580da8cfd34f", - "5af2961b3ba3480cb89f8ac474f017b6", - "73ebf2da268a447ebc76b53a46efe7ae", - "1db65061f028451585293514adcd15a0", - "e0c9bff56942449aab8ebde097afe65a", - "df6fbe6070e545fea98371b38b1a7968", - "a855a3255f064400a72323cd7012f00e", - "4940c4b39d1e4f1699f2a831310b92b3", - "19f001d8e3be4674b582cbeaa6769ac1", - "5e86f09721874d8db44266e8efdd756d", - "941982def08449689fa10b9060df21dd", - "04b9564aab874822883cd67bab2693da", - "b649e106a52c45369c60f52dc29858fe", - "971fce9704734bbcbff90ee730238a53", - "a649d60a24c0459a8910631de2d4ba09", - "21303552af4540beaefa93507328127d", - "0aab68e6209743efbe733ab76d22d522", - "f89e56bb9b174fc9886578897f29a129", - "06507c6d96ce42139c5b07b6071fb187", - "5e6208deee5e4ee891d2b79ee36c4b3c", - "69ae18d4f80843889ed37540ee99f6ee", - "8e73282bb7c7443dbb8b50fb291f7596", - "abd0b36bd6714517bb480f8bbb212ed1", - "8e76d740007c406fa93bfe1260ac73f4", - "a3d2d1e0d5b84a158ed651509cd7f71e", - "0f95972368cf4c9bafcbae7bd7e6af17", - "9ba93d6ff0f64bb19bcb968b9a5d2786", - "239ee55351684b15a86fc7b1216c084a", - "f3f27505e47d4d609433fc39898daa01", - "b6f5e8a982904d5e83e734d3efe3e9ca", - "aa05663dbc2446358f023015125340e0" - ] - }, - "id": "1KO14D7FDYbV", - "outputId": "96c0057a-d0f4-43c4-8cd0-f88f08b31a4d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -323,23 +243,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "M9DSQEJgBeLE" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "eQMgq6JyBeLF", - "outputId": "64ff36e5-91c1-4078-d616-1761300a106b" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -361,9 +273,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "j2vb2xNZBeLG" - }, + "metadata": {}, "source": [ "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to assets folder which Spark NLP will look for\n", "- In addition to vocabs, we also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" @@ -371,10 +281,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "QCzfmhNwFg9J" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!mkdir {ONNX_MODEL}/assets" @@ -382,10 +290,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "8762MVaBBeLG" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary\n", @@ -399,10 +305,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "aDlALBQrBeLG" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" @@ -410,23 +314,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "96PWQ6MYBeLG" - }, + "metadata": {}, "source": [ "Voila! We have our `sentencepiece.bpe.model` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "onqDAch3BeLH", - "outputId": "17b0d23f-6653-4610-8145-5fbfd229d714" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -454,18 +350,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "HNqY0JX7BeLH" - }, + "metadata": {}, "source": [ "## Import and Save CamemBertForSequenceClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "KBftz1gHBeLH" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -473,14 +365,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CNsH1BkpBeLH", - "outputId": "50ba7045-a29e-4501-b2c7-4fcd2894c6b5" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -518,23 +404,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "1GFZq_URBeLI" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5Ti3X-BJBeLI", - "outputId": "c4fef397-1747-4def-e3c8-f84d88b4840d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -554,9 +432,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4RPu9eDfBeLI" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `CamemBertForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `CamemBertForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -566,10 +442,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "gFIeDIyVBeLI" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -586,19 +460,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "iDgUKFgyBeLI" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "x5wvcWXrBeLI" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" @@ -606,19 +476,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "vSjvf8woBeLI" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "Blq6dSf1BeLJ" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {ONNX_MODEL}" @@ -626,9 +492,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "NR0Dge9gBeLJ" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -637,14 +501,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "xKvpIhytBeLJ", - "outputId": "46a0c1ff-c423-483a-d97d-fa66080a055c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -664,19 +522,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "GylPMYsEBeLJ" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForSequenceClassification model 😊" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "PeY7xqh7BeLJ" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "sequenceClassifier_loaded = CamemBertForSequenceClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", @@ -686,23 +540,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ZRNL6MuYBeLJ" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wkYTgXO5BeLJ", - "outputId": "d85f21b9-b545-4033-a67c-85b3fef033ea" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -710,7 +556,7 @@ "['NEGATIVE', 'POSITIVE']" ] }, - "execution_count": 17, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -722,23 +568,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "1ROrCoh7BeLK" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "752d8NgmBeLK", - "outputId": "869ea71a-88fe-45bc-890b-7600c1d4c285" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -785,9 +623,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "jdXhBn3wBeLK" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `CamemBertForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" ] @@ -813,8 +649,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForTokenClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForTokenClassification.ipynb index 89488034812dc1..89028457390477 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForTokenClassification.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CamemBertForTokenClassification.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "t609_kwpJbwZ" - }, + "metadata": {}, "source": [ "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", "\n", @@ -13,9 +11,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "zAKzkwH5Jbwf" - }, + "metadata": {}, "source": [ "## Import ONNX CamemBertForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", "\n", @@ -30,34 +26,24 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "4Wd_fyDVJbwg" - }, + "metadata": {}, "source": [ "## Export and Save HuggingFace model" ] }, { "cell_type": "markdown", - "metadata": { - "id": "dyDoTKBbJbwg" - }, + "metadata": {}, "source": [ "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", - "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases\n", "- CamembertTokenizer requires the `SentencePiece` library, so we install that as well" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Mm0OMPsCJbwh", - "outputId": "153cedd1-9f53-450f-d595-172914745937" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -126,14 +112,12 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece tensorflow" + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum sentencepiece tensorflow" ] }, { "cell_type": "markdown", - "metadata": { - "id": "MvbxodR1Jbwi" - }, + "metadata": {}, "source": [ "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", "- We'll use [Jean-Baptiste/camembert-ner](https://huggingface.co/Jean-Baptiste/camembert-ner) model from HuggingFace as an example\n", @@ -142,72 +126,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 281, - "referenced_widgets": [ - "2ba8ff842c074fab8fab3bc525260a11", - "5e9677b9d95f4543b87e3fb4e09e6bde", - "0b62c365f7704f46a9ef959ba529a159", - "b73eb7c03f064876b4380e82cb8e583c", - "ccc1c9742b1c484b9ea4ad616b71a87d", - "e41b4406b5b349259880f92990a334b5", - "ed3989d2c8dc4986ba7bb8a014f49c1a", - "f5c2ec307a9b4ee6a39e9748e19eb5d5", - "c55bf3c58b8a46e7abee8d7fea7a1561", - "409a90ec548e462c80a97891ec686aa3", - "dcc62c0294ef4334b3508984c22b5a70", - "2a29fc0fffc2482487f63ab522ec35db", - "39048167e0ad4015be7f8f51419206d9", - "c777b6fec9f143d281ebe9ba4103d283", - "82c60427d78a4b639ee203f9ee788683", - "ab7bf1c3000048ff87797d068f1367a8", - "edd7b3df09a84b0488369090591f656d", - "aa825fbcab524208a21f35e33d058290", - "9ac67d31c7ac450a87a5d0222c42e35c", - "14368fb00c55452fa76ba6497e6d942e", - "05772272612a4848a196233f6cf75811", - "9e8b12bafe0c4fcf98fccdc8dfb1c3ed", - "3d9433bb5f0c407eb11e9790fdefd8df", - "b0324bc4bda04fb48a5a7b4c56fb9026", - "edfb6a656ec644b8b76148003cea9f94", - "688d5fa38a8d42cebefbc07fa6a45f47", - "7cfb6ecb38834276afdd3822ab258dc7", - "b3c59848de5b4a9d8ef10590dac6db43", - "f2da19a7e20b47a98fc36a82946ce2ff", - "a181ef0216ba49a19fc8520e04eb905d", - "6e83b3a5aa1a43a6b98b99cae4661936", - "fecfdac50bd6472e93eef245b1194670", - "806acc042ca143759e47d2814e229759", - "16c1c0ff48d54d61a8f669d945e95c5d", - "6d474cbdef9749e8a57a6d36b8f47646", - "660aa338eb314cc0843e470b209a80e4", - "b3b40e6fa37b476c9053a1abeb6f5161", - "b6055e80cd98444d9ee1d62a49888abf", - "22450b273b9b4d19ae09b082f7e1f88f", - "0893cfe3bf4a41fc8ea582ab795f10fa", - "734fb73ad2794b54bc9ca3c8cae64d13", - "6aee6131b2404ee2aeea75798a46b8b8", - "ea23116c7bfc48a6942c826feb08f58b", - "697717fe14444024bc24b23e41e31dbb", - "e2d79e49240d42e4981874f04fb81f79", - "c46d3e59f2b34fc88bbb66058b24d7ed", - "b26578c8ddeb4ae7acd8a29fa3c66092", - "b170b6bf9cc648f9b42e04502afd830d", - "3e5bff8b13c44da8923242dd57faf147", - "e6d35d4877984babb780f46b7b747c5a", - "115c9ab451834e76a911aa51b1bffd06", - "304abae09efb4f13b6a9be85e2439311", - "ffcadb4b3181404c8b9ce05faaca3a96", - "c9cf94eccc5545dea21618e802725ebe", - "21f40edbe45948e5add865e8dcdd2fa6" - ] - }, - "id": "69dqor6oJbwj", - "outputId": "dde2fba0-185e-45d8-dfd0-76662ee03e16" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -313,23 +233,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "b_izs6-IJbwj" - }, + "metadata": {}, "source": [ "Let's have a look inside these two directories and see what we are dealing with:" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MlDEX0RPJbwk", - "outputId": "2d6c2275-8c48-41ab-e46d-b019d9aa2ab4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -351,10 +263,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "LlNnvSmEJbwl" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!mkdir {ONNX_MODEL}/assets" @@ -362,9 +272,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MMgh4l6YJbwl" - }, + "metadata": {}, "source": [ "- As you can see, we need to move `spiece.model` from the tokenizer to assets folder which Spark NLP will look for\n", "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" @@ -372,10 +280,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "o9Iwtm4yJbwm" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# get label2id dictionary\n", @@ -389,10 +295,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "uXyBoTy8RGXs" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" @@ -400,23 +304,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "iVWGRQVeJbwm" - }, + "metadata": {}, "source": [ "Voila! We have our `sentencepiece.bpe.model` and `labels.txt` inside assets directory" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zS2XOFsgJbwm", - "outputId": "093f2354-e749-4620-84cc-61470b61485d" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -444,18 +340,14 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "cDYX_LrGJbwm" - }, + "metadata": {}, "source": [ "## Import and Save CamemBertForTokenClassification in Spark NLP\n" ] }, { "cell_type": "markdown", - "metadata": { - "id": "lDxYBOKvJbwn" - }, + "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" @@ -463,14 +355,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0BRm0BAtJbwn", - "outputId": "0e86c28b-403a-43d4-c53a-4f157f5c2b7a" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -492,23 +378,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "V_4on1mZJbwn" - }, + "metadata": {}, "source": [ "Let's start Spark with Spark NLP included via our simple `start()` function" ] }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EEgIAj-1Jbwo", - "outputId": "8dbd34fc-f958-4601-8748-45f5874b56f4" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -528,9 +406,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "ny4my0lWJbwo" - }, + "metadata": {}, "source": [ "- Let's use `loadSavedModel` functon in `CamemBertForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", "- Most params can be set later when you are loading this model in `CamemBertForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", @@ -540,10 +416,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "DfyIDo5YJbwo" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sparknlp.annotator import *\n", @@ -561,19 +435,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "F1-FhOlmJbwo" - }, + "metadata": {}, "source": [ "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "23ttd2FZJbwo" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" @@ -581,19 +451,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "P0DfbHYqJbwo" - }, + "metadata": {}, "source": [ "Let's clean up stuff we don't need anymore" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "fmAPpXxAJbwp" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "!rm -rf {ONNX_MODEL}" @@ -601,9 +467,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "MWUOPc0ZJbwp" - }, + "metadata": {}, "source": [ "Awesome 😎 !\n", "\n", @@ -612,14 +476,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "khdBEds6Jbwp", - "outputId": "7305ca32-3b48-4900-ec0b-4166ee5d8a9c" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -639,19 +497,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "nY_AQ5wjJbwp" - }, + "metadata": {}, "source": [ "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForTokenClassification model 😊" ] }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "TLaswe_-Jbwp" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "tokenClassifier_loaded = CamemBertForTokenClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", @@ -661,23 +515,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "lLf8G5u-Jbwp" - }, + "metadata": {}, "source": [ "You can see what labels were used to train this model via `getClasses` function:" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ynmsq45nJbwq", - "outputId": "68596e4a-ff57-46d9-8edf-95d849f2c9b6" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "data": { @@ -685,7 +531,7 @@ "['I-ORG', 'I-MISC', 'I-LOC', 'I-PER', 'O']" ] }, - "execution_count": 18, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -697,23 +543,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "iASxSDQXJbwq" - }, + "metadata": {}, "source": [ "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "D8cNS4atJbwq", - "outputId": "0fcfc224-dd1c-4bc2-c3bd-9a0ff84e4c4f" - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -757,9 +595,7 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "DUPss2DdJbwq" - }, + "metadata": {}, "source": [ "That's it! You can now go wild and use hundreds of `CamemBertForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" ] @@ -785,8 +621,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" + "pygments_lexer": "ipython3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb index 64ea287fc9376f..bf087ea8342937 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb @@ -28,7 +28,7 @@ "metadata": {}, "source": [ "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", - "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." ] }, { @@ -70,7 +70,7 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum" + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum" ] }, { diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBertaForQuestionAnswering.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBertaForQuestionAnswering.ipynb index 6dca7dc5b6644f..bd218a3a41eece 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBertaForQuestionAnswering.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBertaForQuestionAnswering.ipynb @@ -1,3165 +1,2993 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBertaForQuestionAnswering.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import ONNX DeBertaForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `DeBertaForQuestionAnswering` is only available since in `Spark NLP 5.2.1` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import DeBerta models trained/fine-tuned for question answering via `DeBertaForQuestionAnswering` or `TFDeBertaForQuestionAnswering`. These models are usually under `Question Answering` category and have `DeBerta` in their labels\n", + "- Reference: [TFDeBertaForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/deberta#transformers.TFDebertaForQuestionAnswering)\n", + "- Some [example models](https://huggingface.co/models?filter=deberta&pipeline_tag=question-answering)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- Albert uses SentencePiece, so we will have to install that as well" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "vizs6Bi9VdSl" - }, - "source": [ - "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBertaForQuestionAnswering.ipynb)" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m22.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m68.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m65.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m72.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m64.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m108.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m41.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m48.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m106.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m54.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m73.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m18.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m47.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m48.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m48.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m93.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m29.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m94.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m108.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m93.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow==2.11.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- - We'll use [nbroad/deberta-v3-xsmall-squad2](https://huggingface.co/nbroad/deberta-v3-xsmall-squad2) model from HuggingFace as an example and load it as a `ORTModelForQuestionAnswering`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "mNs5zLPbVdSo" - }, - "source": [ - "## Import ONNX DeBertaForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", - "- `DeBertaForQuestionAnswering` is only available since in `Spark NLP 5.2.1` and after. So please make sure you have upgraded to the latest Spark NLP release\n", - "- You can import DeBerta models trained/fine-tuned for question answering via `DeBertaForQuestionAnswering` or `TFDeBertaForQuestionAnswering`. These models are usually under `Question Answering` category and have `DeBerta` in their labels\n", - "- Reference: [TFDeBertaForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/deberta#transformers.TFDebertaForQuestionAnswering)\n", - "- Some [example models](https://huggingface.co/models?filter=deberta&pipeline_tag=question-answering)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:72: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "_pi-2aJlVdSo" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f9254e58721a48248f1730e695aded32", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Export and Save HuggingFace model" + "text/plain": [ + "config.json: 0%| | 0.00/884 [00:00=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", - "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow==2.11.0" + "text/plain": [ + "pytorch_model.bin: 0%| | 0.00/283M [00:00=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow==2.11.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [davanstrien/deberta-v3-base_fine_tuned_food_ner](https://huggingface.co/davanstrien/deberta-v3-base_fine_tuned_food_ner) model from HuggingFace as an example\n", + "- In addition to `TFDeBertaForTokenClassification` we also need to save the `DeBertaTokenizer`. This is the same for every model, these are assets needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "rllRrPX5edjW" - }, - "source": [ - "## Import ONNX DeBertaForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", - "- `DeBertaForTokenClassification` is only available since in `Spark NLP 5.1.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", - "- You can import DeBerta models trained/fine-tuned for token classification via `DeBertaForTokenClassification` or `TFDeBertaForTokenClassification`. These models are usually under `Token Classification` category and have `bert` in their labels\n", - "- Reference: [TFDeBertaForTokenClassification](https://huggingface.co/docs/transformers/model_doc/deberta#transformers.TFDebertaForTokenClassification)\n", - "- Some [example models](https://huggingface.co/models?filter=deberta&pipeline_tag=token-classification)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:72: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "BxfHE_l9edjW" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "28f58f45348b490aa1aa15e42555927f", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Export and Save HuggingFace model" + "text/plain": [ + "config.json: 0%| | 0.00/2.40k [00:00=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", - "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow==2.11.0" + "text/plain": [ + "model.safetensors: 0%| | 0.00/735M [00:00=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", - "- We'll use [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "inxS_UuG7k1K" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb)\n", + "\n", + "# Import ONNX E5 models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support for this annotator was introduced in `Spark NLP 5.1.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for E5 from HuggingFace and they have to be in `Sentence Similarity` category. Meaning, you cannot use E5 models trained/fine-tuned on a specific task such as token/sequence classification." + ] + }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Framework not specified. Using pt to export to ONNX.\n", - "Using framework PyTorch: 2.0.1+cu118\n", - "Overriding 1 configuration item(s)\n", - "\t- use_cache -> False\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "rES2GQnL7k1L" + }, + "source": [ + "## Export and Save HuggingFace model" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", - "verbose: False, log level: Level.ERROR\n", - "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", - "\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", - "\n", - "MODEL_NAME = \"intfloat/e5-small-v2\"\n", - "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(EXPORT_PATH)\n", - "\n", - "# Create directory for assets and move the tokenizer files.\n", - "# A separate folder is needed for Spark NLP.\n", - "!mkdir {EXPORT_PATH}/assets\n", - "!mv {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's have a look inside these two directories and see what we are dealing with:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "_J-t5wW37k1M" + }, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 130692\n", - "drwxr-xr-x 2 root root 4096 Sep 5 09:03 assets\n", - "-rw-r--r-- 1 root root 626 Sep 5 09:03 config.json\n", - "-rw-r--r-- 1 root root 133093467 Sep 5 09:03 model.onnx\n", - "-rw-r--r-- 1 root root 125 Sep 5 09:03 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 314 Sep 5 09:03 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 711396 Sep 5 09:03 tokenizer.json\n" - ] - } - ], - "source": [ - "!ls -l {EXPORT_PATH}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "0OBtmDTB7k1M", + "outputId": "9d07d7c3-c9c9-4666-b620-3524d7daa9de", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.0/410.0 kB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m589.8/589.8 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m54.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.8/455.8 kB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m59.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.3/5.3 MB\u001b[0m \u001b[31m68.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m46.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m50.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m31.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.2/311.2 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m45.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m63.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m74.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m61.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m76.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m80.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m75.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m83.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m76.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m80.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m74.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m52.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m68.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m77.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m65.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m73.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m84.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m69.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m77.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m76.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m34.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m70.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m86.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m70.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m61.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m82.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m35.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m64.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m90.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m37.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m82.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m57.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m80.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m78.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.1/17.1 MB\u001b[0m \u001b[31m69.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m63.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m57.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m81.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m33.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m80.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m13.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m39.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m38.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m38.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m66.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m55.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m59.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m59.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m59.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m19.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\n", + "tf-keras 2.15.1 requires tensorflow<2.16,>=2.15, but you have tensorflow 2.11.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 228\n", - "-rw-r--r-- 1 root root 231508 Sep 5 09:03 vocab.txt\n" - ] - } - ], - "source": [ - "!ls -l {EXPORT_PATH}/assets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import and Save E5 in Spark NLP\n", - "\n", - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "M6DKNctk7k1N" + }, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Installing PySpark 3.2.3 and Spark NLP 5.1.0\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.0\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m531.2/531.2 kB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } - ], - "source": [ - "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Let's use `loadSavedModel` functon in `E5Embeddings` which allows us to load the ONNX model\n", - "- Most params will be set automatically. They can also be set later after loading the model in `E5Embeddings` during runtime, so don't worry about setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "\n", - "# All these params should be identical to the original ONNX model\n", - "E5 = E5Embeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", - " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"E5\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "E5.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's clean up stuff we don't need anymore" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!rm -rf {EXPORT_PATH}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your ONNX E5 model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "Nrcm1-V-7k1N", + "outputId": "d0ba8939-127f-4600-c074-5514a3fb033f", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n", + "Framework not specified. Using pt to export the model.\n", + "Using the export variant default. Available variants are:\n", + " - default: The default ONNX variant.\n", + "Using framework PyTorch: 2.2.1+cu121\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", + "\n", + "MODEL_NAME = \"intfloat/e5-small-v2\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets\n", + "!mv {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets/" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 130008\n", - "-rw-r--r-- 1 root root 133113905 Sep 5 08:57 e5_onnx\n", - "drwxr-xr-x 3 root root 4096 Sep 5 08:57 fields\n", - "drwxr-xr-x 2 root root 4096 Sep 5 08:57 metadata\n" - ] - } - ], - "source": [ - "! ls -l {MODEL_NAME}_spark_nlp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny E5 model 😊" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sparknlp\n", - "\n", - "from sparknlp.base import *\n", - "from sparknlp.annotator import *\n", - "\n", - "document_assembler = DocumentAssembler()\\\n", - " .setInputCol(\"text\")\\\n", - " .setOutputCol(\"document\")\n", - "\n", - "E5_loaded = E5Embeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", - " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"E5\")\\\n", - "\n", - "pipeline = Pipeline(\n", - " stages = [\n", - " document_assembler,\n", - " E5_loaded\n", - " ])\n", - "\n", - "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", - "model = pipeline.fit(data)\n", - "result = model.transform(data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "EbR3_SbG7k1O" + }, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "cosFNNCM7k1P", + "outputId": "f5d098d7-aa31-4dc1-999f-5a218d26d90f", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "total 130692\n", + "drwxr-xr-x 2 root root 4096 Apr 12 11:02 assets\n", + "-rw-r--r-- 1 root root 626 Apr 12 11:02 config.json\n", + "-rw-r--r-- 1 root root 133093468 Apr 12 11:02 model.onnx\n", + "-rw-r--r-- 1 root root 125 Apr 12 11:02 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 314 Apr 12 11:02 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 711396 Apr 12 11:02 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+\n", - "| embeddings|\n", - "+--------------------+\n", - "|[-0.35357836, 0.3...|\n", - "+--------------------+\n", - "\n" - ] + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "FmE_V8JI7k1P", + "outputId": "85de6e65-e485-45e7-b0a0-eae6212bf6af", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "total 228\n", + "-rw-r--r-- 1 root root 231508 Apr 12 11:02 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jFxCu5FZ7k1R" + }, + "source": [ + "## Import and Save E5 in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script\n", + "- However, we need to upgrade Spark to a more recent version to use this annotator." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "IND1Fqp07k1R", + "outputId": "cf139caf-875c-4998-a047-fb842a7adc65", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.3.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.3.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m568.4/568.4 kB\u001b[0m \u001b[31m44.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash\n", + "! pip install -U pyspark==3.4.1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2hAdyHc_7k1R" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "ViVKUlqj7k1S" + }, + "outputs": [], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fazfE9lb7k1S" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `E5Embeddings` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `E5Embeddings` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "-soDJIu-7k1S" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "MODEL_NAME = \"intfloat/e5-small-v2\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "E5 = E5Embeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"E5\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0BU5PjTc7k1S" + }, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "jpUjHNk67k1T" + }, + "outputs": [], + "source": [ + "E5.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AbK6dSr37k1T" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "A-RW-l_87k1T" + }, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hjuhoNym7k1U" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX E5 model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "t8WxTSsS7k1U", + "outputId": "2c8417c5-5622-4aeb-fc1a-d3e50453d901", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "total 130008\n", + "-rw-r--r-- 1 root root 133113906 Apr 12 11:05 e5_onnx\n", + "drwxr-xr-x 3 root root 4096 Apr 12 11:04 fields\n", + "drwxr-xr-x 2 root root 4096 Apr 12 11:04 metadata\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qMb-ejgJ7k1U" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny E5 model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "YXy8sgat7k1U" + }, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "E5_loaded = E5Embeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"E5\")\\\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " document_assembler,\n", + " E5_loaded\n", + " ])\n", + "\n", + "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", + "model = pipeline.fit(data)\n", + "result = model.transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "G_tyqtE77k1U", + "outputId": "9ea755c6-0686-43af-91b1-f9a19006d070", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+--------------------+\n", + "| embeddings|\n", + "+--------------------+\n", + "|[-0.042928364, 0....|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "result.selectExpr(\"explode(E5.embeddings) as embeddings\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sJqE6o8m7k1Z" + }, + "source": [ + "That's it! You can now go wild and use hundreds of E5 models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" } - ], - "source": [ - "result.selectExpr(\"explode(E5.embeddings) as embeddings\").show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That's it! You can now go wild and use hundreds of E5 models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb index af5cebcfe3e9ba..9b97fabd7756d0 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb @@ -40,27 +40,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m19.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.6/380.6 kB\u001b[0m \u001b[31m33.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m27.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m54.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m44.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m79.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.3/519.3 kB\u001b[0m \u001b[31m44.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m86.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m73.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m71.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m25.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.0/410.0 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m589.8/589.8 MB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m42.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.8/455.8 kB\u001b[0m \u001b[31m31.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m44.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.3/5.3 MB\u001b[0m \u001b[31m48.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m52.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m47.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m43.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.2/311.2 kB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m39.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "tf-keras 2.15.1 requires tensorflow<2.16,>=2.15, but you have tensorflow 2.16.1 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0m" ] } @@ -82,15 +85,31 @@ "execution_count": null, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n", + "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aaa606f9a3ff42a79f352eab50bafe2e", + "model_id": "6ba49f9ffa4147d2bb4dfbd6bc76b61c", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Downloading (…)lve/main/config.json: 0%| | 0.00/571 [00:00=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow numpy==1.23.0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [arpanghoshal/EmoRoBERTa](https://huggingface.co/arpanghoshal/EmoRoBERTa) model from HuggingFace as an example and load it as a `ORTModelForSequenceClassification`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "JRK1uzQWc7l4" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5fbfd91779024dd98573a8251b72791d", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Import ONNX RoBertaForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", - "- `RoBertaForSequenceClassification` is only available since in `Spark NLP 5.1.4` and after. So please make sure you have upgraded to the latest Spark NLP release\n", - "- You can import RoBERTa models trained/fine-tuned for sequence classification via `RobertaForSequenceClassification` or `TFRobertaForSequenceClassification`. These models are usually under `Text Classification` category and have `roberta` in their labels\n", - "- Reference: [TFRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForSequenceClassification)\n", - "- Some [example models](https://huggingface.co/models?filter=roberta&pipeline_tag=text-classification)" + "text/plain": [ + "(…)shal/EmoRoBERTa/resolve/main/config.json: 0%| | 0.00/1.72k [00:00=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.\n", + "\n", + "All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "qyUn2L2gc7mF" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8bcfb10ceba3482b8a36cbe49b9ee981", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", - "- We'll use [arpanghoshal/EmoRoBERTa](https://huggingface.co/arpanghoshal/EmoRoBERTa) model from HuggingFace as an example and load it as a `ORTModelForSequenceClassification`, representing an ONNX model." + "text/plain": [ + "(…)BERTa/resolve/main/tokenizer_config.json: 0%| | 0.00/25.0 [00:00 False\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForSequenceClassification\n", - "import tensorflow as tf\n", - "\n", - "MODEL_NAME = 'arpanghoshal/EmoRoBERTa'\n", - "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(ONNX_MODEL)" + "text/plain": [ + "(…)oshal/EmoRoBERTa/resolve/main/vocab.json: 0%| | 0.00/798k [00:00 False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForSequenceClassification\n", + "import tensorflow as tf\n", + "\n", + "MODEL_NAME = 'arpanghoshal/EmoRoBERTa'\n", + "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(ONNX_MODEL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# Read the vocab JSON file\n", + "with open('{}/vocab.json'.format(ONNX_MODEL), 'r') as json_file:\n", + " tokenizer = json.load(json_file)\n", + "\n", + "# let's save the vocab as txt file\n", + "with open('{}/vocab.txt'.format(ONNX_MODEL), 'w') as keys_file:\n", + " for item in tokenizer.keys():\n", + " keys_file.write(\"%s\\n\" % item)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "ag3hROoTc7mI" - }, - "source": [ - "- As you can see, we need to move `vocab.txt` and `merges.txt` from the tokenizer to `assets` folder which Spark NLP will look for\n", - "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 491140\n", + "-rw-r--r-- 1 root root 1894 Oct 16 21:06 config.json\n", + "-rw-r--r-- 1 root root 456318 Oct 16 21:06 merges.txt\n", + "-rw-r--r-- 1 root root 499132924 Oct 16 21:06 model.onnx\n", + "-rw-r--r-- 1 root root 280 Oct 16 21:06 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 1337 Oct 16 21:06 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2108619 Oct 16 21:06 tokenizer.json\n", + "-rw-r--r-- 1 root root 798293 Oct 16 21:06 vocab.json\n", + "-rw-r--r-- 1 root root 407065 Oct 16 21:07 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- As you can see, we need to move `vocab.txt` and `merges.txt` from the tokenizer to `assets` folder which Spark NLP will look for\n", + "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get label2id dictionary\n", + "labels = ort_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/vocab.txt {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/merges.txt {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila! We have our `vocab.txt`, `merges.txt` and `labels.txt` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1lNEPm_Ic7mI" - }, - "outputs": [], - "source": [ - "!mkdir {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "onnx_models/arpanghoshal/EmoRoBERTa:\n", + "total 490296\n", + "drwxr-xr-x 2 root root 4096 Oct 16 21:08 assets\n", + "-rw-r--r-- 1 root root 1894 Oct 16 21:06 config.json\n", + "-rw-r--r-- 1 root root 499132924 Oct 16 21:06 model.onnx\n", + "-rw-r--r-- 1 root root 280 Oct 16 21:06 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 1337 Oct 16 21:06 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2108619 Oct 16 21:06 tokenizer.json\n", + "-rw-r--r-- 1 root root 798293 Oct 16 21:06 vocab.json\n", + "\n", + "onnx_models/arpanghoshal/EmoRoBERTa/assets:\n", + "total 852\n", + "-rw-r--r-- 1 root root 248 Oct 16 21:08 labels.txt\n", + "-rw-r--r-- 1 root root 456318 Oct 16 21:06 merges.txt\n", + "-rw-r--r-- 1 root root 407065 Oct 16 21:07 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -lR {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save RoBertaForSequenceClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1eD9itghc7mJ" - }, - "outputs": [], - "source": [ - "# get label2id dictionary\n", - "labels = ort_model.config.id2label\n", - "# sort the dictionary based on the id\n", - "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", - "\n", - "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", - " f.write('\\n'.join(labels))" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-10-16 21:08:22-- http://setup.johnsnowlabs.com/colab.sh\n", + "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", + "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", + "HTTP request sent, awaiting response... 302 Moved Temporarily\n", + "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", + "--2023-10-16 21:08:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1191 (1.2K) [text/plain]\n", + "Saving to: ‘STDOUT’\n", + "\n", + "- 100%[===================>] 1.16K --.-KB/s in 0s \n", + "\n", + "2023-10-16 21:08:23 (93.8 MB/s) - written to stdout [1191/1191]\n", + "\n", + "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m21.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NoAl9qo3c7mJ" - }, - "outputs": [], - "source": [ - "!mv {ONNX_MODEL}/vocab.txt {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `RoBertaForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `RoBertaForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "sequenceClassifier = RoBertaForSequenceClassification.loadSavedModel(\n", + " ONNX_MODEL,\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your RoBertaForSequenceClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Iy-uQsGBc7mJ" - }, - "outputs": [], - "source": [ - "!mv {ONNX_MODEL}/merges.txt {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 487524\n", + "drwxr-xr-x 5 root root 4096 Oct 16 21:15 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 16 21:15 metadata\n", + "-rw-r--r-- 1 root root 499209257 Oct 16 21:16 roberta_classification_onnx\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForSequenceClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequenceClassifier_loaded = RoBertaForSequenceClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "AcIBtJgYc7mK" - }, - "source": [ - "Voila! We have our `vocab.txt`, `merges.txt` and `labels.txt` inside assets directory" + "data": { + "text/plain": [ + "['disgust',\n", + " 'optimism',\n", + " 'embarrassment',\n", + " 'amusement',\n", + " 'realization',\n", + " 'surprise',\n", + " 'grief',\n", + " 'caring',\n", + " 'disapproval',\n", + " 'disappointment',\n", + " 'joy',\n", + " 'confusion',\n", + " 'excitement',\n", + " 'approval',\n", + " 'curiosity',\n", + " 'anger',\n", + " 'love',\n", + " 'admiration',\n", + " 'gratitude',\n", + " 'annoyance',\n", + " 'remorse',\n", + " 'nervousness',\n", + " 'neutral',\n", + " 'pride',\n", + " 'fear',\n", + " 'sadness',\n", + " 'desire',\n", + " 'relief']" ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "sequenceClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " sequenceClassifier_loaded\n", + "])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([[\"I love you!\"], ['I feel lucky to be here.']]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"class.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of `RoBertaForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0053473f98634c6db3fdc1a98375395e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3C62iosEc7mK", - "outputId": "640df9da-a3ed-4548-a1e9-5004b765545e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "onnx_models/arpanghoshal/EmoRoBERTa:\n", - "total 490296\n", - "drwxr-xr-x 2 root root 4096 Oct 16 21:08 assets\n", - "-rw-r--r-- 1 root root 1894 Oct 16 21:06 config.json\n", - "-rw-r--r-- 1 root root 499132924 Oct 16 21:06 model.onnx\n", - "-rw-r--r-- 1 root root 280 Oct 16 21:06 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 1337 Oct 16 21:06 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 2108619 Oct 16 21:06 tokenizer.json\n", - "-rw-r--r-- 1 root root 798293 Oct 16 21:06 vocab.json\n", - "\n", - "onnx_models/arpanghoshal/EmoRoBERTa/assets:\n", - "total 852\n", - "-rw-r--r-- 1 root root 248 Oct 16 21:08 labels.txt\n", - "-rw-r--r-- 1 root root 456318 Oct 16 21:06 merges.txt\n", - "-rw-r--r-- 1 root root 407065 Oct 16 21:07 vocab.txt\n" - ] - } + "015f0d45838e4af9af076781b7aa972d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d49da31cb9ae404c9ec91b149664869a", + "placeholder": "​", + "style": "IPY_MODEL_a9c84da30f4f427a96aa2c19abf76e68", + "value": "tf_model.h5: 100%" + } + }, + "093cdd054a864e9a874b446d8a08804d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0adaff4deffd49e5afd1ce940c7d39bb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5eb8c09ffde046fa9dc04747e68ce62a", + "IPY_MODEL_d52070540deb459fac543ed8f25235ac", + "IPY_MODEL_51ed8cb8b4e5479fab1dc8e6a68e6e51" ], - "source": [ - "!ls -lR {ONNX_MODEL}" - ] + "layout": "IPY_MODEL_f4a5589dd1fa4a969c0d1b7fc8e48899" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "zK6xJduGc7mK" - }, - "source": [ - "## Import and Save RoBertaForSequenceClassification in Spark NLP\n" - ] + "16009d1ead7b429b850233aa837a7b2d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e9869c9db5eb49b097057715ce38aa81", + "max": 1720, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1db6eb8573fe496083a9dc7eecf04423", + "value": 1720 + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "Tx7PXwVdc7mL" - }, - "source": [ - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script" - ] + "1a84293ac3ed46299b7eea091fdd974d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PDf1ieS-c7mL", - "outputId": "eadcbbe7-fde3-410e-e8b6-e35d5c2704d1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-10-16 21:08:22-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2023-10-16 21:08:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2023-10-16 21:08:23 (93.8 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m21.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } + "1d42f739e22740dd9a6a48b2ea9a842b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1db6eb8573fe496083a9dc7eecf04423": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2379cb61017b4b489b1afe1cbee69271": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2499b948f0ed4e228983136bcf5edb4a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2ae0331621f44a7695e5d9d0a020fe92": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_015f0d45838e4af9af076781b7aa972d", + "IPY_MODEL_d6bb4c9501d440c888eb46111c838879", + "IPY_MODEL_6db4fe3b814945f0a6a5cdc0e4b51f6b" ], - "source": [ - "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] + "layout": "IPY_MODEL_55431ee7275b421494d58326adc2fc6b" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "q0BWnXsac7mL" - }, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] + "3028097af9f44d4c90fa052606381fb5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QrJGDaJmc7mL", - "outputId": "deab4121-a931-40de-9f57-1bb336a6900b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apache Spark version: 3.2.3\n" - ] - } + "3042f6cff3bd471cbd98f56175051895": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "32e846056bf14e16a5b232a73a947c01": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9ca99a56ef5a4bcfad453e03db0da9c3", + "IPY_MODEL_52b4033e2cec4d3eb4988fd1974782ab", + "IPY_MODEL_da76feb9e0e842e7b00fcdde6bf8f06b" ], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()\n", - "\n", - "print(\"Apache Spark version: {}\".format(spark.version))" - ] + "layout": "IPY_MODEL_3042f6cff3bd471cbd98f56175051895" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "OJwHAwCBc7mM" - }, - "source": [ - "- Let's use `loadSavedModel` functon in `RoBertaForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", - "- Most params can be set later when you are loading this model in `RoBertaForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." - ] + "3669d7ae2362449da4a0a0780d5f63c5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GTuQL16tc7mM" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *\n", - "\n", - "sequenceClassifier = RoBertaForSequenceClassification.loadSavedModel(\n", - " ONNX_MODEL,\n", - " spark\n", - " )\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"class\")\\\n", - " .setCaseSensitive(True)\\\n", - " .setMaxSentenceLength(128)" - ] + "391db5e30ad6496992ea0cb6d3b9987a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c997ceb059a146fcb0c703351c1761dc", + "max": 798293, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8096ba7c617848bd8d55da036098e1f1", + "value": 798293 + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "onEFpv7Tc7mM" - }, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] + "39a48467ad544aa2a87051d2f20a40b8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UJaNm35nc7mM" - }, - "outputs": [], - "source": [ - "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" - ] + "3b447409a2be4ae885e660e1b4466f98": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "790iTH95c7mN" - }, - "source": [ - "Let's clean up stuff we don't need anymore" - ] + "3ba005b695274184a587bc747e1b1f2f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KD-CLAnPc7mN" - }, - "outputs": [], - "source": [ - "!rm -rf {ONNX_MODEL}" - ] + "3dcf304e43d548398c3a1ec31e35d175": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "92VPKttJc7mN" - }, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your RoBertaForSequenceClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] + "3e10082ca6d84864b50d9af3732ab3e0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "N_nIp5DOc7mN", - "outputId": "f10a34d1-9792-418c-9571-a9f3ebaa371e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 487524\n", - "drwxr-xr-x 5 root root 4096 Oct 16 21:15 fields\n", - "drwxr-xr-x 2 root root 4096 Oct 16 21:15 metadata\n", - "-rw-r--r-- 1 root root 499209257 Oct 16 21:16 roberta_classification_onnx\n" - ] - } + "41873434547e4ab3b18ca625d92a5b7e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4bf72dff2ada442297768dc3cdf5a128": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5157e3971ff5473f97f7e8289664740a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "51ed8cb8b4e5479fab1dc8e6a68e6e51": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3028097af9f44d4c90fa052606381fb5", + "placeholder": "​", + "style": "IPY_MODEL_f95544b3034e4a8c913d7214847b5ee4", + "value": " 239/239 [00:00<00:00, 9.82kB/s]" + } + }, + "525dfac737ee45d79926005b93c32651": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "52b4033e2cec4d3eb4988fd1974782ab": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3b447409a2be4ae885e660e1b4466f98", + "max": 456356, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2499b948f0ed4e228983136bcf5edb4a", + "value": 456356 + } + }, + "55431ee7275b421494d58326adc2fc6b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5bc6df5269f046d999597dbd19603b71": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5c69b2b921364ed689b86c7df266b9ac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a357207cf2364537b6b7af4bcc3a023d", + "placeholder": "​", + "style": "IPY_MODEL_8c686da42706418393303a5a20877092", + "value": "(…)shal/EmoRoBERTa/resolve/main/config.json: 100%" + } + }, + "5eb8c09ffde046fa9dc04747e68ce62a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f053be36c9bd4219819805bdd7c2d889", + "placeholder": "​", + "style": "IPY_MODEL_d5c7d6bb07f446c580cc828278d96ddc", + "value": "(…)RTa/resolve/main/special_tokens_map.json: 100%" + } + }, + "5fa90b3e28204414b4f90b9bd60f74f7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_525dfac737ee45d79926005b93c32651", + "placeholder": "​", + "style": "IPY_MODEL_b9c29e2ddc7e45798222aab437cc478d", + "value": "(…)oshal/EmoRoBERTa/resolve/main/vocab.json: 100%" + } + }, + "5fbfd91779024dd98573a8251b72791d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5c69b2b921364ed689b86c7df266b9ac", + "IPY_MODEL_16009d1ead7b429b850233aa837a7b2d", + "IPY_MODEL_7eaf31b5e29d443faa7a51b7db827591" ], - "source": [ - "! ls -l {ONNX_MODEL}_spark_nlp_onnx" - ] + "layout": "IPY_MODEL_3dcf304e43d548398c3a1ec31e35d175" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "urmb3Gjuc7mN" - }, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForSequenceClassification model 😊" - ] + "6c4a17cdb4ef4b9fa1149b0974abea15": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mvLEGwPSc7mO" - }, - "outputs": [], - "source": [ - "sequenceClassifier_loaded = RoBertaForSequenceClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"class\")" - ] + "6db4fe3b814945f0a6a5cdc0e4b51f6b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d0bffccc8dd241bd9c3f9aea84f9df91", + "placeholder": "​", + "style": "IPY_MODEL_2379cb61017b4b489b1afe1cbee69271", + "value": " 501M/501M [00:01<00:00, 212MB/s]" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "exdim7FZc7mO" - }, - "source": [ - "You can see what labels were used to train this model via `getClasses` function:" - ] + "7ae2064f5300443bb2fd19479fb27153": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "umjbpKeGc7mO", - "outputId": "1b989ddf-cbb4-4870-bfe0-39f38c420926" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['disgust',\n", - " 'optimism',\n", - " 'embarrassment',\n", - " 'amusement',\n", - " 'realization',\n", - " 'surprise',\n", - " 'grief',\n", - " 'caring',\n", - " 'disapproval',\n", - " 'disappointment',\n", - " 'joy',\n", - " 'confusion',\n", - " 'excitement',\n", - " 'approval',\n", - " 'curiosity',\n", - " 'anger',\n", - " 'love',\n", - " 'admiration',\n", - " 'gratitude',\n", - " 'annoyance',\n", - " 'remorse',\n", - " 'nervousness',\n", - " 'neutral',\n", - " 'pride',\n", - " 'fear',\n", - " 'sadness',\n", - " 'desire',\n", - " 'relief']" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } + "7eaf31b5e29d443faa7a51b7db827591": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3e10082ca6d84864b50d9af3732ab3e0", + "placeholder": "​", + "style": "IPY_MODEL_6c4a17cdb4ef4b9fa1149b0974abea15", + "value": " 1.72k/1.72k [00:00<00:00, 35.4kB/s]" + } + }, + "8096ba7c617848bd8d55da036098e1f1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8bcfb10ceba3482b8a36cbe49b9ee981": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9b641237a71c4bdf859cb4156e0d1bf3", + "IPY_MODEL_9677f089e0d2429180c9da716359a330", + "IPY_MODEL_a7f11a68b922413baf9b08f31404bf99" ], - "source": [ - "# .getClasses was introduced in spark-nlp==3.4.0\n", - "sequenceClassifier_loaded.getClasses()" - ] + "layout": "IPY_MODEL_7ae2064f5300443bb2fd19479fb27153" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "lV66JB3oc7mP" - }, - "source": [ - "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" - ] + "8c4562067daf4971920ddb36672e6c9c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lSAqXURrc7mP" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *\n", - "\n", - "document_assembler = DocumentAssembler() \\\n", - " .setInputCol('text') \\\n", - " .setOutputCol('document')\n", - "\n", - "tokenizer = Tokenizer() \\\n", - " .setInputCols(['document']) \\\n", - " .setOutputCol('token')\n", - "\n", - "pipeline = Pipeline(stages=[\n", - " document_assembler,\n", - " tokenizer,\n", - " sequenceClassifier_loaded\n", - "])\n", - "\n", - "# couple of simple examples\n", - "example = spark.createDataFrame([[\"I love you!\"], ['I feel lucky to be here.']]).toDF(\"text\")\n", - "\n", - "result = pipeline.fit(example).transform(example)\n", - "\n", - "# result is a DataFrame\n", - "result.select(\"text\", \"class.result\").show()" - ] + "8c686da42706418393303a5a20877092": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "HK07UoXQc7mP" - }, - "source": [ - "That's it! You can now go wild and use hundreds of `RoBertaForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "0053473f98634c6db3fdc1a98375395e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "015f0d45838e4af9af076781b7aa972d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d49da31cb9ae404c9ec91b149664869a", - "placeholder": "​", - "style": "IPY_MODEL_a9c84da30f4f427a96aa2c19abf76e68", - "value": "tf_model.h5: 100%" - } - }, - "093cdd054a864e9a874b446d8a08804d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0adaff4deffd49e5afd1ce940c7d39bb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_5eb8c09ffde046fa9dc04747e68ce62a", - "IPY_MODEL_d52070540deb459fac543ed8f25235ac", - "IPY_MODEL_51ed8cb8b4e5479fab1dc8e6a68e6e51" - ], - "layout": "IPY_MODEL_f4a5589dd1fa4a969c0d1b7fc8e48899" - } - }, - "16009d1ead7b429b850233aa837a7b2d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e9869c9db5eb49b097057715ce38aa81", - "max": 1720, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1db6eb8573fe496083a9dc7eecf04423", - "value": 1720 - } - }, - "1a84293ac3ed46299b7eea091fdd974d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "1d42f739e22740dd9a6a48b2ea9a842b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1db6eb8573fe496083a9dc7eecf04423": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "2379cb61017b4b489b1afe1cbee69271": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "2499b948f0ed4e228983136bcf5edb4a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "2ae0331621f44a7695e5d9d0a020fe92": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_015f0d45838e4af9af076781b7aa972d", - "IPY_MODEL_d6bb4c9501d440c888eb46111c838879", - "IPY_MODEL_6db4fe3b814945f0a6a5cdc0e4b51f6b" - ], - "layout": "IPY_MODEL_55431ee7275b421494d58326adc2fc6b" - } - }, - "3028097af9f44d4c90fa052606381fb5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3042f6cff3bd471cbd98f56175051895": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "32e846056bf14e16a5b232a73a947c01": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_9ca99a56ef5a4bcfad453e03db0da9c3", - "IPY_MODEL_52b4033e2cec4d3eb4988fd1974782ab", - "IPY_MODEL_da76feb9e0e842e7b00fcdde6bf8f06b" - ], - "layout": "IPY_MODEL_3042f6cff3bd471cbd98f56175051895" - } - }, - "3669d7ae2362449da4a0a0780d5f63c5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "391db5e30ad6496992ea0cb6d3b9987a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c997ceb059a146fcb0c703351c1761dc", - "max": 798293, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_8096ba7c617848bd8d55da036098e1f1", - "value": 798293 - } - }, - "39a48467ad544aa2a87051d2f20a40b8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3b447409a2be4ae885e660e1b4466f98": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3ba005b695274184a587bc747e1b1f2f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3dcf304e43d548398c3a1ec31e35d175": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3e10082ca6d84864b50d9af3732ab3e0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "41873434547e4ab3b18ca625d92a5b7e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4bf72dff2ada442297768dc3cdf5a128": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5157e3971ff5473f97f7e8289664740a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "51ed8cb8b4e5479fab1dc8e6a68e6e51": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3028097af9f44d4c90fa052606381fb5", - "placeholder": "​", - "style": "IPY_MODEL_f95544b3034e4a8c913d7214847b5ee4", - "value": " 239/239 [00:00<00:00, 9.82kB/s]" - } - }, - "525dfac737ee45d79926005b93c32651": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "52b4033e2cec4d3eb4988fd1974782ab": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3b447409a2be4ae885e660e1b4466f98", - "max": 456356, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_2499b948f0ed4e228983136bcf5edb4a", - "value": 456356 - } - }, - "55431ee7275b421494d58326adc2fc6b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5bc6df5269f046d999597dbd19603b71": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5c69b2b921364ed689b86c7df266b9ac": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a357207cf2364537b6b7af4bcc3a023d", - "placeholder": "​", - "style": "IPY_MODEL_8c686da42706418393303a5a20877092", - "value": "(…)shal/EmoRoBERTa/resolve/main/config.json: 100%" - } - }, - "5eb8c09ffde046fa9dc04747e68ce62a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f053be36c9bd4219819805bdd7c2d889", - "placeholder": "​", - "style": "IPY_MODEL_d5c7d6bb07f446c580cc828278d96ddc", - "value": "(…)RTa/resolve/main/special_tokens_map.json: 100%" - } - }, - "5fa90b3e28204414b4f90b9bd60f74f7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_525dfac737ee45d79926005b93c32651", - "placeholder": "​", - "style": "IPY_MODEL_b9c29e2ddc7e45798222aab437cc478d", - "value": "(…)oshal/EmoRoBERTa/resolve/main/vocab.json: 100%" - } - }, - "5fbfd91779024dd98573a8251b72791d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_5c69b2b921364ed689b86c7df266b9ac", - "IPY_MODEL_16009d1ead7b429b850233aa837a7b2d", - "IPY_MODEL_7eaf31b5e29d443faa7a51b7db827591" - ], - "layout": "IPY_MODEL_3dcf304e43d548398c3a1ec31e35d175" - } - }, - "6c4a17cdb4ef4b9fa1149b0974abea15": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "6db4fe3b814945f0a6a5cdc0e4b51f6b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d0bffccc8dd241bd9c3f9aea84f9df91", - "placeholder": "​", - "style": "IPY_MODEL_2379cb61017b4b489b1afe1cbee69271", - "value": " 501M/501M [00:01<00:00, 212MB/s]" - } - }, - "7ae2064f5300443bb2fd19479fb27153": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7eaf31b5e29d443faa7a51b7db827591": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3e10082ca6d84864b50d9af3732ab3e0", - "placeholder": "​", - "style": "IPY_MODEL_6c4a17cdb4ef4b9fa1149b0974abea15", - "value": " 1.72k/1.72k [00:00<00:00, 35.4kB/s]" - } - }, - "8096ba7c617848bd8d55da036098e1f1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "8bcfb10ceba3482b8a36cbe49b9ee981": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_9b641237a71c4bdf859cb4156e0d1bf3", - "IPY_MODEL_9677f089e0d2429180c9da716359a330", - "IPY_MODEL_a7f11a68b922413baf9b08f31404bf99" - ], - "layout": "IPY_MODEL_7ae2064f5300443bb2fd19479fb27153" - } - }, - "8c4562067daf4971920ddb36672e6c9c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "8c686da42706418393303a5a20877092": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "952294f9ffbf48648b3f1cdd961e3aed": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_5fa90b3e28204414b4f90b9bd60f74f7", - "IPY_MODEL_391db5e30ad6496992ea0cb6d3b9987a", - "IPY_MODEL_c9ad96305414428cbf85fe47a9190e43" - ], - "layout": "IPY_MODEL_5bc6df5269f046d999597dbd19603b71" - } - }, - "9677f089e0d2429180c9da716359a330": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0053473f98634c6db3fdc1a98375395e", - "max": 25, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1a84293ac3ed46299b7eea091fdd974d", - "value": 25 - } - }, - "99be2ddc8fdd4b0ca92ac3c404f165fc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "9b641237a71c4bdf859cb4156e0d1bf3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f9159108ec0248f7ad72963860d8a225", - "placeholder": "​", - "style": "IPY_MODEL_8c4562067daf4971920ddb36672e6c9c", - "value": "(…)BERTa/resolve/main/tokenizer_config.json: 100%" - } - }, - "9ca99a56ef5a4bcfad453e03db0da9c3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3669d7ae2362449da4a0a0780d5f63c5", - "placeholder": "​", - "style": "IPY_MODEL_39a48467ad544aa2a87051d2f20a40b8", - "value": "(…)oshal/EmoRoBERTa/resolve/main/merges.txt: 100%" - } - }, - "a357207cf2364537b6b7af4bcc3a023d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a7f11a68b922413baf9b08f31404bf99": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fcc3121c28b446429a3dada8413b26d4", - "placeholder": "​", - "style": "IPY_MODEL_093cdd054a864e9a874b446d8a08804d", - "value": " 25.0/25.0 [00:00<00:00, 1.64kB/s]" - } - }, - "a9c84da30f4f427a96aa2c19abf76e68": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ad20fa821a3c45369ca41b78c34d7410": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b9c29e2ddc7e45798222aab437cc478d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c0cbff305800422e81169bbf6148a06e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "c997ceb059a146fcb0c703351c1761dc": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c9ad96305414428cbf85fe47a9190e43": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_41873434547e4ab3b18ca625d92a5b7e", - "placeholder": "​", - "style": "IPY_MODEL_ad20fa821a3c45369ca41b78c34d7410", - "value": " 798k/798k [00:00<00:00, 3.23MB/s]" - } - }, - "d0bffccc8dd241bd9c3f9aea84f9df91": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d49da31cb9ae404c9ec91b149664869a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d52070540deb459fac543ed8f25235ac": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1d42f739e22740dd9a6a48b2ea9a842b", - "max": 239, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_c0cbff305800422e81169bbf6148a06e", - "value": 239 - } - }, - "d5c7d6bb07f446c580cc828278d96ddc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d6bb4c9501d440c888eb46111c838879": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5157e3971ff5473f97f7e8289664740a", - "max": 501322656, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_99be2ddc8fdd4b0ca92ac3c404f165fc", - "value": 501322656 - } - }, - "da76feb9e0e842e7b00fcdde6bf8f06b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4bf72dff2ada442297768dc3cdf5a128", - "placeholder": "​", - "style": "IPY_MODEL_3ba005b695274184a587bc747e1b1f2f", - "value": " 456k/456k [00:00<00:00, 937kB/s]" - } - }, - "e9869c9db5eb49b097057715ce38aa81": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f053be36c9bd4219819805bdd7c2d889": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f4a5589dd1fa4a969c0d1b7fc8e48899": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f9159108ec0248f7ad72963860d8a225": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f95544b3034e4a8c913d7214847b5ee4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fcc3121c28b446429a3dada8413b26d4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - } + "952294f9ffbf48648b3f1cdd961e3aed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5fa90b3e28204414b4f90b9bd60f74f7", + "IPY_MODEL_391db5e30ad6496992ea0cb6d3b9987a", + "IPY_MODEL_c9ad96305414428cbf85fe47a9190e43" + ], + "layout": "IPY_MODEL_5bc6df5269f046d999597dbd19603b71" + } + }, + "9677f089e0d2429180c9da716359a330": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0053473f98634c6db3fdc1a98375395e", + "max": 25, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1a84293ac3ed46299b7eea091fdd974d", + "value": 25 + } + }, + "99be2ddc8fdd4b0ca92ac3c404f165fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9b641237a71c4bdf859cb4156e0d1bf3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f9159108ec0248f7ad72963860d8a225", + "placeholder": "​", + "style": "IPY_MODEL_8c4562067daf4971920ddb36672e6c9c", + "value": "(…)BERTa/resolve/main/tokenizer_config.json: 100%" + } + }, + "9ca99a56ef5a4bcfad453e03db0da9c3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3669d7ae2362449da4a0a0780d5f63c5", + "placeholder": "​", + "style": "IPY_MODEL_39a48467ad544aa2a87051d2f20a40b8", + "value": "(…)oshal/EmoRoBERTa/resolve/main/merges.txt: 100%" + } + }, + "a357207cf2364537b6b7af4bcc3a023d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a7f11a68b922413baf9b08f31404bf99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fcc3121c28b446429a3dada8413b26d4", + "placeholder": "​", + "style": "IPY_MODEL_093cdd054a864e9a874b446d8a08804d", + "value": " 25.0/25.0 [00:00<00:00, 1.64kB/s]" + } + }, + "a9c84da30f4f427a96aa2c19abf76e68": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ad20fa821a3c45369ca41b78c34d7410": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b9c29e2ddc7e45798222aab437cc478d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c0cbff305800422e81169bbf6148a06e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c997ceb059a146fcb0c703351c1761dc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c9ad96305414428cbf85fe47a9190e43": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_41873434547e4ab3b18ca625d92a5b7e", + "placeholder": "​", + "style": "IPY_MODEL_ad20fa821a3c45369ca41b78c34d7410", + "value": " 798k/798k [00:00<00:00, 3.23MB/s]" + } + }, + "d0bffccc8dd241bd9c3f9aea84f9df91": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d49da31cb9ae404c9ec91b149664869a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d52070540deb459fac543ed8f25235ac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1d42f739e22740dd9a6a48b2ea9a842b", + "max": 239, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c0cbff305800422e81169bbf6148a06e", + "value": 239 + } + }, + "d5c7d6bb07f446c580cc828278d96ddc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d6bb4c9501d440c888eb46111c838879": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5157e3971ff5473f97f7e8289664740a", + "max": 501322656, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_99be2ddc8fdd4b0ca92ac3c404f165fc", + "value": 501322656 + } + }, + "da76feb9e0e842e7b00fcdde6bf8f06b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4bf72dff2ada442297768dc3cdf5a128", + "placeholder": "​", + "style": "IPY_MODEL_3ba005b695274184a587bc747e1b1f2f", + "value": " 456k/456k [00:00<00:00, 937kB/s]" + } + }, + "e9869c9db5eb49b097057715ce38aa81": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f053be36c9bd4219819805bdd7c2d889": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f4a5589dd1fa4a969c0d1b7fc8e48899": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f9159108ec0248f7ad72963860d8a225": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f95544b3034e4a8c913d7214847b5ee4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fcc3121c28b446429a3dada8413b26d4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForTokenClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForTokenClassification.ipynb index 042a46f3bfe27c..8b32924f9d79fe 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForTokenClassification.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForTokenClassification.ipynb @@ -1,3261 +1,3097 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForTokenClassification.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import ONNX RoBertaForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `RoBertaForTokenClassification` is only available since in `Spark NLP 5.1.4` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import RoBERTa models trained/fine-tuned for token classification via `RobertaForTokenClassification` or `TFRobertaForTokenClassification`. These models are usually under `Token Classification` category and have `roberta` in their labels\n", + "- Reference: [TFRobertaForTokenClassification](https://huggingface.co/transformers/model_doc/roberta.html#tfrobertafortokenclassification)\n", + "- Some [example models](https://huggingface.co/models?filter=roberta&pipeline_tag=token-classification)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases\n", + "- Albert uses SentencePiece, so we will have to install that as well" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "vctEEFUYk8Nu" - }, - "source": [ - "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForTokenClassification.ipynb)" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m53.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.0/301.0 kB\u001b[0m \u001b[31m28.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.0/302.0 kB\u001b[0m \u001b[31m32.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m79.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m81.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m24.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.6/519.6 kB\u001b[0m \u001b[31m37.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m56.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m68.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m70.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m60.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m112.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m109.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m115.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m110.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m102.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m113.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m89.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m119.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m123.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m86.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m78.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m112.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m46.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m113.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m61.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m83.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m119.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m91.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m80.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m112.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m46.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m112.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m48.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m48.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m96.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m103.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m101.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for optimum (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow-datasets 4.9.3 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [philschmid/distilroberta-base-ner-wikiann-conll2003-3-class](https://huggingface.co/philschmid/distilroberta-base-ner-wikiann-conll2003-3-class) model from HuggingFace as an example and load it as a `ORTModelForSequenceClassification`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "O3-MQYmLk8Nx" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "19403261179149178f0b54c0a125f198", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Import ONNX RoBertaForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", - "- `RoBertaForTokenClassification` is only available since in `Spark NLP 5.1.4` and after. So please make sure you have upgraded to the latest Spark NLP release\n", - "- You can import RoBERTa models trained/fine-tuned for token classification via `RobertaForTokenClassification` or `TFRobertaForTokenClassification`. These models are usually under `Token Classification` category and have `roberta` in their labels\n", - "- Reference: [TFRobertaForTokenClassification](https://huggingface.co/transformers/model_doc/roberta.html#tfrobertafortokenclassification)\n", - "- Some [example models](https://huggingface.co/models?filter=roberta&pipeline_tag=token-classification)" + "text/plain": [ + "(…)nll2003-3-class/resolve/main/config.json: 0%| | 0.00/962 [00:00=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + "text/plain": [ + "(…)class/resolve/main/tokenizer_config.json: 0%| | 0.00/293 [00:00 False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", - "verbose: False, log level: Level.ERROR\n", - "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", - "\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForTokenClassification\n", - "import tensorflow as tf\n", - "\n", - "MODEL_NAME = 'philschmid/distilroberta-base-ner-wikiann-conll2003-3-class'\n", - "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForTokenClassification.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(ONNX_MODEL)" + "text/plain": [ + "(…)onll2003-3-class/resolve/main/merges.txt: 0%| | 0.00/456k [00:00 False\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "i24WdH62k8N7" - }, - "outputs": [], - "source": [ - "!mkdir {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", + "verbose: False, log level: Level.ERROR\n", + "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", + "\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForTokenClassification\n", + "import tensorflow as tf\n", + "\n", + "MODEL_NAME = 'philschmid/distilroberta-base-ner-wikiann-conll2003-3-class'\n", + "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForTokenClassification.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(ONNX_MODEL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# Read the vocab JSON file\n", + "with open('{}/vocab.json'.format(ONNX_MODEL), 'r') as json_file:\n", + " tokenizer = json.load(json_file)\n", + "\n", + "# let's save the vocab as txt file\n", + "with open('{}/vocab.txt'.format(ONNX_MODEL), 'w') as keys_file:\n", + " for item in tokenizer.keys():\n", + " keys_file.write(\"%s\\n\" % item)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "R7clJnf-k8N7" - }, - "source": [ - "- As you can see, we need to move `vocab.txt` and `merges.txt` from the tokenizer to `assets` folder which Spark NLP will look for\n", - "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 321892\n", + "drwxr-xr-x 2 root root 4096 Oct 16 22:13 assets\n", + "-rw-r--r-- 1 root root 1034 Oct 16 22:10 config.json\n", + "-rw-r--r-- 1 root root 326278966 Oct 16 22:10 model.onnx\n", + "-rw-r--r-- 1 root root 280 Oct 16 22:10 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 350 Oct 16 22:10 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2108715 Oct 16 22:10 tokenizer.json\n", + "-rw-r--r-- 1 root root 798293 Oct 16 22:10 vocab.json\n", + "-rw-r--r-- 1 root root 407065 Oct 16 22:18 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {ONNX_MODEL}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- As you can see, we need to move `vocab.txt` and `merges.txt` from the tokenizer to `assets` folder which Spark NLP will look for\n", + "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get label2id dictionary\n", + "labels = ort_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/vocab.txt {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/merges.txt {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila! We have our `vocab.txt` and `merges.txt` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Acu1x8BQk8N7" - }, - "outputs": [], - "source": [ - "# get label2id dictionary\n", - "labels = ort_model.config.id2label\n", - "# sort the dictionary based on the id\n", - "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", - "\n", - "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", - " f.write('\\n'.join(labels))" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "onnx_models/philschmid/distilroberta-base-ner-wikiann-conll2003-3-class:\n", + "total 321892\n", + "drwxr-xr-x 2 root root 4096 Oct 16 22:18 assets\n", + "-rw-r--r-- 1 root root 1034 Oct 16 22:10 config.json\n", + "-rw-r--r-- 1 root root 326278966 Oct 16 22:10 model.onnx\n", + "-rw-r--r-- 1 root root 280 Oct 16 22:10 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 350 Oct 16 22:10 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2108715 Oct 16 22:10 tokenizer.json\n", + "-rw-r--r-- 1 root root 798293 Oct 16 22:10 vocab.json\n", + "-rw-r--r-- 1 root root 407065 Oct 16 22:18 vocab.txt\n", + "\n", + "onnx_models/philschmid/distilroberta-base-ner-wikiann-conll2003-3-class/assets:\n", + "total 852\n", + "-rw-r--r-- 1 root root 37 Oct 16 22:18 labels.txt\n", + "-rw-r--r-- 1 root root 456318 Oct 16 22:10 merges.txt\n", + "-rw-r--r-- 1 root root 407065 Oct 16 22:12 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -lR {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save RoBertaForTokenClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7Jm6IFTSk8N8" - }, - "outputs": [], - "source": [ - "!mv {ONNX_MODEL}/vocab.txt {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m26.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PfDg1SyDk8N8" - }, - "outputs": [], - "source": [ - "!mv {ONNX_MODEL}/merges.txt {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `RoBertaForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `RoBertaForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "tokenClassifier = RoBertaForTokenClassification\\\n", + " .loadSavedModel(ONNX_MODEL, spark)\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"ner\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your RoBertaForTokenClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "m4cXGPOEk8N8" - }, - "source": [ - "Voila! We have our `vocab.txt` and `merges.txt` inside assets directory" + "name": "stdout", + "output_type": "stream", + "text": [ + "total 318696\n", + "drwxr-xr-x 5 root root 4096 Oct 16 22:21 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 16 22:21 metadata\n", + "-rw-r--r-- 1 root root 326328924 Oct 16 22:21 roberta_classification_onnx\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForTokenClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenClassifier_loaded = RoBertaForTokenClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"ner\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['B-LOC', 'I-ORG', 'I-LOC', 'I-PER', 'B-ORG', 'O', 'B-PER']" ] - }, + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "tokenClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dWb6ZEtXk8N8", - "outputId": "7f110ef4-8cb5-48e1-925d-338ae57c5046" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "onnx_models/philschmid/distilroberta-base-ner-wikiann-conll2003-3-class:\n", - "total 321892\n", - "drwxr-xr-x 2 root root 4096 Oct 16 22:18 assets\n", - "-rw-r--r-- 1 root root 1034 Oct 16 22:10 config.json\n", - "-rw-r--r-- 1 root root 326278966 Oct 16 22:10 model.onnx\n", - "-rw-r--r-- 1 root root 280 Oct 16 22:10 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 350 Oct 16 22:10 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 2108715 Oct 16 22:10 tokenizer.json\n", - "-rw-r--r-- 1 root root 798293 Oct 16 22:10 vocab.json\n", - "-rw-r--r-- 1 root root 407065 Oct 16 22:18 vocab.txt\n", - "\n", - "onnx_models/philschmid/distilroberta-base-ner-wikiann-conll2003-3-class/assets:\n", - "total 852\n", - "-rw-r--r-- 1 root root 37 Oct 16 22:18 labels.txt\n", - "-rw-r--r-- 1 root root 456318 Oct 16 22:10 merges.txt\n", - "-rw-r--r-- 1 root root 407065 Oct 16 22:12 vocab.txt\n" - ] - } + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| text| result|\n", + "+--------------------+--------------------+\n", + "|My name is Clara ...|[O, O, O, B-PER, ...|\n", + "|My name is Clara ...|[O, O, O, B-PER, ...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " tokenClassifier_loaded\n", + "])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([[\"My name is Clara and I live in Berkeley, California.\"], ['My name is Clara and I live in Berkeley, California.']]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"ner.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of `RoBertaForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00d4770b7983470192967410038d0068": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c33367067b5c41529e4cb8301bb4631b", + "IPY_MODEL_f56039a6fb3f4dc7913ea06536e476c3", + "IPY_MODEL_f4f066292c894698a145d97645ef0852" ], - "source": [ - "!ls -lR {ONNX_MODEL}" - ] + "layout": "IPY_MODEL_74cda4b89a124b009c187cb98a04899d" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "W9-Fowe_k8N9" - }, - "source": [ - "## Import and Save RoBertaForTokenClassification in Spark NLP\n" - ] + "025eda03fbad4dd18d7dae72aedd0106": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "h0II8wYvk8N9" - }, - "source": [ - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script" - ] + "050dbc230ffa47e1a8b293f622b4ea57": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fb53f3bf55664c4e9aa685809d9b550f", + "max": 326181207, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7d587ac5d3ee4a89a99bc5c0b8044669", + "value": 326181207 + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gsYqnf_gk8N9", - "outputId": "9dfa476f-0c5c-48ae-daf6-0b7fb3ef4bcb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m26.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } - ], - "source": [ - "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] + "0993a78aca3348468b8615d096466b80": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "Q1MWCjcLk8N9" - }, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] + "0b89fef36cfa4301a27a58e6a1dec354": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ugpRusXwk8N9", - "outputId": "4f1f2146-f7ea-46ee-becb-c7d45b224b89" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apache Spark version: 3.2.3\n" - ] - } + "0fc0a55a8d234a17a7d725a93c45fd50": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "12eee2449390429192df0e0394598062": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "1383a4cde8674b039c59c15a63901461": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0fc0a55a8d234a17a7d725a93c45fd50", + "placeholder": "​", + "style": "IPY_MODEL_b7201dc4f9584e1c97488425a766c4c6", + "value": "(…)2003-3-class/resolve/main/tokenizer.json: 100%" + } + }, + "166671c87f7d48feafb05bb58c739600": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1383a4cde8674b039c59c15a63901461", + "IPY_MODEL_3de9ee6582f1423598931cea294c532c", + "IPY_MODEL_ac0bec7637084a0e8e51231de626f69e" ], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()\n", - "\n", - "print(\"Apache Spark version: {}\".format(spark.version))" - ] + "layout": "IPY_MODEL_d72f34d844b542b0a4e1ec0264880cab" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "pj76mzEuk8N-" - }, - "source": [ - "- Let's use `loadSavedModel` functon in `RoBertaForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", - "- Most params can be set later when you are loading this model in `RoBertaForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." - ] + "1689463b2a3d4b39bb427733c160287a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wX3vfOybk8N-" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *\n", - "\n", - "tokenClassifier = RoBertaForTokenClassification\\\n", - " .loadSavedModel(ONNX_MODEL, spark)\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"ner\")\\\n", - " .setCaseSensitive(True)\\\n", - " .setMaxSentenceLength(128)" - ] + "1898befd7f36447ea5194e2c68d00c31": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_da56089370b6403fa52b9787b84ad86d", + "placeholder": "​", + "style": "IPY_MODEL_b8ed253331fe4d4e9b7a10dd282ea172", + "value": " 326M/326M [00:06<00:00, 37.1MB/s]" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "RMulAxFNk8N-" - }, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] + "191f55fc572b4f5a9b41e0c0dbd20414": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2db07d4ad6ff49b5b5ce76ea60c655fe", + "placeholder": "​", + "style": "IPY_MODEL_528de7c76ae84ccfb4614faddf133cfb", + "value": " 962/962 [00:00<00:00, 26.2kB/s]" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dJjyUZOjk8N-" - }, - "outputs": [], - "source": [ - "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" - ] + "19403261179149178f0b54c0a125f198": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c1ac31ed4ded444586913047df105d63", + "IPY_MODEL_1ccb91d2654d47d7aa883c016a8b4e49", + "IPY_MODEL_191f55fc572b4f5a9b41e0c0dbd20414" + ], + "layout": "IPY_MODEL_595ee009a3604de7a1d1c12e127b8f85" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "y9Uez8OFk8N-" - }, - "source": [ - "Let's clean up stuff we don't need anymore" - ] + "1ccb91d2654d47d7aa883c016a8b4e49": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0b89fef36cfa4301a27a58e6a1dec354", + "max": 962, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_380ca1c370174ba58ec9b669a4e4a2ff", + "value": 962 + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uA5C0YHqk8N_" - }, - "outputs": [], - "source": [ - "!rm -rf {ONNX_MODEL}" - ] + "2094ab4f61fc4dbbb2a45b8cb10d696f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "fpI56D69k8N_" - }, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your RoBertaForTokenClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] + "29e72b30d40644b4afdd19b137f8952d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Eh1srgXGk8N_", - "outputId": "edf7b117-637b-4031-a831-4b0c254dd223" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 318696\n", - "drwxr-xr-x 5 root root 4096 Oct 16 22:21 fields\n", - "drwxr-xr-x 2 root root 4096 Oct 16 22:21 metadata\n", - "-rw-r--r-- 1 root root 326328924 Oct 16 22:21 roberta_classification_onnx\n" - ] - } + "2b8a0ac51adf4cd9b94deb879084696f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2db07d4ad6ff49b5b5ce76ea60c655fe": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f3625a6b69e4b28a0180d769ef3eafb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f5282ba4afc45d9b43b18ccd50cd984": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_389699b0b0404174be6e092dae71a8e8", + "IPY_MODEL_3112e06bfd6d41408438c93cddcd306a", + "IPY_MODEL_3b4f2c964e5d42428b9ce9db06be885c" ], - "source": [ - "! ls -l {ONNX_MODEL}_spark_nlp_onnx" - ] + "layout": "IPY_MODEL_624dde2b14de4f87b969772e7792666c" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "v4phtEi-k8N_" - }, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForTokenClassification model 😊" - ] + "3112e06bfd6d41408438c93cddcd306a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2f3625a6b69e4b28a0180d769ef3eafb", + "max": 798293, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_84fd9d1f1821408f9adbbba4d3b1ff04", + "value": 798293 + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5n0pmqzfk8N_" - }, - "outputs": [], - "source": [ - "tokenClassifier_loaded = RoBertaForTokenClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"ner\")" - ] + "33f7e429f91b4e359fd05bbef32e5a46": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "02FdQlx_k8OA" - }, - "source": [ - "You can see what labels were used to train this model via `getClasses` function:" - ] + "380ca1c370174ba58ec9b669a4e4a2ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yLWujMlek8OA", - "outputId": "ce590db8-19a7-4882-cf86-fbd028e6eeab" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['B-LOC', 'I-ORG', 'I-LOC', 'I-PER', 'B-ORG', 'O', 'B-PER']" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } + "389699b0b0404174be6e092dae71a8e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_59b60b985b14488f817d2693785d966f", + "placeholder": "​", + "style": "IPY_MODEL_f582ccbb662e4a9bbf3699c7f69a56d5", + "value": "(…)onll2003-3-class/resolve/main/vocab.json: 100%" + } + }, + "3b4f2c964e5d42428b9ce9db06be885c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f86b586715894e38abc7196a303f54e7", + "placeholder": "​", + "style": "IPY_MODEL_025eda03fbad4dd18d7dae72aedd0106", + "value": " 798k/798k [00:00<00:00, 15.4MB/s]" + } + }, + "3de9ee6582f1423598931cea294c532c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f6daf5c19e5e48e193af8b70e61bc1a3", + "max": 1355931, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1689463b2a3d4b39bb427733c160287a", + "value": 1355931 + } + }, + "4c7a6185b4e549ae82bbd69c5951b89d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ded495ef523c49779063258bcec83c4e", + "placeholder": "​", + "style": "IPY_MODEL_9daae647890b4527bf66caccc15afff8", + "value": " 239/239 [00:00<00:00, 11.6kB/s]" + } + }, + "4dd732d7405d4c8bac2b1d4297ec8088": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "528de7c76ae84ccfb4614faddf133cfb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "595ee009a3604de7a1d1c12e127b8f85": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59b60b985b14488f817d2693785d966f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "624dde2b14de4f87b969772e7792666c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74aa18fd09fb43f099973ef2c77f1fea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "74cda4b89a124b009c187cb98a04899d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7d587ac5d3ee4a89a99bc5c0b8044669": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7d9fe46aa745480e9947350f443be964": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "82cac82421da40a4bb21aacad13aef90": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "84fd9d1f1821408f9adbbba4d3b1ff04": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8572225a06a14ca8965a8039b2298070": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "885a765e32834db28e6a6aa47a853a8f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8caea9c1009646e9839e9e410f1006b8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "985ccb56f8df409d97836c83c7f57e44": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0993a78aca3348468b8615d096466b80", + "placeholder": "​", + "style": "IPY_MODEL_33f7e429f91b4e359fd05bbef32e5a46", + "value": "pytorch_model.bin: 100%" + } + }, + "9b31a5e52daf472b8900efa4b0c396b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b006c4b472c341ab9fe9781a79c78c0a", + "max": 293, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_12eee2449390429192df0e0394598062", + "value": 293 + } + }, + "9ba0de8569964cb6a092f0359711d28e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9daae647890b4527bf66caccc15afff8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a4d075edad9243b5af25945e727e011f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a4fc8b5fe2a643e384545787ac7f0f98": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e542527c11944d088846505d08c52806", + "IPY_MODEL_9b31a5e52daf472b8900efa4b0c396b1", + "IPY_MODEL_e0a9517ffb70428cba4d8b8749603444" ], - "source": [ - "# .getClasses was introduced in spark-nlp==3.4.0\n", - "tokenClassifier_loaded.getClasses()" - ] + "layout": "IPY_MODEL_af0ef342d5b14b86b18bfe3dba1c6b9f" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "zV9cWCX3k8OA" - }, - "source": [ - "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" - ] + "a56652b29f014151ad02630853888abe": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y1-iMx67k8OA", - "outputId": "e1bcf11b-91f3-416d-fe08-6ba3d96ba9a5" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+--------------------+\n", - "| text| result|\n", - "+--------------------+--------------------+\n", - "|My name is Clara ...|[O, O, O, B-PER, ...|\n", - "|My name is Clara ...|[O, O, O, B-PER, ...|\n", - "+--------------------+--------------------+\n", - "\n" - ] - } + "ac0bec7637084a0e8e51231de626f69e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2b8a0ac51adf4cd9b94deb879084696f", + "placeholder": "​", + "style": "IPY_MODEL_7d9fe46aa745480e9947350f443be964", + "value": " 1.36M/1.36M [00:00<00:00, 26.4MB/s]" + } + }, + "af0ef342d5b14b86b18bfe3dba1c6b9f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "afe9cc266e03429b84d094ab1cb29a97": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b006c4b472c341ab9fe9781a79c78c0a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b0491ab858894635bcaae3b307db5d71": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b7201dc4f9584e1c97488425a766c4c6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b8ed253331fe4d4e9b7a10dd282ea172": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c1ac31ed4ded444586913047df105d63": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dcd04fa06ba744b697e172fa8365b009", + "placeholder": "​", + "style": "IPY_MODEL_a4d075edad9243b5af25945e727e011f", + "value": "(…)nll2003-3-class/resolve/main/config.json: 100%" + } + }, + "c33367067b5c41529e4cb8301bb4631b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9ba0de8569964cb6a092f0359711d28e", + "placeholder": "​", + "style": "IPY_MODEL_4dd732d7405d4c8bac2b1d4297ec8088", + "value": "(…)onll2003-3-class/resolve/main/merges.txt: 100%" + } + }, + "c3a63933131d4d1788b3a552a3c16a8e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c56e4e6111074a75820abdce355adb39": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8572225a06a14ca8965a8039b2298070", + "max": 239, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_29e72b30d40644b4afdd19b137f8952d", + "value": 239 + } + }, + "ca78d9dbdc854ad7b45d8e0de3cf2d9f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d72f34d844b542b0a4e1ec0264880cab": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da56089370b6403fa52b9787b84ad86d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dcd04fa06ba744b697e172fa8365b009": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ded495ef523c49779063258bcec83c4e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e0a9517ffb70428cba4d8b8749603444": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ca78d9dbdc854ad7b45d8e0de3cf2d9f", + "placeholder": "​", + "style": "IPY_MODEL_ec26975de7f3493795c3cdf5a471a59d", + "value": " 293/293 [00:00<00:00, 15.7kB/s]" + } + }, + "e542527c11944d088846505d08c52806": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8caea9c1009646e9839e9e410f1006b8", + "placeholder": "​", + "style": "IPY_MODEL_afe9cc266e03429b84d094ab1cb29a97", + "value": "(…)class/resolve/main/tokenizer_config.json: 100%" + } + }, + "ec26975de7f3493795c3cdf5a471a59d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ef98c16f3cf84728b593cc1be081c9b2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_985ccb56f8df409d97836c83c7f57e44", + "IPY_MODEL_050dbc230ffa47e1a8b293f622b4ea57", + "IPY_MODEL_1898befd7f36447ea5194e2c68d00c31" ], - "source": [ - "document_assembler = DocumentAssembler() \\\n", - " .setInputCol('text') \\\n", - " .setOutputCol('document')\n", - "\n", - "tokenizer = Tokenizer() \\\n", - " .setInputCols(['document']) \\\n", - " .setOutputCol('token')\n", - "\n", - "pipeline = Pipeline(stages=[\n", - " document_assembler,\n", - " tokenizer,\n", - " tokenClassifier_loaded\n", - "])\n", - "\n", - "# couple of simple examples\n", - "example = spark.createDataFrame([[\"My name is Clara and I live in Berkeley, California.\"], ['My name is Clara and I live in Berkeley, California.']]).toDF(\"text\")\n", - "\n", - "result = pipeline.fit(example).transform(example)\n", - "\n", - "# result is a DataFrame\n", - "result.select(\"text\", \"ner.result\").show()" - ] + "layout": "IPY_MODEL_a56652b29f014151ad02630853888abe" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "U_RooKYek8OB" - }, - "source": [ - "That's it! You can now go wild and use hundreds of `RoBertaForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "00d4770b7983470192967410038d0068": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c33367067b5c41529e4cb8301bb4631b", - "IPY_MODEL_f56039a6fb3f4dc7913ea06536e476c3", - "IPY_MODEL_f4f066292c894698a145d97645ef0852" - ], - "layout": "IPY_MODEL_74cda4b89a124b009c187cb98a04899d" - } - }, - "025eda03fbad4dd18d7dae72aedd0106": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "050dbc230ffa47e1a8b293f622b4ea57": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fb53f3bf55664c4e9aa685809d9b550f", - "max": 326181207, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_7d587ac5d3ee4a89a99bc5c0b8044669", - "value": 326181207 - } - }, - "0993a78aca3348468b8615d096466b80": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0b89fef36cfa4301a27a58e6a1dec354": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0fc0a55a8d234a17a7d725a93c45fd50": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "12eee2449390429192df0e0394598062": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "1383a4cde8674b039c59c15a63901461": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0fc0a55a8d234a17a7d725a93c45fd50", - "placeholder": "​", - "style": "IPY_MODEL_b7201dc4f9584e1c97488425a766c4c6", - "value": "(…)2003-3-class/resolve/main/tokenizer.json: 100%" - } - }, - "166671c87f7d48feafb05bb58c739600": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_1383a4cde8674b039c59c15a63901461", - "IPY_MODEL_3de9ee6582f1423598931cea294c532c", - "IPY_MODEL_ac0bec7637084a0e8e51231de626f69e" - ], - "layout": "IPY_MODEL_d72f34d844b542b0a4e1ec0264880cab" - } - }, - "1689463b2a3d4b39bb427733c160287a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "1898befd7f36447ea5194e2c68d00c31": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_da56089370b6403fa52b9787b84ad86d", - "placeholder": "​", - "style": "IPY_MODEL_b8ed253331fe4d4e9b7a10dd282ea172", - "value": " 326M/326M [00:06<00:00, 37.1MB/s]" - } - }, - "191f55fc572b4f5a9b41e0c0dbd20414": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2db07d4ad6ff49b5b5ce76ea60c655fe", - "placeholder": "​", - "style": "IPY_MODEL_528de7c76ae84ccfb4614faddf133cfb", - "value": " 962/962 [00:00<00:00, 26.2kB/s]" - } - }, - "19403261179149178f0b54c0a125f198": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c1ac31ed4ded444586913047df105d63", - "IPY_MODEL_1ccb91d2654d47d7aa883c016a8b4e49", - "IPY_MODEL_191f55fc572b4f5a9b41e0c0dbd20414" - ], - "layout": "IPY_MODEL_595ee009a3604de7a1d1c12e127b8f85" - } - }, - "1ccb91d2654d47d7aa883c016a8b4e49": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0b89fef36cfa4301a27a58e6a1dec354", - "max": 962, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_380ca1c370174ba58ec9b669a4e4a2ff", - "value": 962 - } - }, - "2094ab4f61fc4dbbb2a45b8cb10d696f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "29e72b30d40644b4afdd19b137f8952d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "2b8a0ac51adf4cd9b94deb879084696f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2db07d4ad6ff49b5b5ce76ea60c655fe": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2f3625a6b69e4b28a0180d769ef3eafb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2f5282ba4afc45d9b43b18ccd50cd984": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_389699b0b0404174be6e092dae71a8e8", - "IPY_MODEL_3112e06bfd6d41408438c93cddcd306a", - "IPY_MODEL_3b4f2c964e5d42428b9ce9db06be885c" - ], - "layout": "IPY_MODEL_624dde2b14de4f87b969772e7792666c" - } - }, - "3112e06bfd6d41408438c93cddcd306a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2f3625a6b69e4b28a0180d769ef3eafb", - "max": 798293, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_84fd9d1f1821408f9adbbba4d3b1ff04", - "value": 798293 - } - }, - "33f7e429f91b4e359fd05bbef32e5a46": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "380ca1c370174ba58ec9b669a4e4a2ff": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "389699b0b0404174be6e092dae71a8e8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_59b60b985b14488f817d2693785d966f", - "placeholder": "​", - "style": "IPY_MODEL_f582ccbb662e4a9bbf3699c7f69a56d5", - "value": "(…)onll2003-3-class/resolve/main/vocab.json: 100%" - } - }, - "3b4f2c964e5d42428b9ce9db06be885c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f86b586715894e38abc7196a303f54e7", - "placeholder": "​", - "style": "IPY_MODEL_025eda03fbad4dd18d7dae72aedd0106", - "value": " 798k/798k [00:00<00:00, 15.4MB/s]" - } - }, - "3de9ee6582f1423598931cea294c532c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f6daf5c19e5e48e193af8b70e61bc1a3", - "max": 1355931, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1689463b2a3d4b39bb427733c160287a", - "value": 1355931 - } - }, - "4c7a6185b4e549ae82bbd69c5951b89d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ded495ef523c49779063258bcec83c4e", - "placeholder": "​", - "style": "IPY_MODEL_9daae647890b4527bf66caccc15afff8", - "value": " 239/239 [00:00<00:00, 11.6kB/s]" - } - }, - "4dd732d7405d4c8bac2b1d4297ec8088": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "528de7c76ae84ccfb4614faddf133cfb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "595ee009a3604de7a1d1c12e127b8f85": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "59b60b985b14488f817d2693785d966f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "624dde2b14de4f87b969772e7792666c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "74aa18fd09fb43f099973ef2c77f1fea": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "74cda4b89a124b009c187cb98a04899d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7d587ac5d3ee4a89a99bc5c0b8044669": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "7d9fe46aa745480e9947350f443be964": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "82cac82421da40a4bb21aacad13aef90": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "84fd9d1f1821408f9adbbba4d3b1ff04": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "8572225a06a14ca8965a8039b2298070": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "885a765e32834db28e6a6aa47a853a8f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "8caea9c1009646e9839e9e410f1006b8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "985ccb56f8df409d97836c83c7f57e44": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0993a78aca3348468b8615d096466b80", - "placeholder": "​", - "style": "IPY_MODEL_33f7e429f91b4e359fd05bbef32e5a46", - "value": "pytorch_model.bin: 100%" - } - }, - "9b31a5e52daf472b8900efa4b0c396b1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b006c4b472c341ab9fe9781a79c78c0a", - "max": 293, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_12eee2449390429192df0e0394598062", - "value": 293 - } - }, - "9ba0de8569964cb6a092f0359711d28e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9daae647890b4527bf66caccc15afff8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a4d075edad9243b5af25945e727e011f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a4fc8b5fe2a643e384545787ac7f0f98": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e542527c11944d088846505d08c52806", - "IPY_MODEL_9b31a5e52daf472b8900efa4b0c396b1", - "IPY_MODEL_e0a9517ffb70428cba4d8b8749603444" - ], - "layout": "IPY_MODEL_af0ef342d5b14b86b18bfe3dba1c6b9f" - } - }, - "a56652b29f014151ad02630853888abe": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ac0bec7637084a0e8e51231de626f69e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2b8a0ac51adf4cd9b94deb879084696f", - "placeholder": "​", - "style": "IPY_MODEL_7d9fe46aa745480e9947350f443be964", - "value": " 1.36M/1.36M [00:00<00:00, 26.4MB/s]" - } - }, - "af0ef342d5b14b86b18bfe3dba1c6b9f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "afe9cc266e03429b84d094ab1cb29a97": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b006c4b472c341ab9fe9781a79c78c0a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b0491ab858894635bcaae3b307db5d71": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b7201dc4f9584e1c97488425a766c4c6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b8ed253331fe4d4e9b7a10dd282ea172": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c1ac31ed4ded444586913047df105d63": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_dcd04fa06ba744b697e172fa8365b009", - "placeholder": "​", - "style": "IPY_MODEL_a4d075edad9243b5af25945e727e011f", - "value": "(…)nll2003-3-class/resolve/main/config.json: 100%" - } - }, - "c33367067b5c41529e4cb8301bb4631b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9ba0de8569964cb6a092f0359711d28e", - "placeholder": "​", - "style": "IPY_MODEL_4dd732d7405d4c8bac2b1d4297ec8088", - "value": "(…)onll2003-3-class/resolve/main/merges.txt: 100%" - } - }, - "c3a63933131d4d1788b3a552a3c16a8e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c56e4e6111074a75820abdce355adb39": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8572225a06a14ca8965a8039b2298070", - "max": 239, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_29e72b30d40644b4afdd19b137f8952d", - "value": 239 - } - }, - "ca78d9dbdc854ad7b45d8e0de3cf2d9f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d72f34d844b542b0a4e1ec0264880cab": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "da56089370b6403fa52b9787b84ad86d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "dcd04fa06ba744b697e172fa8365b009": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ded495ef523c49779063258bcec83c4e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e0a9517ffb70428cba4d8b8749603444": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ca78d9dbdc854ad7b45d8e0de3cf2d9f", - "placeholder": "​", - "style": "IPY_MODEL_ec26975de7f3493795c3cdf5a471a59d", - "value": " 293/293 [00:00<00:00, 15.7kB/s]" - } - }, - "e542527c11944d088846505d08c52806": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8caea9c1009646e9839e9e410f1006b8", - "placeholder": "​", - "style": "IPY_MODEL_afe9cc266e03429b84d094ab1cb29a97", - "value": "(…)class/resolve/main/tokenizer_config.json: 100%" - } - }, - "ec26975de7f3493795c3cdf5a471a59d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ef98c16f3cf84728b593cc1be081c9b2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_985ccb56f8df409d97836c83c7f57e44", - "IPY_MODEL_050dbc230ffa47e1a8b293f622b4ea57", - "IPY_MODEL_1898befd7f36447ea5194e2c68d00c31" - ], - "layout": "IPY_MODEL_a56652b29f014151ad02630853888abe" - } - }, - "f3774c5dc43b44d69c256f259cb22a8c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_ff652925a7bd40bc9939be17cacd818e", - "IPY_MODEL_c56e4e6111074a75820abdce355adb39", - "IPY_MODEL_4c7a6185b4e549ae82bbd69c5951b89d" - ], - "layout": "IPY_MODEL_ff0eedc4f66f4fc9894f22a83245c55f" - } - }, - "f4f066292c894698a145d97645ef0852": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c3a63933131d4d1788b3a552a3c16a8e", - "placeholder": "​", - "style": "IPY_MODEL_74aa18fd09fb43f099973ef2c77f1fea", - "value": " 456k/456k [00:00<00:00, 6.49MB/s]" - } - }, - "f56039a6fb3f4dc7913ea06536e476c3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b0491ab858894635bcaae3b307db5d71", - "max": 456356, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_885a765e32834db28e6a6aa47a853a8f", - "value": 456356 - } - }, - "f582ccbb662e4a9bbf3699c7f69a56d5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f6daf5c19e5e48e193af8b70e61bc1a3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f86b586715894e38abc7196a303f54e7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fb53f3bf55664c4e9aa685809d9b550f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ff0eedc4f66f4fc9894f22a83245c55f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ff652925a7bd40bc9939be17cacd818e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_82cac82421da40a4bb21aacad13aef90", - "placeholder": "​", - "style": "IPY_MODEL_2094ab4f61fc4dbbb2a45b8cb10d696f", - "value": "(…)ass/resolve/main/special_tokens_map.json: 100%" - } - } - } + "f3774c5dc43b44d69c256f259cb22a8c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ff652925a7bd40bc9939be17cacd818e", + "IPY_MODEL_c56e4e6111074a75820abdce355adb39", + "IPY_MODEL_4c7a6185b4e549ae82bbd69c5951b89d" + ], + "layout": "IPY_MODEL_ff0eedc4f66f4fc9894f22a83245c55f" + } + }, + "f4f066292c894698a145d97645ef0852": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c3a63933131d4d1788b3a552a3c16a8e", + "placeholder": "​", + "style": "IPY_MODEL_74aa18fd09fb43f099973ef2c77f1fea", + "value": " 456k/456k [00:00<00:00, 6.49MB/s]" + } + }, + "f56039a6fb3f4dc7913ea06536e476c3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0491ab858894635bcaae3b307db5d71", + "max": 456356, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_885a765e32834db28e6a6aa47a853a8f", + "value": 456356 + } + }, + "f582ccbb662e4a9bbf3699c7f69a56d5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f6daf5c19e5e48e193af8b70e61bc1a3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f86b586715894e38abc7196a303f54e7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fb53f3bf55664c4e9aa685809d9b550f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ff0eedc4f66f4fc9894f22a83245c55f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ff652925a7bd40bc9939be17cacd818e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_82cac82421da40a4bb21aacad13aef90", + "placeholder": "​", + "style": "IPY_MODEL_2094ab4f61fc4dbbb2a45b8cb10d696f", + "value": "(…)ass/resolve/main/special_tokens_map.json: 100%" + } } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb index 8a7518f56631df..f79307a780625f 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb @@ -67,7 +67,7 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.35.2 optimum sentencepiece" + "!pip install -q --upgrade transformers[onnx]==4.35.2 optimum sentencepiece onnx==1.14.0" ] }, { diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb index 262cad47cf6a91..e1d5f5596c944d 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb @@ -1,553 +1,710 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb)\n", - "\n", - "# Import ONNX Whisper models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", - "- The Whisper model was introduced in `Spark NLP 5.1.0 and requires Spark version 3.4.1 and up.`\n", - "- Official models are supported, but not all custom models may work." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Export and Save HuggingFace model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", - "- We lock `transformers` on version `4.31.0`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m364.2/364.2 kB\u001b[0m \u001b[31m25.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m21.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m32.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m33.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m17.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m28.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.3/519.3 kB\u001b[0m \u001b[31m21.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m39.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m32.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m10.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h" - ] - } - ], - "source": [ - "!pip install -q --upgrade \"transformers[onnx]==4.31.0\" optimum tensorflow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", - "- We'll use the [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model from HuggingFace as an example and export it with the `optimum-cli`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_NAME = \"openai/whisper-tiny\"\n", - "EXPORT_PATH = f\"export_onnx/{MODEL_NAME}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "hEdJynTH3L0x" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb)\n", + "\n", + "# Import ONNX Whisper models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- The Whisper model was introduced in `Spark NLP 5.1.0 and requires Spark version 3.4.1 and up.`\n", + "- Official models are supported, but not all custom models may work." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "2023-08-14 13:53:19.500633: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "Framework not specified. Using pt to export to ONNX.\n", - "Downloading (…)lve/main/config.json: 100% 1.98k/1.98k [00:00<00:00, 3.50MB/s]\n", - "Downloading model.safetensors: 100% 151M/151M [00:05<00:00, 28.6MB/s]\n", - "Downloading (…)neration_config.json: 100% 3.72k/3.72k [00:00<00:00, 17.7MB/s]\n", - "Automatic task detection to automatic-speech-recognition-with-past (possible synonyms are: speech2seq-lm-with-past).\n", - "Downloading (…)okenizer_config.json: 100% 841/841 [00:00<00:00, 2.66MB/s]\n", - "Downloading (…)olve/main/vocab.json: 100% 1.04M/1.04M [00:00<00:00, 10.8MB/s]\n", - "Downloading (…)/main/tokenizer.json: 100% 2.20M/2.20M [00:00<00:00, 25.4MB/s]\n", - "Downloading (…)olve/main/merges.txt: 100% 494k/494k [00:00<00:00, 18.1MB/s]\n", - "Downloading (…)main/normalizer.json: 100% 52.7k/52.7k [00:00<00:00, 73.6MB/s]\n", - "Downloading (…)in/added_tokens.json: 100% 2.08k/2.08k [00:00<00:00, 7.43MB/s]\n", - "Downloading (…)cial_tokens_map.json: 100% 2.08k/2.08k [00:00<00:00, 7.93MB/s]\n", - "Downloading (…)rocessor_config.json: 100% 185k/185k [00:00<00:00, 7.36MB/s]\n", - "Using framework PyTorch: 2.0.1+cu118\n", - "Overriding 1 configuration item(s)\n", - "\t- use_cache -> False\n", - "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:410: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):\n", - "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:449: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):\n", - "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", - "verbose: False, log level: Level.ERROR\n", - "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", - "\n", - "Using framework PyTorch: 2.0.1+cu118\n", - "Overriding 1 configuration item(s)\n", - "\t- use_cache -> True\n", - "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:1004: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if input_shape[-1] > 1:\n", - "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:417: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if attention_mask.size() != (bsz, 1, tgt_len, src_len):\n", - "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", - "verbose: False, log level: Level.ERROR\n", - "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", - "\n", - "Using framework PyTorch: 2.0.1+cu118\n", - "Overriding 1 configuration item(s)\n", - "\t- use_cache -> True\n", - "Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `decoder_input_ids`.\n", - "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:372: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " and past_key_value[0].shape[2] == key_value_states.shape[1]\n", - "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", - "verbose: False, log level: Level.ERROR\n", - "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", - "\n", - "Post-processing the exported models...\n", - "The two models proto have different outputs (17 and 9 outputs). Constant outputs will be added to unify the two models outputs.\n", - "Addind a constant output for present.0.encoder.key of shape [0, 6, 1, 64] in model2.\n", - "Addind a constant output for present.0.encoder.value of shape [0, 6, 1, 64] in model2.\n", - "Addind a constant output for present.1.encoder.key of shape [0, 6, 1, 64] in model2.\n", - "Addind a constant output for present.1.encoder.value of shape [0, 6, 1, 64] in model2.\n", - "Addind a constant output for present.2.encoder.key of shape [0, 6, 1, 64] in model2.\n", - "Addind a constant output for present.2.encoder.value of shape [0, 6, 1, 64] in model2.\n", - "Addind a constant output for present.3.encoder.key of shape [0, 6, 1, 64] in model2.\n", - "Addind a constant output for present.3.encoder.value of shape [0, 6, 1, 64] in model2.\n", - "Validating models in subprocesses...\n", - "2023-08-14 13:53:53.825862: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "Validating ONNX model export_onnx/openai/whisper-tiny/encoder_model.onnx...\n", - "\t-[✓] ONNX model output names match reference model (last_hidden_state)\n", - "\t- Validating ONNX Model output \"last_hidden_state\":\n", - "\t\t-[✓] (2, 1500, 384) matches (2, 1500, 384)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "2023-08-14 13:54:09.277640: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "Validating ONNX model export_onnx/openai/whisper-tiny/decoder_model_merged.onnx...\n", - "2023-08-14 13:54:11.873629973 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_4_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:11.873793574 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_16_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:11.892860734 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:11.892952656 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_1_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:11.893552082 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_10_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:11.893587646 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:11.893601631 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_12_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:11.893631102 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_9_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:11.893696170 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_11_output_0'. It is not used by any node and should be removed from the model.\n", - "\t-[✓] ONNX model output names match reference model (present.1.decoder.key, present.2.encoder.value, present.3.encoder.key, present.0.decoder.value, present.1.decoder.value, present.0.encoder.key, present.2.decoder.key, present.3.decoder.value, present.2.encoder.key, present.2.decoder.value, present.1.encoder.key, present.3.encoder.value, present.1.encoder.value, present.0.decoder.key, present.3.decoder.key, logits, present.0.encoder.value)\n", - "\t- Validating ONNX Model output \"logits\":\n", - "\t\t-[✓] (2, 16, 51865) matches (2, 16, 51865)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.0.decoder.key\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.0.decoder.value\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.0.encoder.key\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.0.encoder.value\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.1.decoder.key\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.1.decoder.value\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.1.encoder.key\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.1.encoder.value\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.2.decoder.key\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.2.decoder.value\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.2.encoder.key\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.2.encoder.value\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.3.decoder.key\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.3.decoder.value\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.3.encoder.key\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.3.encoder.value\":\n", - "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "2023-08-14 13:54:20.179734: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "Validating ONNX model export_onnx/openai/whisper-tiny/decoder_model_merged.onnx...\n", - "Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `decoder_input_ids`.\n", - "2023-08-14 13:54:23.118265457 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_4_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:23.118402025 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_16_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:23.134562875 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:23.134629569 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_1_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:23.135051085 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_10_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:23.135074933 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:23.135088187 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_12_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:23.135109430 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_9_output_0'. It is not used by any node and should be removed from the model.\n", - "2023-08-14 13:54:23.135158578 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_11_output_0'. It is not used by any node and should be removed from the model.\n", - "\t-[✓] ONNX model output names match reference model (present.0.decoder.key, present.3.decoder.key, present.1.decoder.key, present.1.decoder.value, logits, present.3.decoder.value, present.2.decoder.value, present.2.decoder.key, present.0.decoder.value)\n", - "\t- Validating ONNX Model output \"logits\":\n", - "\t\t-[✓] (2, 1, 51865) matches (2, 1, 51865)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.0.decoder.key\":\n", - "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.0.decoder.value\":\n", - "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.1.decoder.key\":\n", - "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.1.decoder.value\":\n", - "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.2.decoder.key\":\n", - "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.2.decoder.value\":\n", - "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.3.decoder.key\":\n", - "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "\t- Validating ONNX Model output \"present.3.decoder.value\":\n", - "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", - "\t\t-[✓] all values close (atol: 0.001)\n", - "The ONNX export succeeded and the exported model was saved at: export_onnx/openai/whisper-tiny\n" - ] - } - ], - "source": [ - "! optimum-cli export onnx --model {MODEL_NAME} {EXPORT_PATH}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have to move additional model assets into a seperate folder, so that Spark NLP can load it properly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! mkdir -p {EXPORT_PATH}/assets\n", - "! mv -t {EXPORT_PATH}/assets {EXPORT_PATH}/*.json {EXPORT_PATH}/*.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's have a look inside these two directories and see what we are dealing with:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "DfiBPTV83L0y" + }, + "source": [ + "## Export and Save HuggingFace model" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 607868\n", - "drwxr-xr-x 2 root root 4096 Aug 14 13:55 assets\n", - "-rw-r--r-- 1 root root 198197526 Aug 14 13:53 decoder_model_merged.onnx\n", - "-rw-r--r-- 1 root root 198049530 Aug 14 13:53 decoder_model.onnx\n", - "-rw-r--r-- 1 root root 193295315 Aug 14 13:53 decoder_with_past_model.onnx\n", - "-rw-r--r-- 1 root root 32900723 Aug 14 13:53 encoder_model.onnx\n" - ] - } - ], - "source": [ - "!ls -l {EXPORT_PATH}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "IhUUhv8h3L0z" + }, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.31.0`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 3728\n", - "-rw-r--r-- 1 root root 2082 Aug 14 13:53 added_tokens.json\n", - "-rw-r--r-- 1 root root 2243 Aug 14 13:53 config.json\n", - "-rw-r--r-- 1 root root 3711 Aug 14 13:53 generation_config.json\n", - "-rw-r--r-- 1 root root 493864 Aug 14 13:53 merges.txt\n", - "-rw-r--r-- 1 root root 52666 Aug 14 13:53 normalizer.json\n", - "-rw-r--r-- 1 root root 339 Aug 14 13:53 preprocessor_config.json\n", - "-rw-r--r-- 1 root root 2077 Aug 14 13:53 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 835 Aug 14 13:53 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 2203267 Aug 14 13:53 tokenizer.json\n", - "-rw-r--r-- 1 root root 1036584 Aug 14 13:53 vocab.json\n" - ] - } - ], - "source": [ - "!ls -l {EXPORT_PATH}/assets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import and Save Whisper in Spark NLP\n", - "\n", - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script\n", - "- Additionally, we need to upgrade Spark to version 3.4.1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash\n", - "! pip install -U pyspark==3.4.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sparknlp\n", - "\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Let's use `loadSavedModel` functon in `WhisperForCTC` which allows us to load the ONNX model\n", - "- Most params will be set automatically. They can also be set later after loading the model in `WhisperForCTC` during runtime, so don't worry about setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "\n", - "# All these params should be identical to the original ONNX model\n", - "whisper = (\n", - " WhisperForCTC.loadSavedModel(f\"{EXPORT_PATH}\", spark)\n", - " .setInputCols(\"audio_assembler\")\n", - " .setOutputCol(\"text\")\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "whisper.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's clean up stuff we don't need anymore" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!rm -rf {EXPORT_PATH}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your ONNX Whisper model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "sUIG8ym_3ZdY" + } + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 414388\n", - "-rw-r--r-- 1 root root 198079914 Aug 14 14:03 decoder_model_whisper_ctc\n", - "-rw-r--r-- 1 root root 193324994 Aug 14 14:03 decoder_with_past_model_whisper_ctc\n", - "-rw-r--r-- 1 root root 32905912 Aug 14 14:03 encoder_model_whisper_ctc\n", - "drwxr-xr-x 6 root root 4096 Aug 14 14:03 fields\n", - "drwxr-xr-x 2 root root 4096 Aug 14 14:03 metadata\n" - ] - } - ], - "source": [ - "! ls -l {MODEL_NAME}_spark_nlp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny Whisper model 😊" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "yy9Ig4tY3L0z", + "outputId": "648a6286-3ad1-4656-f8fa-06d156549d41", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/7.4 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.3/7.4 MB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━\u001b[0m \u001b[32m5.2/7.4 MB\u001b[0m \u001b[31m75.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m86.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m56.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.0/410.0 kB\u001b[0m \u001b[31m35.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m589.8/589.8 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m55.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m91.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.8/455.8 kB\u001b[0m \u001b[31m37.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m105.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m24.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m45.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.3/5.3 MB\u001b[0m \u001b[31m78.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m79.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m98.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m66.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.2/311.2 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tf-keras 2.15.1 requires tensorflow<2.16,>=2.15, but you have tensorflow 2.16.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade \"transformers[onnx]==4.31.0\" optimum tensorflow onnx==1.14.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l_WSgW9w3L00" + }, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use the [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model from HuggingFace as an example and export it with the `optimum-cli`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "Ar3GeeF43L00" + }, + "outputs": [], + "source": [ + "MODEL_NAME = \"openai/whisper-tiny\"\n", + "EXPORT_PATH = f\"export_onnx/{MODEL_NAME}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "1F7dqTBe3L01", + "outputId": "9a9d903b-a829-4d9d-d2ed-48948e476e73", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2024-04-12 10:35:18.194732: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "Framework not specified. Using pt to export the model.\n", + "config.json: 100% 1.98k/1.98k [00:00<00:00, 7.28MB/s]\n", + "model.safetensors: 100% 151M/151M [00:00<00:00, 311MB/s]\n", + "generation_config.json: 100% 3.75k/3.75k [00:00<00:00, 17.5MB/s]\n", + "Automatic task detection to automatic-speech-recognition-with-past (possible synonyms are: speech2seq-lm-with-past).\n", + "tokenizer_config.json: 100% 283k/283k [00:00<00:00, 30.7MB/s]\n", + "vocab.json: 100% 836k/836k [00:00<00:00, 20.0MB/s]\n", + "tokenizer.json: 100% 2.48M/2.48M [00:00<00:00, 5.49MB/s]\n", + "merges.txt: 100% 494k/494k [00:00<00:00, 22.1MB/s]\n", + "normalizer.json: 100% 52.7k/52.7k [00:00<00:00, 115MB/s]\n", + "added_tokens.json: 100% 34.6k/34.6k [00:00<00:00, 79.4MB/s]\n", + "special_tokens_map.json: 100% 2.19k/2.19k [00:00<00:00, 8.36MB/s]\n", + "preprocessor_config.json: 100% 185k/185k [00:00<00:00, 41.6MB/s]\n", + "Using the export variant default. Available variants are:\n", + " - default: The default ONNX variant.\n", + "Using framework PyTorch: 2.2.1+cu121\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:410: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:449: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):\n", + "Using framework PyTorch: 2.2.1+cu121\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:1004: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if input_shape[-1] > 1:\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:417: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attention_mask.size() != (bsz, 1, tgt_len, src_len):\n", + "Using framework PyTorch: 2.2.1+cu121\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:372: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " and past_key_value[0].shape[2] == key_value_states.shape[1]\n", + "Post-processing the exported models...\n", + "Weight deduplication check in the ONNX export requires accelerate. Please install accelerate to run it.\n", + "The two models proto have different outputs (17 and 9 outputs). Constant outputs will be added to unify the two models outputs.\n", + "Adding a constant output for present.0.encoder.key of shape [0, 6, 1, 64] in model2.\n", + "Adding a constant output for present.0.encoder.value of shape [0, 6, 1, 64] in model2.\n", + "Adding a constant output for present.1.encoder.key of shape [0, 6, 1, 64] in model2.\n", + "Adding a constant output for present.1.encoder.value of shape [0, 6, 1, 64] in model2.\n", + "Adding a constant output for present.2.encoder.key of shape [0, 6, 1, 64] in model2.\n", + "Adding a constant output for present.2.encoder.value of shape [0, 6, 1, 64] in model2.\n", + "Adding a constant output for present.3.encoder.key of shape [0, 6, 1, 64] in model2.\n", + "Adding a constant output for present.3.encoder.value of shape [0, 6, 1, 64] in model2.\n", + "Validating ONNX model export_onnx/openai/whisper-tiny/encoder_model.onnx...\n", + "\t-[✓] ONNX model output names match reference model (last_hidden_state)\n", + "\t- Validating ONNX Model output \"last_hidden_state\":\n", + "\t\t-[✓] (2, 1500, 384) matches (2, 1500, 384)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "Validating ONNX model export_onnx/openai/whisper-tiny/decoder_model_merged.onnx...\n", + "\u001b[0;93m2024-04-12 10:35:46.461625726 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_4_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.461718257 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_23_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478143231 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_1_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478201698 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478215371 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_192'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478645008 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_11_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478663150 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_211'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478686496 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_12_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478693914 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_14_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478705872 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_2_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478714456 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_13_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478728444 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_180'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:46.478740137 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_10_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\t-[✓] ONNX model output names match reference model (present.2.decoder.key, present.3.encoder.key, present.0.encoder.key, present.1.decoder.key, present.2.decoder.value, present.1.encoder.key, present.3.decoder.key, present.0.decoder.key, logits, present.2.encoder.key, present.3.decoder.value, present.2.encoder.value, present.1.encoder.value, present.0.decoder.value, present.1.decoder.value, present.3.encoder.value, present.0.encoder.value)\n", + "\t- Validating ONNX Model output \"logits\":\n", + "\t\t-[✓] (2, 16, 51865) matches (2, 16, 51865)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.0.decoder.key\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.0.decoder.value\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.0.encoder.key\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.0.encoder.value\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.1.decoder.key\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.1.decoder.value\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.1.encoder.key\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.1.encoder.value\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.2.decoder.key\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.2.decoder.value\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.2.encoder.key\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.2.encoder.value\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.3.decoder.key\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.3.decoder.value\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.3.encoder.key\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.3.encoder.value\":\n", + "\t\t-[✓] (2, 6, 16, 64) matches (2, 6, 16, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "Validating ONNX model export_onnx/openai/whisper-tiny/decoder_model_merged.onnx...\n", + "\u001b[0;93m2024-04-12 10:35:47.393181620 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_4_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.393302485 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_23_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.411905010 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_1_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.411952338 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.411978486 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_192'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.412527152 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_11_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.412551044 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_211'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.412584655 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_12_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.412597581 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_14_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.412615576 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_2_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.412631568 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_13_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.412656271 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_180'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\u001b[0;93m2024-04-12 10:35:47.412676295 [W:onnxruntime:, graph.cc:3593 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_10_output_0'. It is not used by any node and should be removed from the model.\u001b[m\n", + "\t-[✓] ONNX model output names match reference model (present.2.decoder.key, present.1.decoder.key, present.2.decoder.value, present.3.decoder.key, present.3.decoder.value, present.0.decoder.key, logits, present.0.decoder.value, present.1.decoder.value)\n", + "\t- Validating ONNX Model output \"logits\":\n", + "\t\t-[✓] (2, 1, 51865) matches (2, 1, 51865)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.0.decoder.key\":\n", + "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.0.decoder.value\":\n", + "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.1.decoder.key\":\n", + "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.1.decoder.value\":\n", + "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.2.decoder.key\":\n", + "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.2.decoder.value\":\n", + "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.3.decoder.key\":\n", + "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "\t- Validating ONNX Model output \"present.3.decoder.value\":\n", + "\t\t-[✓] (2, 6, 17, 64) matches (2, 6, 17, 64)\n", + "\t\t-[✓] all values close (atol: 0.001)\n", + "The ONNX export succeeded and the exported model was saved at: export_onnx/openai/whisper-tiny\n" + ] + } + ], + "source": [ + "! optimum-cli export onnx --model {MODEL_NAME} {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_jrTPqhE3L01" + }, + "source": [ + "We have to move additional model assets into a seperate folder, so that Spark NLP can load it properly." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "CyHyF5Pr3L02" + }, + "outputs": [], + "source": [ + "! mkdir -p {EXPORT_PATH}/assets\n", + "! mv -t {EXPORT_PATH}/assets {EXPORT_PATH}/*.json {EXPORT_PATH}/*.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vqXo5KCK3L02" + }, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "qFXX_acJ3L03", + "outputId": "e82587f4-e280-4288-de3a-087d8c1f7aaa", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "total 607916\n", + "drwxr-xr-x 2 root root 4096 Apr 12 10:35 assets\n", + "-rw-r--r-- 1 root root 198217996 Apr 12 10:35 decoder_model_merged.onnx\n", + "-rw-r--r-- 1 root root 198061779 Apr 12 10:35 decoder_model.onnx\n", + "-rw-r--r-- 1 root root 193303540 Apr 12 10:35 decoder_with_past_model.onnx\n", + "-rw-r--r-- 1 root root 32904958 Apr 12 10:35 encoder_model.onnx\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "-lbCcSP13L03", + "outputId": "9b37ce06-a335-4c82-8899-55fb9197815a", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "total 4308\n", + "-rw-r--r-- 1 root root 34604 Apr 12 10:35 added_tokens.json\n", + "-rw-r--r-- 1 root root 2243 Apr 12 10:35 config.json\n", + "-rw-r--r-- 1 root root 3742 Apr 12 10:35 generation_config.json\n", + "-rw-r--r-- 1 root root 493869 Apr 12 10:35 merges.txt\n", + "-rw-r--r-- 1 root root 52666 Apr 12 10:35 normalizer.json\n", + "-rw-r--r-- 1 root root 339 Apr 12 10:35 preprocessor_config.json\n", + "-rw-r--r-- 1 root root 2194 Apr 12 10:35 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 283277 Apr 12 10:35 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2480466 Apr 12 10:35 tokenizer.json\n", + "-rw-r--r-- 1 root root 1036584 Apr 12 10:35 vocab.json\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Q63ic12h3L04" + }, + "source": [ + "## Import and Save Whisper in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script\n", + "- Additionally, we need to upgrade Spark to version 3.4.1." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "ZKZ_tizZ3L04", + "outputId": "2d7801ea-99fd-44ac-a963-e98f8feb0c06", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.3.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.3.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m568.4/568.4 kB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m22.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting pyspark==3.4.1\n", + " Downloading pyspark-3.4.1.tar.gz (310.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.8/310.8 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting py4j==0.10.9.7 (from pyspark==3.4.1)\n", + " Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m200.5/200.5 kB\u001b[0m \u001b[31m23.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hBuilding wheels for collected packages: pyspark\n", + " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=35520bb723dd6a52ac228a8c249191033e27475dc70be0af064dde9b1b780d3c\n", + " Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834\n", + "Successfully built pyspark\n", + "Installing collected packages: py4j, pyspark\n", + " Attempting uninstall: py4j\n", + " Found existing installation: py4j 0.10.9.5\n", + " Uninstalling py4j-0.10.9.5:\n", + " Successfully uninstalled py4j-0.10.9.5\n", + " Attempting uninstall: pyspark\n", + " Found existing installation: pyspark 3.2.3\n", + " Uninstalling pyspark-3.2.3:\n", + " Successfully uninstalled pyspark-3.2.3\n", + "Successfully installed py4j-0.10.9.7 pyspark-3.4.1\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash\n", + "! pip install -U pyspark==3.4.1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4EiV3v3D3L05" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "HKzEZfQn3L05" + }, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------------------------------------------------------------------------------------+\n", - "|result |\n", - "+------------------------------------------------------------------------------------------+\n", - "|[ Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.]|\n", - "+------------------------------------------------------------------------------------------+\n", - "\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "8UCXtwOd3L05" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `WhisperForCTC` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `WhisperForCTC` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "fZNPXuQP3L05" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "whisper = (\n", + " WhisperForCTC.loadSavedModel(f\"{EXPORT_PATH}\", spark)\n", + " .setInputCols(\"audio_assembler\")\n", + " .setOutputCol(\"text\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5tSlzbOR3L06" + }, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "nkP_gWrt3L06" + }, + "outputs": [], + "source": [ + "whisper.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yKqHPm903L06" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "6Dfa7zDK3L06" + }, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5ecbVmq73L06" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX Whisper model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "WKxyiCOi3L07", + "outputId": "2eae5016-aa01-4da2-f6f9-574b8f4136fb", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "total 414404\n", + "-rw-r--r-- 1 root root 198092144 Apr 12 10:38 decoder_model\n", + "-rw-r--r-- 1 root root 193333200 Apr 12 10:38 decoder_with_past_model\n", + "-rw-r--r-- 1 root root 32910123 Apr 12 10:38 encoder_model\n", + "drwxr-xr-x 6 root root 4096 Apr 12 10:38 fields\n", + "drwxr-xr-x 2 root root 4096 Apr 12 10:38 metadata\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0VEQV_Cv3L07" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny Whisper model 😊" + ] + }, + { + "cell_type": "code", + "source": [ + "! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/audio/txt/librispeech_asr_0.txt" + ], + "metadata": { + "id": "KzAIXRki4kRQ", + "outputId": "c926a754-3bb1-4790-b3cf-ec10a93a0ebc", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2024-04-12 10:39:27-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/audio/txt/librispeech_asr_0.txt\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 2199992 (2.1M) [text/plain]\n", + "Saving to: ‘librispeech_asr_0.txt’\n", + "\n", + "\rlibrispeech_asr_0.t 0%[ ] 0 --.-KB/s \rlibrispeech_asr_0.t 100%[===================>] 2.10M --.-KB/s in 0.01s \n", + "\n", + "2024-04-12 10:39:27 (143 MB/s) - ‘librispeech_asr_0.txt’ saved [2199992/2199992]\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "L9hjHeKs3L07", + "outputId": "65c2a3cb-675f-4873-e786-3a644ffe0b88", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------------------------------------------------------------------------------------------+\n", + "|result |\n", + "+------------------------------------------------------------------------------------------+\n", + "|[ Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.]|\n", + "+------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "audioAssembler = AudioAssembler() \\\n", + " .setInputCol(\"audio_content\") \\\n", + " .setOutputCol(\"audio_assembler\")\n", + "\n", + "speechToText = WhisperForCTC.load(f\"{MODEL_NAME}_spark_nlp\")\n", + "\n", + "pipeline = Pipeline().setStages([audioAssembler, speechToText])\n", + "\n", + "audio_path = \"librispeech_asr_0.txt\"\n", + "with open(audio_path) as file:\n", + " raw_floats = [float(data) for data in file.read().strip().split(\"\\n\")]\n", + "\n", + "processedAudioFloats = spark.createDataFrame([[raw_floats]]).toDF(\"audio_content\")\n", + "\n", + "result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)\n", + "result.select(\"text.result\").show(truncate = False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s_uVMnSS3L07" + }, + "source": [ + "That's it! You can now go wild and use hundreds of Whisper models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" } - ], - "source": [ - "import sparknlp\n", - "from sparknlp.base import *\n", - "from sparknlp.annotator import *\n", - "from pyspark.ml import Pipeline\n", - "\n", - "audioAssembler = AudioAssembler() \\\n", - " .setInputCol(\"audio_content\") \\\n", - " .setOutputCol(\"audio_assembler\")\n", - "\n", - "speechToText = WhisperForCTC.load(f\"{MODEL_NAME}_spark_nlp\")\n", - "\n", - "pipeline = Pipeline().setStages([audioAssembler, speechToText])\n", - "\n", - "audio_path = \"../../../../src/test/resources/audio/txt/librispeech_asr_0.txt\"\n", - "with open(audio_path) as file:\n", - " raw_floats = [float(data) for data in file.read().strip().split(\"\\n\")]\n", - "\n", - "processedAudioFloats = spark.createDataFrame([[raw_floats]]).toDF(\"audio_content\")\n", - "\n", - "result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats)\n", - "result.select(\"text.result\").show(truncate = False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That's it! You can now go wild and use hundreds of Whisper models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XLM_RoBERTa.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XLM_RoBERTa.ipynb index d7c228a6d55336..4f87291c05ae41 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XLM_RoBERTa.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XLM_RoBERTa.ipynb @@ -1,2421 +1,2284 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XLM_RoBERTa.ipynb)\n", + "\n", + "# Import ONNX XLM-RoBERTa models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for XLM-RoBERTa from HuggingFace and they have to be in `Fill Mask` category. Meaning, you cannot use XLM-RoBERTa models trained/fine-tuned on a specific task such as token/sequence classification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "tGk3flXBkgA1" - }, - "source": [ - "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XLM-RoBERTa.ipynb)\n", - "\n", - "# Import ONNX XLM-RoBERTa models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", - "- You can import models for XLM-RoBERTa from HuggingFace and they have to be in `Fill Mask` category. Meaning, you cannot use XLM-RoBERTa models trained/fine-tuned on a specific task such as token/sequence classification." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m407.1/407.1 kB\u001b[0m \u001b[31m25.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m78.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.8/455.8 kB\u001b[0m \u001b[31m42.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m91.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m22.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m536.7/536.7 kB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m71.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m88.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m77.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m92.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.8/6.8 MB\u001b[0m \u001b[31m64.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m93.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m85.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m89.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m86.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m87.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m13.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m88.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m77.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m89.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m61.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m34.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m56.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m35.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m50.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m47.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m75.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m66.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m82.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m35.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m65.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m84.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m39.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m81.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m88.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m86.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.1/17.1 MB\u001b[0m \u001b[31m45.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m64.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m91.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m37.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m83.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m40.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m40.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m36.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m36.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m78.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m82.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m81.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m74.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m78.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [xlm-roberta-base](https://huggingface.co/xlm-roberta-base) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n", + "- In addition to the XLM-RoBERTa model, we also need to save the `XLMRobertaTokenizer`. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "xwUb8_YgkgA3" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ec1a9b2d5a994caea613dc851046eddf", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Export and Save HuggingFace model" + "text/plain": [ + "config.json: 0%| | 0.00/615 [00:00=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", - "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + "text/plain": [ + "model.safetensors: 0%| | 0.00/1.12G [00:00 False\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", - "\n", - "MODEL_NAME = \"xlm-roberta-base\"\n", - "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(EXPORT_PATH)\n", - "\n", - "# Create directory for assets and move the tokenizer files.\n", - "# A separate folder is needed for Spark NLP.\n", - "!mkdir {EXPORT_PATH}/assets\n", - "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets/" + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/25.0 [00:00 False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", + "\n", + "MODEL_NAME = \"xlm-roberta-base\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets\n", + "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1100748\n", + "drwxr-xr-x 2 root root 4096 Mar 1 02:25 assets\n", + "-rw-r--r-- 1 root root 679 Mar 1 02:25 config.json\n", + "-rw-r--r-- 1 root root 1110059085 Mar 1 02:25 model.onnx\n", + "-rw-r--r-- 1 root root 280 Mar 1 02:25 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 418 Mar 1 02:25 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082660 Mar 1 02:25 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 4952\n", + "-rw-r--r-- 1 root root 5069051 Mar 1 02:25 sentencepiece.bpe.model\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save XLM-RoBERTa in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.3.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.3.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m564.8/564.8 kB\u001b[0m \u001b[31m39.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m15.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `XlmRoBertaEmbeddings` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `XlmRoBertaEmbeddings` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- `setStorageRef` is very important. When you are training a task like NER or any Text Classification, we use this reference to bound the trained model to this specific embeddings so you won't load a different embeddings by mistake and see terrible results 😊\n", + "- It's up to you what you put in `setStorageRef` but it cannot be changed later on. We usually use the name of the model to be clear, but you can get creative if you want!\n", + "- The `dimension` param is is purely cosmetic and won't change anything. It's mostly for you to know later via `.getDimension` what is the dimension of your model. So set this accordingly.\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "xlm_roberta = XlmRoBertaEmbeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"xlm_roberta\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setDimension(768)\\\n", + " .setStorageRef('xlm_roberta_base')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xlm_roberta.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX XLM-RoBERTa model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1089168\n", + "drwxr-xr-x 2 root root 4096 Mar 1 02:28 metadata\n", + "-rw-r--r-- 1 root root 1110228614 Mar 1 02:30 xlmroberta_onnx\n", + "-rw-r--r-- 1 root root 5069051 Mar 1 02:30 xlmroberta_spp\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny XLM-RoBERTa model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "tokenizer = Tokenizer()\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"token\")\n", + "\n", + "xlm_roberta_loaded = XlmRoBertaEmbeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"xlm_roberta\")\\\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " document_assembler,\n", + " tokenizer,\n", + " xlm_roberta_loaded\n", + " ])\n", + "\n", + "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", + "model = pipeline.fit(data)\n", + "result = model.transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| embeddings|\n", + "+--------------------+\n", + "|[0.01781074, 0.16...|\n", + "|[-0.005121246, 0....|\n", + "|[0.00517074, 0.11...|\n", + "|[0.0065734405, 0....|\n", + "|[-0.028697606, 0....|\n", + "|[-0.0055652205, 0...|\n", + "|[-0.017623652, 0....|\n", + "|[-0.11884157, 0.0...|\n", + "|[-0.08074703, 0.1...|\n", + "|[-0.034696702, 0....|\n", + "|[-0.06809586, 0.1...|\n", + "|[-0.0508499, 0.07...|\n", + "|[-0.0065260027, 0...|\n", + "|[-0.029709894, 0....|\n", + "|[0.011362225, 0.2...|\n", + "|[0.044628896, 0.5...|\n", + "|[0.022999618, 0.2...|\n", + "|[0.017432231, 0.2...|\n", + "|[-0.024950821, 0....|\n", + "|[-0.031514782, 0....|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "result.selectExpr(\"explode(xlm_roberta.embeddings) as embeddings\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of XLM-RoBERTa models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "056d3ff42ddd43b2b20f5ec3cebaf38b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0c82e8ffd4b74655b03da94aef80ec04": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3db2b1532b4f47da9f7fa17bf38a7004", + "max": 25, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_57bd7a45c349469c88051275e5fb4087", + "value": 25 + } + }, + "13f83e1a9a9747d596ca20aeb6c344db": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1837891dac1a4f56aa35a984af00a197": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3be731e51a324504ae4e17c21d552d57", + "placeholder": "​", + "style": "IPY_MODEL_2993ed18ddf2429e885b32fb941f754d", + "value": " 615/615 [00:00<00:00, 32.3kB/s]" + } + }, + "1a305c881d794ebcb9567083faa633c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2045c150cb67451cb3dc40564147b21a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2093e4b98ee847b892caf6aeb5e6a733": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2045c150cb67451cb3dc40564147b21a", + "placeholder": "​", + "style": "IPY_MODEL_b8d9d288ab7f466eb6f48758e215828e", + "value": " 9.10M/9.10M [00:00<00:00, 28.9MB/s]" + } + }, + "21316bfe791349378b4dd59008d7ef94": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_55635d82f5624418a147cb9b19da84c1", + "IPY_MODEL_2c193771cf3a4ae2b2e2830733bbf0c1", + "IPY_MODEL_c57eceb2ca494a02946d9d8e6dc78506" ], - "source": [ - "!ls -l {EXPORT_PATH}/assets" - ] + "layout": "IPY_MODEL_bcf1015593684c5b8f71ea707a408f7d" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "zRN-_0wpkgA8" - }, - "source": [ - "## Import and Save XLM-RoBERTa in Spark NLP\n", - "\n", - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script" - ] + "264dd92cdff642c68d84d9ea6309b187": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "D0yO3TmwkgA8", - "outputId": "8aaaaf9c-50ae-480b-b384-2ef72fdb6506", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Installing PySpark 3.2.3 and Spark NLP 5.3.0\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 5.3.0\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m564.8/564.8 kB\u001b[0m \u001b[31m39.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m15.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } + "298e83eae1d748aa8049f6adba534ac8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5eddacbffc8740b7bc3a9c1b179ac4dd", + "IPY_MODEL_d52133a3ba584a24bf06c5cf1dbf42bc", + "IPY_MODEL_b5815c68e9754f1ea7161db566c52d8c" ], - "source": [ - "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] + "layout": "IPY_MODEL_8598b6cace044c34ac4d8b74f926da66" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "7slGi8nrkgA9" - }, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] + "2993ed18ddf2429e885b32fb941f754d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "Z_Pnd_W8kgA9" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()" - ] + "2c193771cf3a4ae2b2e2830733bbf0c1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_add4d5c9cbfa4a16bbc125986e9a56d1", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1a305c881d794ebcb9567083faa633c2", + "value": 5069051 + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "qfwqBV67kgA9" - }, - "source": [ - "- Let's use `loadSavedModel` functon in `XlmRoBertaEmbeddings` which allows us to load the ONNX model\n", - "- Most params will be set automatically. They can also be set later after loading the model in `XlmRoBertaEmbeddings` during runtime, so don't worry about setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- `setStorageRef` is very important. When you are training a task like NER or any Text Classification, we use this reference to bound the trained model to this specific embeddings so you won't load a different embeddings by mistake and see terrible results 😊\n", - "- It's up to you what you put in `setStorageRef` but it cannot be changed later on. We usually use the name of the model to be clear, but you can get creative if you want!\n", - "- The `dimension` param is is purely cosmetic and won't change anything. It's mostly for you to know later via `.getDimension` what is the dimension of your model. So set this accordingly.\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" - ] + "3609e88700234d4887116cffdb8a31cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e5d2ac11228d4d07ab7370748e7fdab0", + "IPY_MODEL_0c82e8ffd4b74655b03da94aef80ec04", + "IPY_MODEL_b491f6db46ab493fb264bb2ff3c21b11" + ], + "layout": "IPY_MODEL_4495fecb11d647f69f1d1bcc8da2c8ff" + } }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "8CqvC6sJkgA9" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "\n", - "# All these params should be identical to the original ONNX model\n", - "xlm_roberta = XlmRoBertaEmbeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"xlm_roberta\")\\\n", - " .setCaseSensitive(True)\\\n", - " .setDimension(768)\\\n", - " .setStorageRef('xlm_roberta_base')" - ] + "37c520248bb842818112488b6f915b8e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "8GWj_urkkgA9" - }, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] + "38432651440344299dd9d6d462d681b4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "ZmiaXQXKkgA-" - }, - "outputs": [], - "source": [ - "xlm_roberta.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" - ] + "3bd8d3b7d2e242108d04cb724c7ad89f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "Z0aGPBmVkgA-" - }, - "source": [ - "Let's clean up stuff we don't need anymore" - ] + "3be731e51a324504ae4e17c21d552d57": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "k2yCZptPkgA-" - }, - "outputs": [], - "source": [ - "!rm -rf {EXPORT_PATH}" - ] + "3db2b1532b4f47da9f7fa17bf38a7004": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "XPsHMXy4kgA-" - }, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your ONNX XLM-RoBERTa model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] + "4331cf9d02474f9294b1615bd00b9edd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_264dd92cdff642c68d84d9ea6309b187", + "placeholder": "​", + "style": "IPY_MODEL_e78dcd3395b94f33b945c5f4d4a214e0", + "value": "config.json: 100%" + } }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "Iajnc2gekgA-", - "outputId": "347d60b9-153e-4632-cec7-4fcaa7b66585", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "total 1089168\n", - "drwxr-xr-x 2 root root 4096 Mar 1 02:28 metadata\n", - "-rw-r--r-- 1 root root 1110228614 Mar 1 02:30 xlmroberta_onnx\n", - "-rw-r--r-- 1 root root 5069051 Mar 1 02:30 xlmroberta_spp\n" - ] - } + "4495fecb11d647f69f1d1bcc8da2c8ff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4b3d82f11ee243a7b399a671a06b0b72": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4ea300be5e9546eda7189e1d1f39c22c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "54cedeb3ae624df0ab7bbf2f6e2cda62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "55635d82f5624418a147cb9b19da84c1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dd1d5e184948402894199ebaf7defc9b", + "placeholder": "​", + "style": "IPY_MODEL_721dd749d6444b46b0293a227b5ebcb6", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "57bd7a45c349469c88051275e5fb4087": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5c22226c130b45269da0c68652a52c7e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5eddacbffc8740b7bc3a9c1b179ac4dd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_79dfd5ceaa3a49ee9eb1414ea4744580", + "placeholder": "​", + "style": "IPY_MODEL_d63f203cc55b41028203064c88e490c2", + "value": "model.safetensors: 100%" + } + }, + "622af6562a5a44348aaa91c54f116d66": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ef2da6ecc4564ec9a5f72145e5fced87", + "max": 9096718, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_73dc20453159438daebc82a61350d9c8", + "value": 9096718 + } + }, + "6a7a2098b1a74a6da302eacba871daa7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f2d8a8b7a58c45bbaa08defddd1f228a", + "IPY_MODEL_622af6562a5a44348aaa91c54f116d66", + "IPY_MODEL_2093e4b98ee847b892caf6aeb5e6a733" ], - "source": [ - "! ls -l {MODEL_NAME}_spark_nlp" - ] + "layout": "IPY_MODEL_4ea300be5e9546eda7189e1d1f39c22c" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "o71ap_SXkgA-" - }, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny XLM-RoBERTa model 😊" - ] + "721dd749d6444b46b0293a227b5ebcb6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "eIlcw7nnkgA-" - }, - "outputs": [], - "source": [ - "import sparknlp\n", - "\n", - "from sparknlp.base import *\n", - "from sparknlp.annotator import *\n", - "\n", - "document_assembler = DocumentAssembler()\\\n", - " .setInputCol(\"text\")\\\n", - " .setOutputCol(\"document\")\n", - "\n", - "tokenizer = Tokenizer()\\\n", - " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"token\")\n", - "\n", - "xlm_roberta_loaded = XlmRoBertaEmbeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"xlm_roberta\")\\\n", - "\n", - "pipeline = Pipeline(\n", - " stages = [\n", - " document_assembler,\n", - " tokenizer,\n", - " xlm_roberta_loaded\n", - " ])\n", - "\n", - "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", - "model = pipeline.fit(data)\n", - "result = model.transform(data)" - ] + "73dc20453159438daebc82a61350d9c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "37gi-KXfkgA_", - "outputId": "7b86c51f-45b8-463d-b199-c5e2c7c8f847", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+--------------------+\n", - "| embeddings|\n", - "+--------------------+\n", - "|[0.01781074, 0.16...|\n", - "|[-0.005121246, 0....|\n", - "|[0.00517074, 0.11...|\n", - "|[0.0065734405, 0....|\n", - "|[-0.028697606, 0....|\n", - "|[-0.0055652205, 0...|\n", - "|[-0.017623652, 0....|\n", - "|[-0.11884157, 0.0...|\n", - "|[-0.08074703, 0.1...|\n", - "|[-0.034696702, 0....|\n", - "|[-0.06809586, 0.1...|\n", - "|[-0.0508499, 0.07...|\n", - "|[-0.0065260027, 0...|\n", - "|[-0.029709894, 0....|\n", - "|[0.011362225, 0.2...|\n", - "|[0.044628896, 0.5...|\n", - "|[0.022999618, 0.2...|\n", - "|[0.017432231, 0.2...|\n", - "|[-0.024950821, 0....|\n", - "|[-0.031514782, 0....|\n", - "+--------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } + "79dfd5ceaa3a49ee9eb1414ea4744580": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8598b6cace044c34ac4d8b74f926da66": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "871a297967cc4fe49c5463f4472a7b2f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9a99c367c8ce4ab5ac4987b0f3ca878d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a5dd185eef4844fa847808b2b91aa26b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "add4d5c9cbfa4a16bbc125986e9a56d1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b30358df084f451090cde47debff0345": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b491f6db46ab493fb264bb2ff3c21b11": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_38432651440344299dd9d6d462d681b4", + "placeholder": "​", + "style": "IPY_MODEL_37c520248bb842818112488b6f915b8e", + "value": " 25.0/25.0 [00:00<00:00, 97.1B/s]" + } + }, + "b5815c68e9754f1ea7161db566c52d8c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b30358df084f451090cde47debff0345", + "placeholder": "​", + "style": "IPY_MODEL_871a297967cc4fe49c5463f4472a7b2f", + "value": " 1.12G/1.12G [00:10<00:00, 146MB/s]" + } + }, + "b600282be466458089b1a620acb2d28c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b8d9d288ab7f466eb6f48758e215828e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bcf1015593684c5b8f71ea707a408f7d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c57eceb2ca494a02946d9d8e6dc78506": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5c22226c130b45269da0c68652a52c7e", + "placeholder": "​", + "style": "IPY_MODEL_9a99c367c8ce4ab5ac4987b0f3ca878d", + "value": " 5.07M/5.07M [00:00<00:00, 19.0MB/s]" + } + }, + "d52133a3ba584a24bf06c5cf1dbf42bc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_13f83e1a9a9747d596ca20aeb6c344db", + "max": 1115567652, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a5dd185eef4844fa847808b2b91aa26b", + "value": 1115567652 + } + }, + "d63f203cc55b41028203064c88e490c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dd1d5e184948402894199ebaf7defc9b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e5d2ac11228d4d07ab7370748e7fdab0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f04fd1d8dae449c8a4860127f9a14a24", + "placeholder": "​", + "style": "IPY_MODEL_056d3ff42ddd43b2b20f5ec3cebaf38b", + "value": "tokenizer_config.json: 100%" + } + }, + "e631d560fb66440891770005fc492e90": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e78dcd3395b94f33b945c5f4d4a214e0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ec1a9b2d5a994caea613dc851046eddf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4331cf9d02474f9294b1615bd00b9edd", + "IPY_MODEL_f617219731114a22a24ee6a477696004", + "IPY_MODEL_1837891dac1a4f56aa35a984af00a197" ], - "source": [ - "result.selectExpr(\"explode(xlm_roberta.embeddings) as embeddings\").show()" - ] + "layout": "IPY_MODEL_3bd8d3b7d2e242108d04cb724c7ad89f" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "SMP8yiKhkgA_" - }, - "source": [ - "That's it! You can now go wild and use hundreds of XLM-RoBERTa models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] + "ef2da6ecc4564ec9a5f72145e5fced87": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "f04fd1d8dae449c8a4860127f9a14a24": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" + "f2d8a8b7a58c45bbaa08defddd1f228a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b600282be466458089b1a620acb2d28c", + "placeholder": "​", + "style": "IPY_MODEL_54cedeb3ae624df0ab7bbf2f6e2cda62", + "value": "tokenizer.json: 100%" + } }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "ec1a9b2d5a994caea613dc851046eddf": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_4331cf9d02474f9294b1615bd00b9edd", - "IPY_MODEL_f617219731114a22a24ee6a477696004", - "IPY_MODEL_1837891dac1a4f56aa35a984af00a197" - ], - "layout": "IPY_MODEL_3bd8d3b7d2e242108d04cb724c7ad89f" - } - }, - "4331cf9d02474f9294b1615bd00b9edd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_264dd92cdff642c68d84d9ea6309b187", - "placeholder": "​", - "style": "IPY_MODEL_e78dcd3395b94f33b945c5f4d4a214e0", - "value": "config.json: 100%" - } - }, - "f617219731114a22a24ee6a477696004": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4b3d82f11ee243a7b399a671a06b0b72", - "max": 615, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e631d560fb66440891770005fc492e90", - "value": 615 - } - }, - "1837891dac1a4f56aa35a984af00a197": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3be731e51a324504ae4e17c21d552d57", - "placeholder": "​", - "style": "IPY_MODEL_2993ed18ddf2429e885b32fb941f754d", - "value": " 615/615 [00:00<00:00, 32.3kB/s]" - } - }, - "3bd8d3b7d2e242108d04cb724c7ad89f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "264dd92cdff642c68d84d9ea6309b187": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e78dcd3395b94f33b945c5f4d4a214e0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4b3d82f11ee243a7b399a671a06b0b72": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e631d560fb66440891770005fc492e90": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "3be731e51a324504ae4e17c21d552d57": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2993ed18ddf2429e885b32fb941f754d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "298e83eae1d748aa8049f6adba534ac8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_5eddacbffc8740b7bc3a9c1b179ac4dd", - "IPY_MODEL_d52133a3ba584a24bf06c5cf1dbf42bc", - "IPY_MODEL_b5815c68e9754f1ea7161db566c52d8c" - ], - "layout": "IPY_MODEL_8598b6cace044c34ac4d8b74f926da66" - } - }, - "5eddacbffc8740b7bc3a9c1b179ac4dd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_79dfd5ceaa3a49ee9eb1414ea4744580", - "placeholder": "​", - "style": "IPY_MODEL_d63f203cc55b41028203064c88e490c2", - "value": "model.safetensors: 100%" - } - }, - "d52133a3ba584a24bf06c5cf1dbf42bc": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_13f83e1a9a9747d596ca20aeb6c344db", - "max": 1115567652, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_a5dd185eef4844fa847808b2b91aa26b", - "value": 1115567652 - } - }, - "b5815c68e9754f1ea7161db566c52d8c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b30358df084f451090cde47debff0345", - "placeholder": "​", - "style": "IPY_MODEL_871a297967cc4fe49c5463f4472a7b2f", - "value": " 1.12G/1.12G [00:10<00:00, 146MB/s]" - } - }, - "8598b6cace044c34ac4d8b74f926da66": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "79dfd5ceaa3a49ee9eb1414ea4744580": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d63f203cc55b41028203064c88e490c2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "13f83e1a9a9747d596ca20aeb6c344db": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a5dd185eef4844fa847808b2b91aa26b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "b30358df084f451090cde47debff0345": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "871a297967cc4fe49c5463f4472a7b2f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3609e88700234d4887116cffdb8a31cb": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e5d2ac11228d4d07ab7370748e7fdab0", - "IPY_MODEL_0c82e8ffd4b74655b03da94aef80ec04", - "IPY_MODEL_b491f6db46ab493fb264bb2ff3c21b11" - ], - "layout": "IPY_MODEL_4495fecb11d647f69f1d1bcc8da2c8ff" - } - }, - "e5d2ac11228d4d07ab7370748e7fdab0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f04fd1d8dae449c8a4860127f9a14a24", - "placeholder": "​", - "style": "IPY_MODEL_056d3ff42ddd43b2b20f5ec3cebaf38b", - "value": "tokenizer_config.json: 100%" - } - }, - "0c82e8ffd4b74655b03da94aef80ec04": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3db2b1532b4f47da9f7fa17bf38a7004", - "max": 25, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_57bd7a45c349469c88051275e5fb4087", - "value": 25 - } - }, - "b491f6db46ab493fb264bb2ff3c21b11": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_38432651440344299dd9d6d462d681b4", - "placeholder": "​", - "style": "IPY_MODEL_37c520248bb842818112488b6f915b8e", - "value": " 25.0/25.0 [00:00<00:00, 97.1B/s]" - } - }, - "4495fecb11d647f69f1d1bcc8da2c8ff": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f04fd1d8dae449c8a4860127f9a14a24": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "056d3ff42ddd43b2b20f5ec3cebaf38b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3db2b1532b4f47da9f7fa17bf38a7004": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "57bd7a45c349469c88051275e5fb4087": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "38432651440344299dd9d6d462d681b4": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "37c520248bb842818112488b6f915b8e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "21316bfe791349378b4dd59008d7ef94": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_55635d82f5624418a147cb9b19da84c1", - "IPY_MODEL_2c193771cf3a4ae2b2e2830733bbf0c1", - "IPY_MODEL_c57eceb2ca494a02946d9d8e6dc78506" - ], - "layout": "IPY_MODEL_bcf1015593684c5b8f71ea707a408f7d" - } - }, - "55635d82f5624418a147cb9b19da84c1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_dd1d5e184948402894199ebaf7defc9b", - "placeholder": "​", - "style": "IPY_MODEL_721dd749d6444b46b0293a227b5ebcb6", - "value": "sentencepiece.bpe.model: 100%" - } - }, - "2c193771cf3a4ae2b2e2830733bbf0c1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_add4d5c9cbfa4a16bbc125986e9a56d1", - "max": 5069051, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_1a305c881d794ebcb9567083faa633c2", - "value": 5069051 - } - }, - "c57eceb2ca494a02946d9d8e6dc78506": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5c22226c130b45269da0c68652a52c7e", - "placeholder": "​", - "style": "IPY_MODEL_9a99c367c8ce4ab5ac4987b0f3ca878d", - "value": " 5.07M/5.07M [00:00<00:00, 19.0MB/s]" - } - }, - "bcf1015593684c5b8f71ea707a408f7d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "dd1d5e184948402894199ebaf7defc9b": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "721dd749d6444b46b0293a227b5ebcb6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "add4d5c9cbfa4a16bbc125986e9a56d1": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1a305c881d794ebcb9567083faa633c2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "5c22226c130b45269da0c68652a52c7e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9a99c367c8ce4ab5ac4987b0f3ca878d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "6a7a2098b1a74a6da302eacba871daa7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_f2d8a8b7a58c45bbaa08defddd1f228a", - "IPY_MODEL_622af6562a5a44348aaa91c54f116d66", - "IPY_MODEL_2093e4b98ee847b892caf6aeb5e6a733" - ], - "layout": "IPY_MODEL_4ea300be5e9546eda7189e1d1f39c22c" - } - }, - "f2d8a8b7a58c45bbaa08defddd1f228a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b600282be466458089b1a620acb2d28c", - "placeholder": "​", - "style": "IPY_MODEL_54cedeb3ae624df0ab7bbf2f6e2cda62", - "value": "tokenizer.json: 100%" - } - }, - "622af6562a5a44348aaa91c54f116d66": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ef2da6ecc4564ec9a5f72145e5fced87", - "max": 9096718, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_73dc20453159438daebc82a61350d9c8", - "value": 9096718 - } - }, - "2093e4b98ee847b892caf6aeb5e6a733": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2045c150cb67451cb3dc40564147b21a", - "placeholder": "​", - "style": "IPY_MODEL_b8d9d288ab7f466eb6f48758e215828e", - "value": " 9.10M/9.10M [00:00<00:00, 28.9MB/s]" - } - }, - "4ea300be5e9546eda7189e1d1f39c22c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b600282be466458089b1a620acb2d28c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "54cedeb3ae624df0ab7bbf2f6e2cda62": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ef2da6ecc4564ec9a5f72145e5fced87": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "73dc20453159438daebc82a61350d9c8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "2045c150cb67451cb3dc40564147b21a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b8d9d288ab7f466eb6f48758e215828e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } + "f617219731114a22a24ee6a477696004": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4b3d82f11ee243a7b399a671a06b0b72", + "max": 615, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e631d560fb66440891770005fc492e90", + "value": 615 + } } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb index 5f3a6e2d16d03d..34520072d358ab 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb @@ -1,2433 +1,2295 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import ONNX XlmRoBertaForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `XlmRoBertaForQuestionAnswering` is only available since in `Spark NLP 5.2.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import XLM-RoBERTa models trained/fine-tuned for question answering via `XlmRoBertaForQuestionAnswering` or `TFXlmRoBertaForQuestionAnswering`. These models are usually under `Question Answering` category and have `xlm-roberta` in their labels\n", + "- Reference: [TFXlmRoBertaForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForQuestionAnswering)\n", + "- Some [example models](https://huggingface.co/models?filter=xlm-roberta&pipeline_tag=question-answering)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- Albert uses SentencePiece, so we will have to install that as well" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "_opj2ZzntbDk" - }, - "source": [ - "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb)" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m29.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m43.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m38.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m63.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m43.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m70.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m69.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m84.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m87.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m37.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m87.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m51.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m93.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m77.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m81.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m42.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m76.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m58.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m83.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m68.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m94.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m68.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m87.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m38.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m74.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m85.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m36.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m75.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m51.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m84.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m82.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m74.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m56.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m90.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m32.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m89.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m40.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m38.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m79.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m29.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m77.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m68.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m75.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use ['deepset/xlm-roberta-base-squad2'](https://huggingface.co/'deepset/xlm-roberta-base-squad2') model from HuggingFace as an example as an example and load it as a `ORTModelForQuestionAnswering`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "u1i6TpsutbDl" - }, - "source": [ - "## Import ONNX XlmRoBertaForQuestionAnswering models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", - "- `XlmRoBertaForQuestionAnswering` is only available since in `Spark NLP 5.2.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", - "- You can import XLM-RoBERTa models trained/fine-tuned for question answering via `XlmRoBertaForQuestionAnswering` or `TFXlmRoBertaForQuestionAnswering`. These models are usually under `Question Answering` category and have `xlm-roberta` in their labels\n", - "- Reference: [TFXlmRoBertaForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForQuestionAnswering)\n", - "- Some [example models](https://huggingface.co/models?filter=xlm-roberta&pipeline_tag=question-answering)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "tikYI59NtbDl" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b8d926231122407f95b4483350bc4e8e", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Export and Save HuggingFace model" + "text/plain": [ + "config.json: 0%| | 0.00/605 [00:00=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", - "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + "text/plain": [ + "model.safetensors: 0%| | 0.00/1.11G [00:00 False\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForQuestionAnswering\n", - "import tensorflow as tf\n", - "\n", - "MODEL_NAME = 'deepset/xlm-roberta-base-squad2'\n", - "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForQuestionAnswering.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(ONNX_MODEL)" + "text/plain": [ + "sentencepiece.bpe.model: 0%| | 0.00/5.07M [00:00 False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForQuestionAnswering\n", + "import tensorflow as tf\n", + "\n", + "MODEL_NAME = 'deepset/xlm-roberta-base-squad2'\n", + "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForQuestionAnswering.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(ONNX_MODEL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside this directory and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "blR5qjXwtbDn" - }, - "source": [ - "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1105736\n", + "-rw-r--r-- 1 root root 787 Jan 9 19:44 config.json\n", + "-rw-r--r-- 1 root root 1110100056 Jan 9 19:44 model.onnx\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:44 sentencepiece.bpe.model\n", + "-rw-r--r-- 1 root root 167 Jan 9 19:44 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 500 Jan 9 19:44 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082730 Jan 9 19:44 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila! We have our `sentencepiece.bpe.model` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "QEOomAeKtbDn" - }, - "outputs": [], - "source": [ - "!mkdir {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "onnx_models/deepset/xlm-roberta-base-squad2:\n", + "total 1100788\n", + "drwxr-xr-x 2 root root 4096 Jan 9 19:44 assets\n", + "-rw-r--r-- 1 root root 787 Jan 9 19:44 config.json\n", + "-rw-r--r-- 1 root root 1110100056 Jan 9 19:44 model.onnx\n", + "-rw-r--r-- 1 root root 167 Jan 9 19:44 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 500 Jan 9 19:44 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082730 Jan 9 19:44 tokenizer.json\n", + "\n", + "onnx_models/deepset/xlm-roberta-base-squad2/assets:\n", + "total 4952\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:44 sentencepiece.bpe.model\n" + ] + } + ], + "source": [ + "!ls -lR {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save XlmRoBertaForQuestionAnswering in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "5wjZ8w19tbDn" - }, - "outputs": [], - "source": [ - "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m40.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m23.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "ehhwZp5ntbDn" - }, - "source": [ - "Voila! We have our `sentencepiece.bpe.model` inside assets directory" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `RoBertaForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `RoBertaForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "spanClassifier = XlmRoBertaForQuestionAnswering.loadSavedModel(\n", + " ONNX_MODEL,\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document_question\",'document_context'])\\\n", + " .setOutputCol(\"answer\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(512)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spanClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your XlmRoBertaForQuestionAnswering model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "s7B5nkQ7tbDn", - "outputId": "d5a9f508-f04c-4281-b99e-a74ce6c8c153", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "onnx_models/deepset/xlm-roberta-base-squad2:\n", - "total 1100788\n", - "drwxr-xr-x 2 root root 4096 Jan 9 19:44 assets\n", - "-rw-r--r-- 1 root root 787 Jan 9 19:44 config.json\n", - "-rw-r--r-- 1 root root 1110100056 Jan 9 19:44 model.onnx\n", - "-rw-r--r-- 1 root root 167 Jan 9 19:44 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 500 Jan 9 19:44 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 17082730 Jan 9 19:44 tokenizer.json\n", - "\n", - "onnx_models/deepset/xlm-roberta-base-squad2/assets:\n", - "total 4952\n", - "-rw-r--r-- 1 root root 5069051 Jan 9 19:44 sentencepiece.bpe.model\n" - ] - } - ], - "source": [ - "!ls -lR {ONNX_MODEL}" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 484956\n", + "drwxr-xr-x 4 root root 4096 Oct 17 16:49 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 17 16:49 metadata\n", + "-rw-r--r-- 1 root root 496583922 Oct 17 16:49 roberta_classification_onnx\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny XlmRoBertaForQuestionAnswering model in Spark NLP 🚀 pipeline!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "bVKi9X6ftbDn" - }, - "source": [ - "## Import and Save RoBertaForQuestionAnswering in Spark NLP\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------+\n", + "|result |\n", + "+---------------------------+\n", + "|[as Amazonia or the Amazon]|\n", + "+---------------------------+\n", + "\n" + ] + } + ], + "source": [ + "document_assembler = MultiDocumentAssembler() \\\n", + " .setInputCols([\"question\", \"context\"]) \\\n", + " .setOutputCols([\"document_question\", \"document_context\"])\n", + "\n", + "spanClassifier_loaded = XlmRoBertaForQuestionAnswering.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", + " .setInputCols([\"document_question\",'document_context'])\\\n", + " .setOutputCol(\"answer\")\n", + "\n", + "pipeline = Pipeline().setStages([\n", + " document_assembler,\n", + " spanClassifier_loaded\n", + "])\n", + "\n", + "context = \"\"\"The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain \"Amazonas\" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.\"\"\"\n", + "question = \"Which name is also used to describe the Amazon rainforest in English?\"\n", + "example = spark.createDataFrame([[question, context]]).toDF(\"question\", \"context\")\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "result.select(\"answer.result\").show(1, False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of `RoBertaForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "01f9a013ab38435191872a8bce64dd69": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "be7jTIVAtbDo" - }, - "source": [ - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script" - ] + "032d809a9bb64c9b863f6f4b7b115133": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R_YAIBS_tbDo", - "outputId": "7506fbe6-aa72-4697-ae19-5ab7ae404f18" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m40.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m23.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } + "05d85f70379b486db128698df001384e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0be61c031fad4f31877ad29119b6a77e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0d91f57bfa25489eb70a23f2bd834cdf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "10e1aa374ebd43a9b083d9b7aab95b23": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f251ac2cd1ed45d4a7d25d6b906a84bc", + "placeholder": "​", + "style": "IPY_MODEL_824bc6d195ba465497fe34898880a7f8", + "value": "model.safetensors: 100%" + } + }, + "130c539288884b3aa341f9be6c62d29a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "14d951a930d5478b9f4e77c430c2bb6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_80fb8200d6d74cbca19ce946d73d9efa", + "placeholder": "​", + "style": "IPY_MODEL_67688fa958004ccfbacf0288f4075dea", + "value": "special_tokens_map.json: 100%" + } + }, + "17ca1b81af0f408e9ab164456872cd49": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_032d809a9bb64c9b863f6f4b7b115133", + "placeholder": "​", + "style": "IPY_MODEL_c9bb2353da02443c94afd06685b8cde8", + "value": " 605/605 [00:00<00:00, 24.9kB/s]" + } + }, + "1f3d6a7de82f4710a79d814b6af57679": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "240bdca5cc794ae8b1319310e42aec2c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2577bab4d4a64010addd7c2b5370a6c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2bef4b3dc3cd457fa4ccb2e796d771f7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2d9727f78a41430890ec77cce4fe0ce5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_88166d6e4ee04f11a79c2a1a532c7300", + "max": 605, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_130c539288884b3aa341f9be6c62d29a", + "value": 605 + } + }, + "389d3448663f4389afa71734e6cc1434": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3fbcd89672f842bf9692ebffd72d26c7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_389d3448663f4389afa71734e6cc1434", + "max": 79, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_937847a9252247979ed76e17dc60f7b6", + "value": 79 + } + }, + "4119ae23b5bc4a0c9148e60eb9dfcb53": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4154781b221948aab4258b4fa6799996": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6566341fd0c04880a7ab5ff1409e4448", + "placeholder": "​", + "style": "IPY_MODEL_4119ae23b5bc4a0c9148e60eb9dfcb53", + "value": "config.json: 100%" + } + }, + "423cf810d1254518af9e4fc79f97e54e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_96b57a70103f4ac9b014b375e1aee824", + "placeholder": "​", + "style": "IPY_MODEL_74b28f16de69498b9915e501863b4d73", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "54210209428948189c549c1d2cd939ca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e77456b2ef2a46ef99572a02e1b70817", + "IPY_MODEL_3fbcd89672f842bf9692ebffd72d26c7", + "IPY_MODEL_f87349ba371f41d788f802a64f9312f9" ], - "source": [ - "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] + "layout": "IPY_MODEL_c217c65b12264609862aeaeb1c25adc0" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "5MD6ogjatbDo" - }, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] + "5473ab4a957b414aa344f37f921d89a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_423cf810d1254518af9e4fc79f97e54e", + "IPY_MODEL_83c2346cea5c4928a23f4fa38d2ffe40", + "IPY_MODEL_e2fde33a78ff4172b806b1530bda4e9a" + ], + "layout": "IPY_MODEL_d1b3fb0032ce400c80922d128940333c" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kcTBCppJtbDo", - "outputId": "379c7e82-9918-4294-b20b-d0c45215febf" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apache Spark version: 3.2.3\n" - ] - } + "6566341fd0c04880a7ab5ff1409e4448": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "67688fa958004ccfbacf0288f4075dea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6885319026334ff99533e70c8670baea": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74b28f16de69498b9915e501863b4d73": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "76f48ff667754dd682ade56eaaca7049": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7c620665c5964164adaf3539a0cb5ff5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7f808235cbfd4ec28e05215cbd27e3f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_10e1aa374ebd43a9b083d9b7aab95b23", + "IPY_MODEL_a5844989c8f648df82ee6414eeec9a21", + "IPY_MODEL_d02f4f39378b4e568f0f9382efd6a9d1" ], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()\n", - "print(\"Apache Spark version: {}\".format(spark.version))" - ] + "layout": "IPY_MODEL_76f48ff667754dd682ade56eaaca7049" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "k3S-0O9btbDo" - }, - "source": [ - "- Let's use `loadSavedModel` functon in `RoBertaForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", - "- Most params can be set later when you are loading this model in `RoBertaForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n", - "\n" - ] + "7f90a349f8ce4525aa33649d70c09a33": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Gsnk6JQ7tbDo" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *\n", - "\n", - "spanClassifier = RoBertaForQuestionAnswering.loadSavedModel(\n", - " ONNX_MODEL,\n", - " spark\n", - " )\\\n", - " .setInputCols([\"document_question\",'document_context'])\\\n", - " .setOutputCol(\"answer\")\\\n", - " .setCaseSensitive(True)\\\n", - " .setMaxSentenceLength(512)" - ] + "80fb8200d6d74cbca19ce946d73d9efa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "3ed2WScitbDo" - }, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] + "824bc6d195ba465497fe34898880a7f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Gy7YzF0htbDo" - }, - "outputs": [], - "source": [ - "spanClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" - ] + "8338b89888584db5944d0ad5baa9118f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "1p0HFM4atbDo" - }, - "source": [ - "Let's clean up stuff we don't need anymore" - ] + "83c2346cea5c4928a23f4fa38d2ffe40": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8338b89888584db5944d0ad5baa9118f", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8baaa8a868ec4e0b98388e565d61a3db", + "value": 5069051 + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RvkyiLHotbDo" - }, - "outputs": [], - "source": [ - "!rm -rf {ONNX_MODEL}" - ] + "88166d6e4ee04f11a79c2a1a532c7300": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "xiNxN0tdtbDo" - }, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your RoBertaForQuestionAnswering model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] + "8baaa8a868ec4e0b98388e565d61a3db": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Kzym6Y90tbDo", - "outputId": "b3f2deb2-be48-4eac-e747-472ec58d6873" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 484956\n", - "drwxr-xr-x 4 root root 4096 Oct 17 16:49 fields\n", - "drwxr-xr-x 2 root root 4096 Oct 17 16:49 metadata\n", - "-rw-r--r-- 1 root root 496583922 Oct 17 16:49 roberta_classification_onnx\n" - ] - } + "8c36c9eda216474d86fe73c161690a6d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_14d951a930d5478b9f4e77c430c2bb6f", + "IPY_MODEL_f6ae4d642a9f41cd829dde04347e50a3", + "IPY_MODEL_969548659ca74527afa6a0f92e3a98e7" ], - "source": [ - "! ls -l {ONNX_MODEL}_spark_nlp_onnx" - ] + "layout": "IPY_MODEL_7f90a349f8ce4525aa33649d70c09a33" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "m2NiO3hytbDo" - }, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForQuestionAnswering model in Spark NLP 🚀 pipeline!" - ] + "937847a9252247979ed76e17dc60f7b6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BmIIrsGctbDp", - "outputId": "44f84743-6908-4143-87b6-244aae258115" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---------------------------+\n", - "|result |\n", - "+---------------------------+\n", - "|[as Amazonia or the Amazon]|\n", - "+---------------------------+\n", - "\n" - ] - } + "969548659ca74527afa6a0f92e3a98e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_01f9a013ab38435191872a8bce64dd69", + "placeholder": "​", + "style": "IPY_MODEL_0d91f57bfa25489eb70a23f2bd834cdf", + "value": " 150/150 [00:00<00:00, 10.5kB/s]" + } + }, + "96b57a70103f4ac9b014b375e1aee824": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9c1d4b12b247412295fe57637f7d2e60": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a5844989c8f648df82ee6414eeec9a21": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_05d85f70379b486db128698df001384e", + "max": 1109846632, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7c620665c5964164adaf3539a0cb5ff5", + "value": 1109846632 + } + }, + "ac6ad6631b054107b3287180cf3b9e68": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b8d926231122407f95b4483350bc4e8e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4154781b221948aab4258b4fa6799996", + "IPY_MODEL_2d9727f78a41430890ec77cce4fe0ce5", + "IPY_MODEL_17ca1b81af0f408e9ab164456872cd49" ], - "source": [ - "document_assembler = MultiDocumentAssembler() \\\n", - " .setInputCols([\"question\", \"context\"]) \\\n", - " .setOutputCols([\"document_question\", \"document_context\"])\n", - "\n", - "spanClassifier_loaded = RoBertaForQuestionAnswering.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", - " .setInputCols([\"document_question\",'document_context'])\\\n", - " .setOutputCol(\"answer\")\n", - "\n", - "pipeline = Pipeline().setStages([\n", - " document_assembler,\n", - " spanClassifier_loaded\n", - "])\n", - "\n", - "context = \"\"\"The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain \"Amazonas\" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.\"\"\"\n", - "question = \"Which name is also used to describe the Amazon rainforest in English?\"\n", - "example = spark.createDataFrame([[question, context]]).toDF(\"question\", \"context\")\n", - "result = pipeline.fit(example).transform(example)\n", - "\n", - "result.select(\"answer.result\").show(1, False)" - ] + "layout": "IPY_MODEL_6885319026334ff99533e70c8670baea" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "M5L0cHZptbDp" - }, - "source": [ - "That's it! You can now go wild and use hundreds of `RoBertaForQuestionAnswering` models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] + "c217c65b12264609862aeaeb1c25adc0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "c9bb2353da02443c94afd06685b8cde8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "cb0069caad574477866382fa085508d5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "b8d926231122407f95b4483350bc4e8e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_4154781b221948aab4258b4fa6799996", - "IPY_MODEL_2d9727f78a41430890ec77cce4fe0ce5", - "IPY_MODEL_17ca1b81af0f408e9ab164456872cd49" - ], - "layout": "IPY_MODEL_6885319026334ff99533e70c8670baea" - } - }, - "4154781b221948aab4258b4fa6799996": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_6566341fd0c04880a7ab5ff1409e4448", - "placeholder": "​", - "style": "IPY_MODEL_4119ae23b5bc4a0c9148e60eb9dfcb53", - "value": "config.json: 100%" - } - }, - "2d9727f78a41430890ec77cce4fe0ce5": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_88166d6e4ee04f11a79c2a1a532c7300", - "max": 605, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_130c539288884b3aa341f9be6c62d29a", - "value": 605 - } - }, - "17ca1b81af0f408e9ab164456872cd49": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_032d809a9bb64c9b863f6f4b7b115133", - "placeholder": "​", - "style": "IPY_MODEL_c9bb2353da02443c94afd06685b8cde8", - "value": " 605/605 [00:00<00:00, 24.9kB/s]" - } - }, - "6885319026334ff99533e70c8670baea": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6566341fd0c04880a7ab5ff1409e4448": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4119ae23b5bc4a0c9148e60eb9dfcb53": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "88166d6e4ee04f11a79c2a1a532c7300": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "130c539288884b3aa341f9be6c62d29a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "032d809a9bb64c9b863f6f4b7b115133": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c9bb2353da02443c94afd06685b8cde8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7f808235cbfd4ec28e05215cbd27e3f8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_10e1aa374ebd43a9b083d9b7aab95b23", - "IPY_MODEL_a5844989c8f648df82ee6414eeec9a21", - "IPY_MODEL_d02f4f39378b4e568f0f9382efd6a9d1" - ], - "layout": "IPY_MODEL_76f48ff667754dd682ade56eaaca7049" - } - }, - "10e1aa374ebd43a9b083d9b7aab95b23": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f251ac2cd1ed45d4a7d25d6b906a84bc", - "placeholder": "​", - "style": "IPY_MODEL_824bc6d195ba465497fe34898880a7f8", - "value": "model.safetensors: 100%" - } - }, - "a5844989c8f648df82ee6414eeec9a21": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_05d85f70379b486db128698df001384e", - "max": 1109846632, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_7c620665c5964164adaf3539a0cb5ff5", - "value": 1109846632 - } - }, - "d02f4f39378b4e568f0f9382efd6a9d1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f708d4e670c54cfe93d9ba82818590dc", - "placeholder": "​", - "style": "IPY_MODEL_0be61c031fad4f31877ad29119b6a77e", - "value": " 1.11G/1.11G [00:22<00:00, 52.5MB/s]" - } - }, - "76f48ff667754dd682ade56eaaca7049": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f251ac2cd1ed45d4a7d25d6b906a84bc": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "824bc6d195ba465497fe34898880a7f8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "05d85f70379b486db128698df001384e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7c620665c5964164adaf3539a0cb5ff5": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "f708d4e670c54cfe93d9ba82818590dc": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0be61c031fad4f31877ad29119b6a77e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "54210209428948189c549c1d2cd939ca": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e77456b2ef2a46ef99572a02e1b70817", - "IPY_MODEL_3fbcd89672f842bf9692ebffd72d26c7", - "IPY_MODEL_f87349ba371f41d788f802a64f9312f9" - ], - "layout": "IPY_MODEL_c217c65b12264609862aeaeb1c25adc0" - } - }, - "e77456b2ef2a46ef99572a02e1b70817": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2bef4b3dc3cd457fa4ccb2e796d771f7", - "placeholder": "​", - "style": "IPY_MODEL_240bdca5cc794ae8b1319310e42aec2c", - "value": "tokenizer_config.json: 100%" - } - }, - "3fbcd89672f842bf9692ebffd72d26c7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_389d3448663f4389afa71734e6cc1434", - "max": 79, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_937847a9252247979ed76e17dc60f7b6", - "value": 79 - } - }, - "f87349ba371f41d788f802a64f9312f9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_cb0069caad574477866382fa085508d5", - "placeholder": "​", - "style": "IPY_MODEL_1f3d6a7de82f4710a79d814b6af57679", - "value": " 79.0/79.0 [00:00<00:00, 3.95kB/s]" - } - }, - "c217c65b12264609862aeaeb1c25adc0": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2bef4b3dc3cd457fa4ccb2e796d771f7": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "240bdca5cc794ae8b1319310e42aec2c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "389d3448663f4389afa71734e6cc1434": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "937847a9252247979ed76e17dc60f7b6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "cb0069caad574477866382fa085508d5": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1f3d6a7de82f4710a79d814b6af57679": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5473ab4a957b414aa344f37f921d89a2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_423cf810d1254518af9e4fc79f97e54e", - "IPY_MODEL_83c2346cea5c4928a23f4fa38d2ffe40", - "IPY_MODEL_e2fde33a78ff4172b806b1530bda4e9a" - ], - "layout": "IPY_MODEL_d1b3fb0032ce400c80922d128940333c" - } - }, - "423cf810d1254518af9e4fc79f97e54e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_96b57a70103f4ac9b014b375e1aee824", - "placeholder": "​", - "style": "IPY_MODEL_74b28f16de69498b9915e501863b4d73", - "value": "sentencepiece.bpe.model: 100%" - } - }, - "83c2346cea5c4928a23f4fa38d2ffe40": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8338b89888584db5944d0ad5baa9118f", - "max": 5069051, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_8baaa8a868ec4e0b98388e565d61a3db", - "value": 5069051 - } - }, - "e2fde33a78ff4172b806b1530bda4e9a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9c1d4b12b247412295fe57637f7d2e60", - "placeholder": "​", - "style": "IPY_MODEL_ac6ad6631b054107b3287180cf3b9e68", - "value": " 5.07M/5.07M [00:00<00:00, 83.1MB/s]" - } - }, - "d1b3fb0032ce400c80922d128940333c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "96b57a70103f4ac9b014b375e1aee824": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "74b28f16de69498b9915e501863b4d73": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "8338b89888584db5944d0ad5baa9118f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8baaa8a868ec4e0b98388e565d61a3db": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "9c1d4b12b247412295fe57637f7d2e60": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ac6ad6631b054107b3287180cf3b9e68": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "8c36c9eda216474d86fe73c161690a6d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_14d951a930d5478b9f4e77c430c2bb6f", - "IPY_MODEL_f6ae4d642a9f41cd829dde04347e50a3", - "IPY_MODEL_969548659ca74527afa6a0f92e3a98e7" - ], - "layout": "IPY_MODEL_7f90a349f8ce4525aa33649d70c09a33" - } - }, - "14d951a930d5478b9f4e77c430c2bb6f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_80fb8200d6d74cbca19ce946d73d9efa", - "placeholder": "​", - "style": "IPY_MODEL_67688fa958004ccfbacf0288f4075dea", - "value": "special_tokens_map.json: 100%" - } - }, - "f6ae4d642a9f41cd829dde04347e50a3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fa8d4ffdee2d454cb14941089ff881d6", - "max": 150, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_2577bab4d4a64010addd7c2b5370a6c8", - "value": 150 - } - }, - "969548659ca74527afa6a0f92e3a98e7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_01f9a013ab38435191872a8bce64dd69", - "placeholder": "​", - "style": "IPY_MODEL_0d91f57bfa25489eb70a23f2bd834cdf", - "value": " 150/150 [00:00<00:00, 10.5kB/s]" - } - }, - "7f90a349f8ce4525aa33649d70c09a33": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "80fb8200d6d74cbca19ce946d73d9efa": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "67688fa958004ccfbacf0288f4075dea": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fa8d4ffdee2d454cb14941089ff881d6": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2577bab4d4a64010addd7c2b5370a6c8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "01f9a013ab38435191872a8bce64dd69": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0d91f57bfa25489eb70a23f2bd834cdf": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } + "d02f4f39378b4e568f0f9382efd6a9d1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f708d4e670c54cfe93d9ba82818590dc", + "placeholder": "​", + "style": "IPY_MODEL_0be61c031fad4f31877ad29119b6a77e", + "value": " 1.11G/1.11G [00:22<00:00, 52.5MB/s]" + } + }, + "d1b3fb0032ce400c80922d128940333c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e2fde33a78ff4172b806b1530bda4e9a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9c1d4b12b247412295fe57637f7d2e60", + "placeholder": "​", + "style": "IPY_MODEL_ac6ad6631b054107b3287180cf3b9e68", + "value": " 5.07M/5.07M [00:00<00:00, 83.1MB/s]" + } + }, + "e77456b2ef2a46ef99572a02e1b70817": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2bef4b3dc3cd457fa4ccb2e796d771f7", + "placeholder": "​", + "style": "IPY_MODEL_240bdca5cc794ae8b1319310e42aec2c", + "value": "tokenizer_config.json: 100%" + } + }, + "f251ac2cd1ed45d4a7d25d6b906a84bc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f6ae4d642a9f41cd829dde04347e50a3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fa8d4ffdee2d454cb14941089ff881d6", + "max": 150, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2577bab4d4a64010addd7c2b5370a6c8", + "value": 150 + } + }, + "f708d4e670c54cfe93d9ba82818590dc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f87349ba371f41d788f802a64f9312f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cb0069caad574477866382fa085508d5", + "placeholder": "​", + "style": "IPY_MODEL_1f3d6a7de82f4710a79d814b6af57679", + "value": " 79.0/79.0 [00:00<00:00, 3.95kB/s]" + } + }, + "fa8d4ffdee2d454cb14941089ff881d6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb index 4a1a54cef9cc6b..86eb57bf47cb98 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb @@ -1,2173 +1,2036 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import ONNX XlmRoBertaForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `XlmRoBertaForSequenceClassification` is only available since in `Spark NLP 5.2.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import XLM-RoBERTa models trained/fine-tuned for sequence classification via `XlmRoBertaForSequenceClassification` or `TFXlmRoBertaForSequenceClassification`. These models are usually under `Text Classification` category and have `xlm-roberta` in their labels\n", + "- Reference: [TFXlmRoBertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForSequenceClassification)\n", + "- Some [example models](https://huggingface.co/models?filter=xlm-roberta&pipeline_tag=text-classification)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "VjZY8Zs2nOZy" - }, - "source": [ - "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb)" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m23.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m29.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m65.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m26.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m51.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m59.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m58.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m59.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m57.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m62.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m60.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m61.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m61.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m47.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m45.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m17.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [cardiffnlp/twitter-xlm-roberta-base-sentiment](https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment) model from HuggingFace as an example and load it as a `ORTModelForSequenceClassification`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "_7VIYuX3nOZ1" - }, - "source": [ - "## Import ONNX XlmRoBertaForSequenceClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", - "- `XlmRoBertaForSequenceClassification` is only available since in `Spark NLP 5.2.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", - "- You can import XLM-RoBERTa models trained/fine-tuned for sequence classification via `XlmRoBertaForSequenceClassification` or `TFXlmRoBertaForSequenceClassification`. These models are usually under `Text Classification` category and have `xlm-roberta` in their labels\n", - "- Reference: [TFXlmRoBertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForSequenceClassification)\n", - "- Some [example models](https://huggingface.co/models?filter=xlm-roberta&pipeline_tag=text-classification)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "HZGLjeyxnOZ1" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0f95c2f60eca422a8b484df66cd738e7", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Export and Save HuggingFace model" + "text/plain": [ + "config.json: 0%| | 0.00/841 [00:00=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", - "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + "text/plain": [ + "pytorch_model.bin: 0%| | 0.00/1.11G [00:00 False\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForSequenceClassification\n", - "import tensorflow as tf\n", - "\n", - "MODEL_NAME = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'\n", - "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(ONNX_MODEL)" + "text/plain": [ + "special_tokens_map.json: 0%| | 0.00/150 [00:00 False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForSequenceClassification\n", + "import tensorflow as tf\n", + "\n", + "MODEL_NAME = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'\n", + "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(ONNX_MODEL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside this and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qIBR7cAqnOZ5", - "outputId": "49d6906d-a710-4d12-e547-3c7638ec1ab4", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "total 1108048\n", - "-rw-r--r-- 1 root root 915 Jan 9 19:15 config.json\n", - "-rw-r--r-- 1 root root 1112465741 Jan 9 19:15 model.onnx\n", - "-rw-r--r-- 1 root root 5069051 Jan 9 19:15 sentencepiece.bpe.model\n", - "-rw-r--r-- 1 root root 167 Jan 9 19:15 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 471 Jan 9 19:15 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 17082730 Jan 9 19:15 tokenizer.json\n" - ] - } - ], - "source": [ - "!ls -l {ONNX_MODEL}" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1108048\n", + "-rw-r--r-- 1 root root 915 Jan 9 19:15 config.json\n", + "-rw-r--r-- 1 root root 1112465741 Jan 9 19:15 model.onnx\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:15 sentencepiece.bpe.model\n", + "-rw-r--r-- 1 root root 167 Jan 9 19:15 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 471 Jan 9 19:15 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082730 Jan 9 19:15 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for\n", + "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get label2id dictionary\n", + "labels = ort_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila! We have our `sentencepiece.bpe.model` and `labels.txt` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "-Sp8nJu7nOZ5" - }, - "source": [ - "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for\n", - "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "onnx_models/cardiffnlp/twitter-xlm-roberta-base-sentiment:\n", + "total 1103100\n", + "drwxr-xr-x 2 root root 4096 Jan 9 19:17 assets\n", + "-rw-r--r-- 1 root root 915 Jan 9 19:15 config.json\n", + "-rw-r--r-- 1 root root 1112465741 Jan 9 19:15 model.onnx\n", + "-rw-r--r-- 1 root root 167 Jan 9 19:15 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 471 Jan 9 19:15 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082730 Jan 9 19:15 tokenizer.json\n", + "\n", + "onnx_models/cardiffnlp/twitter-xlm-roberta-base-sentiment/assets:\n", + "total 4956\n", + "-rw-r--r-- 1 root root 25 Jan 9 19:16 labels.txt\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:15 sentencepiece.bpe.model\n" + ] + } + ], + "source": [ + "!ls -lR {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save RoBertaForSequenceClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2KsG42cmnOZ5" - }, - "outputs": [], - "source": [ - "!mkdir {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-10-16 21:08:22-- http://setup.johnsnowlabs.com/colab.sh\n", + "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", + "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", + "HTTP request sent, awaiting response... 302 Moved Temporarily\n", + "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", + "--2023-10-16 21:08:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1191 (1.2K) [text/plain]\n", + "Saving to: ‘STDOUT’\n", + "\n", + "- 100%[===================>] 1.16K --.-KB/s in 0s \n", + "\n", + "2023-10-16 21:08:23 (93.8 MB/s) - written to stdout [1191/1191]\n", + "\n", + "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m21.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fKElx2rtnOZ5" - }, - "outputs": [], - "source": [ - "# get label2id dictionary\n", - "labels = ort_model.config.id2label\n", - "# sort the dictionary based on the id\n", - "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", - "\n", - "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", - " f.write('\\n'.join(labels))" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `XlmRoBertaForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `XlmRoBertaForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "sequenceClassifier = XlmRoBertaForSequenceClassification.loadSavedModel(\n", + " ONNX_MODEL,\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your XlmRoBertaForSequenceClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JrxxMgNpnOZ5" - }, - "outputs": [], - "source": [ - "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 487524\n", + "drwxr-xr-x 5 root root 4096 Oct 16 21:15 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 16 21:15 metadata\n", + "-rw-r--r-- 1 root root 499209257 Oct 16 21:16 roberta_classification_onnx\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForSequenceClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequenceClassifier_loaded = XlmRoBertaForSequenceClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "WaxO1clenOZ6" - }, - "source": [ - "Voila! We have our `sentencepiece.bpe.model` and `labels.txt` inside assets directory" + "data": { + "text/plain": [ + "['disgust',\n", + " 'optimism',\n", + " 'embarrassment',\n", + " 'amusement',\n", + " 'realization',\n", + " 'surprise',\n", + " 'grief',\n", + " 'caring',\n", + " 'disapproval',\n", + " 'disappointment',\n", + " 'joy',\n", + " 'confusion',\n", + " 'excitement',\n", + " 'approval',\n", + " 'curiosity',\n", + " 'anger',\n", + " 'love',\n", + " 'admiration',\n", + " 'gratitude',\n", + " 'annoyance',\n", + " 'remorse',\n", + " 'nervousness',\n", + " 'neutral',\n", + " 'pride',\n", + " 'fear',\n", + " 'sadness',\n", + " 'desire',\n", + " 'relief']" ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "sequenceClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " sequenceClassifier_loaded\n", + "])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([[\"I love you!\"], ['I feel lucky to be here.']]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"class.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of `XlmRoBertaForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "06bdcebdd8634c08be2a5c7edba7f20f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2209890ded0b42b2b11a53aaadc1dfc3", + "max": 150, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ecf37d50caa6458fbaf4dc9300961c83", + "value": 150 + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "C_sD6vcDnOZ6", - "outputId": "ee31714d-f3ff-4e7c-874f-d9f3a2358700", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "onnx_models/cardiffnlp/twitter-xlm-roberta-base-sentiment:\n", - "total 1103100\n", - "drwxr-xr-x 2 root root 4096 Jan 9 19:17 assets\n", - "-rw-r--r-- 1 root root 915 Jan 9 19:15 config.json\n", - "-rw-r--r-- 1 root root 1112465741 Jan 9 19:15 model.onnx\n", - "-rw-r--r-- 1 root root 167 Jan 9 19:15 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 471 Jan 9 19:15 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 17082730 Jan 9 19:15 tokenizer.json\n", - "\n", - "onnx_models/cardiffnlp/twitter-xlm-roberta-base-sentiment/assets:\n", - "total 4956\n", - "-rw-r--r-- 1 root root 25 Jan 9 19:16 labels.txt\n", - "-rw-r--r-- 1 root root 5069051 Jan 9 19:15 sentencepiece.bpe.model\n" - ] - } + "0f95c2f60eca422a8b484df66cd738e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_251aee8a46f84ddbb3dc6a092f041911", + "IPY_MODEL_d01d4a2dcdbb4a88a9ec3df55e949f51", + "IPY_MODEL_39b3eeec515b46109ebd01a3c81cf839" ], - "source": [ - "!ls -lR {ONNX_MODEL}" - ] + "layout": "IPY_MODEL_81386512a80d4c7986543ae245e2b128" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "WoRaIuTgnOZ6" - }, - "source": [ - "## Import and Save RoBertaForSequenceClassification in Spark NLP\n" - ] + "1a177346302c400097ede29b9ddcdde3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_98c0bef2c2004631a1eea7dbeeb474ef", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4bcdfb214d224488bd647edf528a6474", + "value": 5069051 + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "rmyTRnmTnOZ6" - }, - "source": [ - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script" - ] + "1a3c1b1b2d4d45fb96186d0d4b87d746": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VjgCKRjxnOZ6", - "outputId": "f8d62151-4ae3-4212-d2e6-be61f24cfcc8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-10-16 21:08:22-- http://setup.johnsnowlabs.com/colab.sh\n", - "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", - "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", - "HTTP request sent, awaiting response... 302 Moved Temporarily\n", - "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", - "--2023-10-16 21:08:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 1191 (1.2K) [text/plain]\n", - "Saving to: ‘STDOUT’\n", - "\n", - "- 100%[===================>] 1.16K --.-KB/s in 0s \n", - "\n", - "2023-10-16 21:08:23 (93.8 MB/s) - written to stdout [1191/1191]\n", - "\n", - "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m21.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } - ], - "source": [ - "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] + "1aa9c4c2da4e499c8eb25eff14729039": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "-QbKgNWUnOZ6" - }, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] + "2209890ded0b42b2b11a53aaadc1dfc3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t8nE1WMKnOZ6", - "outputId": "58c3086e-cb83-4472-f0a0-07d87ee70371" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apache Spark version: 3.2.3\n" - ] - } + "251aee8a46f84ddbb3dc6a092f041911": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1aa9c4c2da4e499c8eb25eff14729039", + "placeholder": "​", + "style": "IPY_MODEL_fb4a285fffd9454da534302c6fc17e7d", + "value": "config.json: 100%" + } + }, + "26aded6abf1242e8910a8051ee80f609": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "28315f4670194975a577d772b73fa439": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "29a548e9b0484038b62d5f57175e3d58": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_905c59c8b7bd42a9a305ecb52d93f875", + "placeholder": "​", + "style": "IPY_MODEL_1a3c1b1b2d4d45fb96186d0d4b87d746", + "value": "pytorch_model.bin: 100%" + } + }, + "2a6ea6ad829149abbb37ec45d0721651": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5c90d0f385f446a1a933231fd82f7c5c", + "IPY_MODEL_06bdcebdd8634c08be2a5c7edba7f20f", + "IPY_MODEL_5e85cd1b82374b5b9c7124a99c13784e" ], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()\n", - "\n", - "print(\"Apache Spark version: {}\".format(spark.version))" - ] + "layout": "IPY_MODEL_6615c07fdecb4049b19f7a2178c3879d" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "yCoPZcMmnOZ6" - }, - "source": [ - "- Let's use `loadSavedModel` functon in `XlmRoBertaForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", - "- Most params can be set later when you are loading this model in `XlmRoBertaForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." - ] + "33887ca8a56f44b8ab7b78dcdb604e5e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hwPVKZyinOZ6" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *\n", - "\n", - "sequenceClassifier = XlmRoBertaForSequenceClassification.loadSavedModel(\n", - " ONNX_MODEL,\n", - " spark\n", - " )\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"class\")\\\n", - " .setCaseSensitive(True)\\\n", - " .setMaxSentenceLength(128)" - ] + "39b3eeec515b46109ebd01a3c81cf839": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e38b9de9a6da4058956892c87ab4a29a", + "placeholder": "​", + "style": "IPY_MODEL_33887ca8a56f44b8ab7b78dcdb604e5e", + "value": " 841/841 [00:00<00:00, 52.8kB/s]" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "sBaSiegrnOZ6" - }, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] + "3c088ffa5f1045a59667dfd0b5024db6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wJM6A2ZMnOZ6" - }, - "outputs": [], - "source": [ - "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" - ] + "3dcbba3ef4524613848833f7eddc7bf3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "BSseNI1ZnOZ6" - }, - "source": [ - "Let's clean up stuff we don't need anymore" - ] + "4bcdfb214d224488bd647edf528a6474": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-t_ST7fznOZ6" - }, - "outputs": [], - "source": [ - "!rm -rf {ONNX_MODEL}" - ] + "4e476a3c03044b95a5d951a36643dfe5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "HrIRyrwJnOZ7" - }, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your XlmRoBertaForSequenceClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] + "54d98bec988246afa575f514b5fb538e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x18SNGz5nOZ7", - "outputId": "b58ae4c0-385a-49f7-a989-9f43c1654648" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 487524\n", - "drwxr-xr-x 5 root root 4096 Oct 16 21:15 fields\n", - "drwxr-xr-x 2 root root 4096 Oct 16 21:15 metadata\n", - "-rw-r--r-- 1 root root 499209257 Oct 16 21:16 roberta_classification_onnx\n" - ] - } - ], - "source": [ - "! ls -l {ONNX_MODEL}_spark_nlp_onnx" - ] + "5c90d0f385f446a1a933231fd82f7c5c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_db3f53eaa65c44f987b410a60e0c04de", + "placeholder": "​", + "style": "IPY_MODEL_a153a09450f8403998482ec3cf5e5424", + "value": "special_tokens_map.json: 100%" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "CNG-mf3nnOZ7" - }, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForSequenceClassification model 😊" - ] + "5e85cd1b82374b5b9c7124a99c13784e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_babd560899554c3ab5eb12bbab99938b", + "placeholder": "​", + "style": "IPY_MODEL_26aded6abf1242e8910a8051ee80f609", + "value": " 150/150 [00:00<00:00, 7.91kB/s]" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "E-YVoU8xnOZ7" - }, - "outputs": [], - "source": [ - "sequenceClassifier_loaded = XlmRoBertaForSequenceClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"class\")" - ] + "6615c07fdecb4049b19f7a2178c3879d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "VpFTaC7GnOZ7" - }, - "source": [ - "You can see what labels were used to train this model via `getClasses` function:" - ] + "7ec33b66132449a1804b9dd655dba44e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_54d98bec988246afa575f514b5fb538e", + "placeholder": "​", + "style": "IPY_MODEL_e33aaf9937d346619cfafb14e2f2257e", + "value": "sentencepiece.bpe.model: 100%" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "OkxqlnoBnOZ7", - "outputId": "2dd0576e-8abe-4d8b-8e2c-598783ba116a" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['disgust',\n", - " 'optimism',\n", - " 'embarrassment',\n", - " 'amusement',\n", - " 'realization',\n", - " 'surprise',\n", - " 'grief',\n", - " 'caring',\n", - " 'disapproval',\n", - " 'disappointment',\n", - " 'joy',\n", - " 'confusion',\n", - " 'excitement',\n", - " 'approval',\n", - " 'curiosity',\n", - " 'anger',\n", - " 'love',\n", - " 'admiration',\n", - " 'gratitude',\n", - " 'annoyance',\n", - " 'remorse',\n", - " 'nervousness',\n", - " 'neutral',\n", - " 'pride',\n", - " 'fear',\n", - " 'sadness',\n", - " 'desire',\n", - " 'relief']" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } + "81386512a80d4c7986543ae245e2b128": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "861430374a7843d78a5e2499252f4e78": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "905c59c8b7bd42a9a305ecb52d93f875": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "98c0bef2c2004631a1eea7dbeeb474ef": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9f664ab32d0d43f5bddfe20417c10131": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a153a09450f8403998482ec3cf5e5424": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a5ecd770b12843a7af3fad2dae5be8e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ffc6bd7026344b7aad522b9a2337bb6f", + "placeholder": "​", + "style": "IPY_MODEL_3c088ffa5f1045a59667dfd0b5024db6", + "value": " 5.07M/5.07M [00:00<00:00, 124MB/s]" + } + }, + "b6795898eeb24ed284b2bbd31040697a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "babd560899554c3ab5eb12bbab99938b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c501c8ec56b340d49e6c6698d325398d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cbe05ca4a37745908898d47bb56c9774": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3dcbba3ef4524613848833f7eddc7bf3", + "max": 1112271561, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_28315f4670194975a577d772b73fa439", + "value": 1112271561 + } + }, + "d01d4a2dcdbb4a88a9ec3df55e949f51": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_861430374a7843d78a5e2499252f4e78", + "max": 841, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4e476a3c03044b95a5d951a36643dfe5", + "value": 841 + } + }, + "d39f1f408a7241b8bb14dc1fd4ef9df6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_29a548e9b0484038b62d5f57175e3d58", + "IPY_MODEL_cbe05ca4a37745908898d47bb56c9774", + "IPY_MODEL_eb413258aa00462195161ba8b0047bca" ], - "source": [ - "# .getClasses was introduced in spark-nlp==3.4.0\n", - "sequenceClassifier_loaded.getClasses()" - ] + "layout": "IPY_MODEL_b6795898eeb24ed284b2bbd31040697a" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "c62SdOTdnOZ7" - }, - "source": [ - "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" - ] + "db3f53eaa65c44f987b410a60e0c04de": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Di5xRn1nOZ7" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *\n", - "\n", - "document_assembler = DocumentAssembler() \\\n", - " .setInputCol('text') \\\n", - " .setOutputCol('document')\n", - "\n", - "tokenizer = Tokenizer() \\\n", - " .setInputCols(['document']) \\\n", - " .setOutputCol('token')\n", - "\n", - "pipeline = Pipeline(stages=[\n", - " document_assembler,\n", - " tokenizer,\n", - " sequenceClassifier_loaded\n", - "])\n", - "\n", - "# couple of simple examples\n", - "example = spark.createDataFrame([[\"I love you!\"], ['I feel lucky to be here.']]).toDF(\"text\")\n", - "\n", - "result = pipeline.fit(example).transform(example)\n", - "\n", - "# result is a DataFrame\n", - "result.select(\"text\", \"class.result\").show()" - ] + "e0ed0155dd944f5ebbcc905509f01b20": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7ec33b66132449a1804b9dd655dba44e", + "IPY_MODEL_1a177346302c400097ede29b9ddcdde3", + "IPY_MODEL_a5ecd770b12843a7af3fad2dae5be8e3" + ], + "layout": "IPY_MODEL_c501c8ec56b340d49e6c6698d325398d" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "_ka-wmU-nOZ7" - }, - "source": [ - "That's it! You can now go wild and use hundreds of `XlmRoBertaForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] + "e33aaf9937d346619cfafb14e2f2257e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "e38b9de9a6da4058956892c87ab4a29a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "eb413258aa00462195161ba8b0047bca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f83361dcbeaa4a8b9ddbbe47dc28e3f1", + "placeholder": "​", + "style": "IPY_MODEL_9f664ab32d0d43f5bddfe20417c10131", + "value": " 1.11G/1.11G [00:07<00:00, 180MB/s]" + } + }, + "ecf37d50caa6458fbaf4dc9300961c83": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f83361dcbeaa4a8b9ddbbe47dc28e3f1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "0f95c2f60eca422a8b484df66cd738e7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_251aee8a46f84ddbb3dc6a092f041911", - "IPY_MODEL_d01d4a2dcdbb4a88a9ec3df55e949f51", - "IPY_MODEL_39b3eeec515b46109ebd01a3c81cf839" - ], - "layout": "IPY_MODEL_81386512a80d4c7986543ae245e2b128" - } - }, - "251aee8a46f84ddbb3dc6a092f041911": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1aa9c4c2da4e499c8eb25eff14729039", - "placeholder": "​", - "style": "IPY_MODEL_fb4a285fffd9454da534302c6fc17e7d", - "value": "config.json: 100%" - } - }, - "d01d4a2dcdbb4a88a9ec3df55e949f51": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_861430374a7843d78a5e2499252f4e78", - "max": 841, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4e476a3c03044b95a5d951a36643dfe5", - "value": 841 - } - }, - "39b3eeec515b46109ebd01a3c81cf839": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e38b9de9a6da4058956892c87ab4a29a", - "placeholder": "​", - "style": "IPY_MODEL_33887ca8a56f44b8ab7b78dcdb604e5e", - "value": " 841/841 [00:00<00:00, 52.8kB/s]" - } - }, - "81386512a80d4c7986543ae245e2b128": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1aa9c4c2da4e499c8eb25eff14729039": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fb4a285fffd9454da534302c6fc17e7d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "861430374a7843d78a5e2499252f4e78": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4e476a3c03044b95a5d951a36643dfe5": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "e38b9de9a6da4058956892c87ab4a29a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "33887ca8a56f44b8ab7b78dcdb604e5e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d39f1f408a7241b8bb14dc1fd4ef9df6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_29a548e9b0484038b62d5f57175e3d58", - "IPY_MODEL_cbe05ca4a37745908898d47bb56c9774", - "IPY_MODEL_eb413258aa00462195161ba8b0047bca" - ], - "layout": "IPY_MODEL_b6795898eeb24ed284b2bbd31040697a" - } - }, - "29a548e9b0484038b62d5f57175e3d58": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_905c59c8b7bd42a9a305ecb52d93f875", - "placeholder": "​", - "style": "IPY_MODEL_1a3c1b1b2d4d45fb96186d0d4b87d746", - "value": "pytorch_model.bin: 100%" - } - }, - "cbe05ca4a37745908898d47bb56c9774": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3dcbba3ef4524613848833f7eddc7bf3", - "max": 1112271561, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_28315f4670194975a577d772b73fa439", - "value": 1112271561 - } - }, - "eb413258aa00462195161ba8b0047bca": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f83361dcbeaa4a8b9ddbbe47dc28e3f1", - "placeholder": "​", - "style": "IPY_MODEL_9f664ab32d0d43f5bddfe20417c10131", - "value": " 1.11G/1.11G [00:07<00:00, 180MB/s]" - } - }, - "b6795898eeb24ed284b2bbd31040697a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "905c59c8b7bd42a9a305ecb52d93f875": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1a3c1b1b2d4d45fb96186d0d4b87d746": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3dcbba3ef4524613848833f7eddc7bf3": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "28315f4670194975a577d772b73fa439": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "f83361dcbeaa4a8b9ddbbe47dc28e3f1": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9f664ab32d0d43f5bddfe20417c10131": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e0ed0155dd944f5ebbcc905509f01b20": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_7ec33b66132449a1804b9dd655dba44e", - "IPY_MODEL_1a177346302c400097ede29b9ddcdde3", - "IPY_MODEL_a5ecd770b12843a7af3fad2dae5be8e3" - ], - "layout": "IPY_MODEL_c501c8ec56b340d49e6c6698d325398d" - } - }, - "7ec33b66132449a1804b9dd655dba44e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_54d98bec988246afa575f514b5fb538e", - "placeholder": "​", - "style": "IPY_MODEL_e33aaf9937d346619cfafb14e2f2257e", - "value": "sentencepiece.bpe.model: 100%" - } - }, - "1a177346302c400097ede29b9ddcdde3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_98c0bef2c2004631a1eea7dbeeb474ef", - "max": 5069051, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4bcdfb214d224488bd647edf528a6474", - "value": 5069051 - } - }, - "a5ecd770b12843a7af3fad2dae5be8e3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ffc6bd7026344b7aad522b9a2337bb6f", - "placeholder": "​", - "style": "IPY_MODEL_3c088ffa5f1045a59667dfd0b5024db6", - "value": " 5.07M/5.07M [00:00<00:00, 124MB/s]" - } - }, - "c501c8ec56b340d49e6c6698d325398d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "54d98bec988246afa575f514b5fb538e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e33aaf9937d346619cfafb14e2f2257e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "98c0bef2c2004631a1eea7dbeeb474ef": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4bcdfb214d224488bd647edf528a6474": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "ffc6bd7026344b7aad522b9a2337bb6f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3c088ffa5f1045a59667dfd0b5024db6": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "2a6ea6ad829149abbb37ec45d0721651": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_5c90d0f385f446a1a933231fd82f7c5c", - "IPY_MODEL_06bdcebdd8634c08be2a5c7edba7f20f", - "IPY_MODEL_5e85cd1b82374b5b9c7124a99c13784e" - ], - "layout": "IPY_MODEL_6615c07fdecb4049b19f7a2178c3879d" - } - }, - "5c90d0f385f446a1a933231fd82f7c5c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_db3f53eaa65c44f987b410a60e0c04de", - "placeholder": "​", - "style": "IPY_MODEL_a153a09450f8403998482ec3cf5e5424", - "value": "special_tokens_map.json: 100%" - } - }, - "06bdcebdd8634c08be2a5c7edba7f20f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2209890ded0b42b2b11a53aaadc1dfc3", - "max": 150, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ecf37d50caa6458fbaf4dc9300961c83", - "value": 150 - } - }, - "5e85cd1b82374b5b9c7124a99c13784e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_babd560899554c3ab5eb12bbab99938b", - "placeholder": "​", - "style": "IPY_MODEL_26aded6abf1242e8910a8051ee80f609", - "value": " 150/150 [00:00<00:00, 7.91kB/s]" - } - }, - "6615c07fdecb4049b19f7a2178c3879d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "db3f53eaa65c44f987b410a60e0c04de": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a153a09450f8403998482ec3cf5e5424": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "2209890ded0b42b2b11a53aaadc1dfc3": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ecf37d50caa6458fbaf4dc9300961c83": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "babd560899554c3ab5eb12bbab99938b": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "26aded6abf1242e8910a8051ee80f609": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } + "fb4a285fffd9454da534302c6fc17e7d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ffc6bd7026344b7aad522b9a2337bb6f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb index 0cc16cf9245d9b..d601203cc11078 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb @@ -1,2144 +1,2006 @@ { - "cells": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import ONNX XlmRoBertaForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `XlmRoBertaForTokenClassification` is only available since in `Spark NLP 5.2.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import XLM-RoBERTa models trained/fine-tuned for token classification via `XlmRoBertaForTokenClassification` or `TFXlmRoBertaForTokenClassification`. These models are usually under `Token Classification` category and have `roberta` in their labels\n", + "- Reference: [TFXlmRoBertaForTokenClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForTokenClassification)\n", + "- Some [example models](https://huggingface.co/models?filter=xlm-roberta&pipeline_tag=token-classification)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases\n", + "- Albert uses SentencePiece, so we will have to install that as well" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "PT2s_38mqpqS" - }, - "source": [ - "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb)" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m19.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m65.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m38.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m65.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m70.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m97.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m111.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m102.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m100.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m102.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m109.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m96.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m100.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m81.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m44.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m100.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m67.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m35.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m64.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m67.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m84.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m104.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m45.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m81.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m45.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m76.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m62.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m98.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m102.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m86.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m105.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m43.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m98.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m43.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m42.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m78.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m83.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m79.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m93.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m28.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [xlm-roberta-large-finetuned-conll03-english](https://huggingface.co/xlm-roberta-large-finetuned-conll03-english) model from HuggingFace as an example and load it as a `ORTModelForTokenClassification`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "iss2RqRIqpqV" - }, - "source": [ - "## Import ONNX XlmRoBertaForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", - "\n", - "Let's keep in mind a few things before we start 😊\n", - "\n", - "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", - "- `XlmRoBertaForTokenClassification` is only available since in `Spark NLP 5.2.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", - "- You can import XLM-RoBERTa models trained/fine-tuned for token classification via `XlmRoBertaForTokenClassification` or `TFXlmRoBertaForTokenClassification`. These models are usually under `Token Classification` category and have `roberta` in their labels\n", - "- Reference: [TFXlmRoBertaForTokenClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForTokenClassification)\n", - "- Some [example models](https://huggingface.co/models?filter=xlm-roberta&pipeline_tag=token-classification)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "yhZZmLjgqpqX" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1544c9cc98bc469b98ae804569204420", + "version_major": 2, + "version_minor": 0 }, - "source": [ - "## Export and Save HuggingFace model" + "text/plain": [ + "config.json: 0%| | 0.00/852 [00:00=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", - "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + "text/plain": [ + "model.safetensors: 0%| | 0.00/2.24G [00:00 False\n", - "Saving external data to one file...\n" - ] - } - ], - "source": [ - "from optimum.onnxruntime import ORTModelForTokenClassification\n", - "import tensorflow as tf\n", - "\n", - "MODEL_NAME = 'xlm-roberta-large-finetuned-conll03-english'\n", - "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", - "\n", - "ort_model = ORTModelForTokenClassification.from_pretrained(MODEL_NAME, export=True)\n", - "\n", - "# Save the ONNX model\n", - "ort_model.save_pretrained(ONNX_MODEL)" + "text/plain": [ + "tokenizer.json: 0%| | 0.00/9.10M [00:00 False\n", + "Saving external data to one file...\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForTokenClassification\n", + "import tensorflow as tf\n", + "\n", + "MODEL_NAME = 'xlm-roberta-large-finetuned-conll03-english'\n", + "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForTokenClassification.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(ONNX_MODEL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside the directory and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "fp6t5TETqpqZ", - "outputId": "3eab09a6-51c3-48f8-d8e5-74e76fe22585", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "total 2205260\n", - "-rw-r--r-- 1 root root 1046 Jan 9 19:26 config.json\n", - "-rw-r--r-- 1 root root 617783 Jan 9 19:26 model.onnx\n", - "-rw-r--r-- 1 root root 2235396096 Jan 9 19:26 model.onnx_data\n", - "-rw-r--r-- 1 root root 5069051 Jan 9 19:26 sentencepiece.bpe.model\n", - "-rw-r--r-- 1 root root 280 Jan 9 19:26 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 418 Jan 9 19:26 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 17082660 Jan 9 19:26 tokenizer.json\n" - ] - } - ], - "source": [ - "!ls -l {ONNX_MODEL}" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 2205260\n", + "-rw-r--r-- 1 root root 1046 Jan 9 19:26 config.json\n", + "-rw-r--r-- 1 root root 617783 Jan 9 19:26 model.onnx\n", + "-rw-r--r-- 1 root root 2235396096 Jan 9 19:26 model.onnx_data\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:26 sentencepiece.bpe.model\n", + "-rw-r--r-- 1 root root 280 Jan 9 19:26 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 418 Jan 9 19:26 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082660 Jan 9 19:26 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {ONNX_MODEL}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for\n", + "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get label2id dictionary\n", + "labels = ort_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila! We have our `sentencepiece.bpe.model` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "u5kdjGpdqpqZ" - }, - "outputs": [], - "source": [ - "!mkdir {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "onnx_models/xlm-roberta-large-finetuned-conll03-english:\n", + "total 2200312\n", + "drwxr-xr-x 2 root root 4096 Jan 9 19:26 assets\n", + "-rw-r--r-- 1 root root 1046 Jan 9 19:26 config.json\n", + "-rw-r--r-- 1 root root 617783 Jan 9 19:26 model.onnx\n", + "-rw-r--r-- 1 root root 2235396096 Jan 9 19:26 model.onnx_data\n", + "-rw-r--r-- 1 root root 280 Jan 9 19:26 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 418 Jan 9 19:26 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082660 Jan 9 19:26 tokenizer.json\n", + "\n", + "onnx_models/xlm-roberta-large-finetuned-conll03-english/assets:\n", + "total 4956\n", + "-rw-r--r-- 1 root root 45 Jan 9 19:26 labels.txt\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:26 sentencepiece.bpe.model\n" + ] + } + ], + "source": [ + "!ls -lR {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save XlmRoBertaForTokenClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "q53KnN90qpqZ" - }, - "source": [ - "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for\n", - "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m26.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "NYYI6xnTqpqa" - }, - "outputs": [], - "source": [ - "# get label2id dictionary\n", - "labels = ort_model.config.id2label\n", - "# sort the dictionary based on the id\n", - "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", - "\n", - "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", - " f.write('\\n'.join(labels))" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `RoBertaForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `RoBertaForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "tokenClassifier = XlmRoBertaForTokenClassification\\\n", + " .loadSavedModel(ONNX_MODEL, spark)\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"ner\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your XlmRoBertaForTokenClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "hx6uf2PPqpqa" - }, - "outputs": [], - "source": [ - "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "total 318696\n", + "drwxr-xr-x 5 root root 4096 Oct 16 22:21 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 16 22:21 metadata\n", + "-rw-r--r-- 1 root root 326328924 Oct 16 22:21 roberta_classification_onnx\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny XlmRoBertaForTokenClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenClassifier_loaded = XlmRoBertaForTokenClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"ner\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "idrz2RCWqpqa" - }, - "source": [ - "Voila! We have our `sentencepiece.bpe.model` inside assets directory" + "data": { + "text/plain": [ + "['B-LOC', 'I-ORG', 'I-LOC', 'I-PER', 'B-ORG', 'O', 'B-PER']" ] - }, + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "tokenClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "T5YSOXhLqpqa", - "outputId": "028c70df-7f80-4bf6-9779-e8b26ee574aa", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "onnx_models/xlm-roberta-large-finetuned-conll03-english:\n", - "total 2200312\n", - "drwxr-xr-x 2 root root 4096 Jan 9 19:26 assets\n", - "-rw-r--r-- 1 root root 1046 Jan 9 19:26 config.json\n", - "-rw-r--r-- 1 root root 617783 Jan 9 19:26 model.onnx\n", - "-rw-r--r-- 1 root root 2235396096 Jan 9 19:26 model.onnx_data\n", - "-rw-r--r-- 1 root root 280 Jan 9 19:26 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 418 Jan 9 19:26 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 17082660 Jan 9 19:26 tokenizer.json\n", - "\n", - "onnx_models/xlm-roberta-large-finetuned-conll03-english/assets:\n", - "total 4956\n", - "-rw-r--r-- 1 root root 45 Jan 9 19:26 labels.txt\n", - "-rw-r--r-- 1 root root 5069051 Jan 9 19:26 sentencepiece.bpe.model\n" - ] - } + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| text| result|\n", + "+--------------------+--------------------+\n", + "|My name is Clara ...|[O, O, O, B-PER, ...|\n", + "|My name is Clara ...|[O, O, O, B-PER, ...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " tokenClassifier_loaded\n", + "])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([[\"My name is Clara and I live in Berkeley, California.\"], ['My name is Clara and I live in Berkeley, California.']]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"ner.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of `RoBertaForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0bd10f7cb5244da29d0a7da73ae52335": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_825857db473849d2bb498ffb5fcfb962", + "IPY_MODEL_a88c300d3a81439fb3da9d46a023dc47", + "IPY_MODEL_636fccb3b002475a90c888f987b36400" ], - "source": [ - "!ls -lR {ONNX_MODEL}" - ] + "layout": "IPY_MODEL_89c7d83dc8e640cbb93ccfe2bb3030f0" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "yC4lfOb_qpqb" - }, - "source": [ - "## Import and Save RoBertaForTokenClassification in Spark NLP\n" - ] + "0faedd1c4d4148fa965bcec52325bd08": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "_T57R-wBqpqb" - }, - "source": [ - "- Let's install and setup Spark NLP in Google Colab\n", - "- This part is pretty easy via our simple script" - ] + "106c462bc57243018162577b103db007": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vqzkT2Tbqpqb", - "outputId": "3d1b295e-e6e9-409c-f73f-e778352aa7ff" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", - "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m26.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } + "119f12a1e8204bf7b7bbf1b4d7cca247": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1544c9cc98bc469b98ae804569204420": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_94da2b3a4e7743a2b7dd362292c5496f", + "IPY_MODEL_6494c0f839ff4418a93df6e88c012d07", + "IPY_MODEL_df8afa14db524e2992b832b30bc0f692" ], - "source": [ - "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" - ] + "layout": "IPY_MODEL_78b0860d0ed643d785ef00633d9e17e8" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "e_C8Rt6Iqpqb" - }, - "source": [ - "Let's start Spark with Spark NLP included via our simple `start()` function" - ] + "1826afdb3fd94748941c71d8621682e3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "44kpKSG-qpqb", - "outputId": "d556353a-cd63-4e2a-a5e6-fcfbfa72fa57" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apache Spark version: 3.2.3\n" - ] - } - ], - "source": [ - "import sparknlp\n", - "# let's start Spark with Spark NLP\n", - "spark = sparknlp.start()\n", - "\n", - "print(\"Apache Spark version: {}\".format(spark.version))" - ] + "21ca8729098a4bd498b29de51a92e8bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "IHPVcE9nqpqc" - }, - "source": [ - "- Let's use `loadSavedModel` functon in `RoBertaForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", - "- Most params can be set later when you are loading this model in `RoBertaForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", - "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", - "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." - ] + "238132625e604ddf85bdbf4931889d51": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c2ab7313174c4231825be28c4a3181b8", + "placeholder": "​", + "style": "IPY_MODEL_45123f4cdc0a4aa8ac90aa29d357240e", + "value": " 5.07M/5.07M [00:00<00:00, 41.3MB/s]" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-_OWtRBHqpqc" - }, - "outputs": [], - "source": [ - "from sparknlp.annotator import *\n", - "from sparknlp.base import *\n", - "\n", - "tokenClassifier = RoBertaForTokenClassification\\\n", - " .loadSavedModel(ONNX_MODEL, spark)\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"ner\")\\\n", - " .setCaseSensitive(True)\\\n", - " .setMaxSentenceLength(128)" - ] + "2510ee2600af466a851566f4634e7fe9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "cgoFul55qpqc" - }, - "source": [ - "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" - ] + "349c635d1e8c47d6ab1d0fe3819dd837": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EzK8inoxqpqc" - }, - "outputs": [], - "source": [ - "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" - ] + "44e1f77b21e04d32ad83a405ab62ca38": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "0VwXvPlbqpqc" - }, - "source": [ - "Let's clean up stuff we don't need anymore" - ] + "45123f4cdc0a4aa8ac90aa29d357240e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Ce2ZAEtqpqc" - }, - "outputs": [], - "source": [ - "!rm -rf {ONNX_MODEL}" - ] + "451d9aec16b8417fb3b9565e5a73cb52": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "Z4QXBzVsqpqd" - }, - "source": [ - "Awesome 😎 !\n", - "\n", - "This is your RoBertaForTokenClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" - ] + "547e218edc3d47cab11d876641955409": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRKVUu9tqpqd", - "outputId": "2278d83a-63be-4e1a-b574-29c963b4b7a1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 318696\n", - "drwxr-xr-x 5 root root 4096 Oct 16 22:21 fields\n", - "drwxr-xr-x 2 root root 4096 Oct 16 22:21 metadata\n", - "-rw-r--r-- 1 root root 326328924 Oct 16 22:21 roberta_classification_onnx\n" - ] - } - ], - "source": [ - "! ls -l {ONNX_MODEL}_spark_nlp_onnx" - ] + "59a857c998734b0a95af2b96252aa130": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "qXl-kXeLqpqd" - }, - "source": [ - "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForTokenClassification model 😊" - ] + "61335821e4c94fcba501cb1c94541d07": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QTrtB8u7qpqd" - }, - "outputs": [], - "source": [ - "tokenClassifier_loaded = RoBertaForTokenClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", - " .setInputCols([\"document\",'token'])\\\n", - " .setOutputCol(\"ner\")" - ] + "636fccb3b002475a90c888f987b36400": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1826afdb3fd94748941c71d8621682e3", + "placeholder": "​", + "style": "IPY_MODEL_21ca8729098a4bd498b29de51a92e8bd", + "value": " 9.10M/9.10M [00:00<00:00, 25.6MB/s]" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "UDHQves4qpqd" - }, - "source": [ - "You can see what labels were used to train this model via `getClasses` function:" - ] + "6494c0f839ff4418a93df6e88c012d07": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_547e218edc3d47cab11d876641955409", + "max": 852, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_86abb923170a4927a99c0289540ecdf4", + "value": 852 + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "01Aw5e47qpqe", - "outputId": "69cfded4-763c-41a3-f7ea-4ef56a744741" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['B-LOC', 'I-ORG', 'I-LOC', 'I-PER', 'B-ORG', 'O', 'B-PER']" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# .getClasses was introduced in spark-nlp==3.4.0\n", - "tokenClassifier_loaded.getClasses()" - ] + "68e4ba7bf6c5483abcae494fcdd46c6a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "F7Kbxqvxqpqe" - }, - "source": [ - "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" - ] + "7277b2423de14e3aada27e5191f096e5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2f0E2Gvxqpqe", - "outputId": "fc05a614-cc89-417b-fad8-1290b34905e0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+--------------------+\n", - "| text| result|\n", - "+--------------------+--------------------+\n", - "|My name is Clara ...|[O, O, O, B-PER, ...|\n", - "|My name is Clara ...|[O, O, O, B-PER, ...|\n", - "+--------------------+--------------------+\n", - "\n" - ] - } - ], - "source": [ - "document_assembler = DocumentAssembler() \\\n", - " .setInputCol('text') \\\n", - " .setOutputCol('document')\n", - "\n", - "tokenizer = Tokenizer() \\\n", - " .setInputCols(['document']) \\\n", - " .setOutputCol('token')\n", - "\n", - "pipeline = Pipeline(stages=[\n", - " document_assembler,\n", - " tokenizer,\n", - " tokenClassifier_loaded\n", - "])\n", - "\n", - "# couple of simple examples\n", - "example = spark.createDataFrame([[\"My name is Clara and I live in Berkeley, California.\"], ['My name is Clara and I live in Berkeley, California.']]).toDF(\"text\")\n", - "\n", - "result = pipeline.fit(example).transform(example)\n", - "\n", - "# result is a DataFrame\n", - "result.select(\"text\", \"ner.result\").show()" - ] + "7538f72687754318ac6657cd98f1f5ae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e0e53ea997404498850c7dff6f80a5fb", + "placeholder": "​", + "style": "IPY_MODEL_68e4ba7bf6c5483abcae494fcdd46c6a", + "value": "model.safetensors: 100%" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "WOGLNugSqpqe" - }, - "source": [ - "That's it! You can now go wild and use hundreds of `RoBertaForTokenClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] + "78b0860d0ed643d785ef00633d9e17e8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "825857db473849d2bb498ffb5fcfb962": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_87671904fcdc4af993d4ba61f3a5f9e9", + "placeholder": "​", + "style": "IPY_MODEL_2510ee2600af466a851566f4634e7fe9", + "value": "tokenizer.json: 100%" + } }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "86abb923170a4927a99c0289540ecdf4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "87671904fcdc4af993d4ba61f3a5f9e9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "883c34b77a4c4558b74d5dde797e22ab": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "1544c9cc98bc469b98ae804569204420": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_94da2b3a4e7743a2b7dd362292c5496f", - "IPY_MODEL_6494c0f839ff4418a93df6e88c012d07", - "IPY_MODEL_df8afa14db524e2992b832b30bc0f692" - ], - "layout": "IPY_MODEL_78b0860d0ed643d785ef00633d9e17e8" - } - }, - "94da2b3a4e7743a2b7dd362292c5496f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_451d9aec16b8417fb3b9565e5a73cb52", - "placeholder": "​", - "style": "IPY_MODEL_e7df5ab266744d59b29e8c11106dcb65", - "value": "config.json: 100%" - } - }, - "6494c0f839ff4418a93df6e88c012d07": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_547e218edc3d47cab11d876641955409", - "max": 852, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_86abb923170a4927a99c0289540ecdf4", - "value": 852 - } - }, - "df8afa14db524e2992b832b30bc0f692": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_dff16b39b66a4422a407096064fa182e", - "placeholder": "​", - "style": "IPY_MODEL_119f12a1e8204bf7b7bbf1b4d7cca247", - "value": " 852/852 [00:00<00:00, 13.4kB/s]" - } - }, - "78b0860d0ed643d785ef00633d9e17e8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "451d9aec16b8417fb3b9565e5a73cb52": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e7df5ab266744d59b29e8c11106dcb65": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "547e218edc3d47cab11d876641955409": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "86abb923170a4927a99c0289540ecdf4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "dff16b39b66a4422a407096064fa182e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "119f12a1e8204bf7b7bbf1b4d7cca247": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e53d77092ea646b5bff9e5c4051f0709": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_7538f72687754318ac6657cd98f1f5ae", - "IPY_MODEL_b80a55635ff342eaa98451556f4908d3", - "IPY_MODEL_b3ee928046b94c9194b5b5e30c61becb" - ], - "layout": "IPY_MODEL_a2f65e14834a47e69687d32ee896d31d" - } - }, - "7538f72687754318ac6657cd98f1f5ae": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e0e53ea997404498850c7dff6f80a5fb", - "placeholder": "​", - "style": "IPY_MODEL_68e4ba7bf6c5483abcae494fcdd46c6a", - "value": "model.safetensors: 100%" - } - }, - "b80a55635ff342eaa98451556f4908d3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_349c635d1e8c47d6ab1d0fe3819dd837", - "max": 2239643256, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_eded9c7da1eb4f6c9e713c8b23b4327a", - "value": 2239643256 - } - }, - "b3ee928046b94c9194b5b5e30c61becb": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_59a857c998734b0a95af2b96252aa130", - "placeholder": "​", - "style": "IPY_MODEL_ace71ecb2fae4e4196f5c75b86a522f8", - "value": " 2.24G/2.24G [00:17<00:00, 173MB/s]" - } - }, - "a2f65e14834a47e69687d32ee896d31d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e0e53ea997404498850c7dff6f80a5fb": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "68e4ba7bf6c5483abcae494fcdd46c6a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "349c635d1e8c47d6ab1d0fe3819dd837": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "eded9c7da1eb4f6c9e713c8b23b4327a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "59a857c998734b0a95af2b96252aa130": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ace71ecb2fae4e4196f5c75b86a522f8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ee4965a2b5ad435c8c82b377006aa73e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_9c3ec5377e884b3faebd24fb815b6a85", - "IPY_MODEL_9ca46de0beaa4c5d9ec526820d7aa94f", - "IPY_MODEL_238132625e604ddf85bdbf4931889d51" - ], - "layout": "IPY_MODEL_61335821e4c94fcba501cb1c94541d07" - } - }, - "9c3ec5377e884b3faebd24fb815b6a85": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_883c34b77a4c4558b74d5dde797e22ab", - "placeholder": "​", - "style": "IPY_MODEL_a85eea6afaf0480eac17817e4844539a", - "value": "sentencepiece.bpe.model: 100%" - } - }, - "9ca46de0beaa4c5d9ec526820d7aa94f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7277b2423de14e3aada27e5191f096e5", - "max": 5069051, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_0faedd1c4d4148fa965bcec52325bd08", - "value": 5069051 - } - }, - "238132625e604ddf85bdbf4931889d51": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c2ab7313174c4231825be28c4a3181b8", - "placeholder": "​", - "style": "IPY_MODEL_45123f4cdc0a4aa8ac90aa29d357240e", - "value": " 5.07M/5.07M [00:00<00:00, 41.3MB/s]" - } - }, - "61335821e4c94fcba501cb1c94541d07": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "883c34b77a4c4558b74d5dde797e22ab": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a85eea6afaf0480eac17817e4844539a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7277b2423de14e3aada27e5191f096e5": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0faedd1c4d4148fa965bcec52325bd08": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "c2ab7313174c4231825be28c4a3181b8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "45123f4cdc0a4aa8ac90aa29d357240e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0bd10f7cb5244da29d0a7da73ae52335": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_825857db473849d2bb498ffb5fcfb962", - "IPY_MODEL_a88c300d3a81439fb3da9d46a023dc47", - "IPY_MODEL_636fccb3b002475a90c888f987b36400" - ], - "layout": "IPY_MODEL_89c7d83dc8e640cbb93ccfe2bb3030f0" - } - }, - "825857db473849d2bb498ffb5fcfb962": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_87671904fcdc4af993d4ba61f3a5f9e9", - "placeholder": "​", - "style": "IPY_MODEL_2510ee2600af466a851566f4634e7fe9", - "value": "tokenizer.json: 100%" - } - }, - "a88c300d3a81439fb3da9d46a023dc47": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_44e1f77b21e04d32ad83a405ab62ca38", - "max": 9096718, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_106c462bc57243018162577b103db007", - "value": 9096718 - } - }, - "636fccb3b002475a90c888f987b36400": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1826afdb3fd94748941c71d8621682e3", - "placeholder": "​", - "style": "IPY_MODEL_21ca8729098a4bd498b29de51a92e8bd", - "value": " 9.10M/9.10M [00:00<00:00, 25.6MB/s]" - } - }, - "89c7d83dc8e640cbb93ccfe2bb3030f0": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "87671904fcdc4af993d4ba61f3a5f9e9": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2510ee2600af466a851566f4634e7fe9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "44e1f77b21e04d32ad83a405ab62ca38": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "106c462bc57243018162577b103db007": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "1826afdb3fd94748941c71d8621682e3": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "21ca8729098a4bd498b29de51a92e8bd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } + "89c7d83dc8e640cbb93ccfe2bb3030f0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "94da2b3a4e7743a2b7dd362292c5496f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_451d9aec16b8417fb3b9565e5a73cb52", + "placeholder": "​", + "style": "IPY_MODEL_e7df5ab266744d59b29e8c11106dcb65", + "value": "config.json: 100%" + } + }, + "9c3ec5377e884b3faebd24fb815b6a85": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_883c34b77a4c4558b74d5dde797e22ab", + "placeholder": "​", + "style": "IPY_MODEL_a85eea6afaf0480eac17817e4844539a", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "9ca46de0beaa4c5d9ec526820d7aa94f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7277b2423de14e3aada27e5191f096e5", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0faedd1c4d4148fa965bcec52325bd08", + "value": 5069051 + } + }, + "a2f65e14834a47e69687d32ee896d31d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a85eea6afaf0480eac17817e4844539a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a88c300d3a81439fb3da9d46a023dc47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_44e1f77b21e04d32ad83a405ab62ca38", + "max": 9096718, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_106c462bc57243018162577b103db007", + "value": 9096718 + } + }, + "ace71ecb2fae4e4196f5c75b86a522f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b3ee928046b94c9194b5b5e30c61becb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_59a857c998734b0a95af2b96252aa130", + "placeholder": "​", + "style": "IPY_MODEL_ace71ecb2fae4e4196f5c75b86a522f8", + "value": " 2.24G/2.24G [00:17<00:00, 173MB/s]" + } + }, + "b80a55635ff342eaa98451556f4908d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_349c635d1e8c47d6ab1d0fe3819dd837", + "max": 2239643256, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eded9c7da1eb4f6c9e713c8b23b4327a", + "value": 2239643256 + } + }, + "c2ab7313174c4231825be28c4a3181b8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "df8afa14db524e2992b832b30bc0f692": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dff16b39b66a4422a407096064fa182e", + "placeholder": "​", + "style": "IPY_MODEL_119f12a1e8204bf7b7bbf1b4d7cca247", + "value": " 852/852 [00:00<00:00, 13.4kB/s]" + } + }, + "dff16b39b66a4422a407096064fa182e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e0e53ea997404498850c7dff6f80a5fb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e53d77092ea646b5bff9e5c4051f0709": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7538f72687754318ac6657cd98f1f5ae", + "IPY_MODEL_b80a55635ff342eaa98451556f4908d3", + "IPY_MODEL_b3ee928046b94c9194b5b5e30c61becb" + ], + "layout": "IPY_MODEL_a2f65e14834a47e69687d32ee896d31d" + } + }, + "e7df5ab266744d59b29e8c11106dcb65": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "eded9c7da1eb4f6c9e713c8b23b4327a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ee4965a2b5ad435c8c82b377006aa73e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9c3ec5377e884b3faebd24fb815b6a85", + "IPY_MODEL_9ca46de0beaa4c5d9ec526820d7aa94f", + "IPY_MODEL_238132625e604ddf85bdbf4931889d51" + ], + "layout": "IPY_MODEL_61335821e4c94fcba501cb1c94541d07" + } } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaSentenceEmbeddings.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaSentenceEmbeddings.ipynb index d27928d67b9441..4cff73dd823aa2 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaSentenceEmbeddings.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaSentenceEmbeddings.ipynb @@ -421,11 +421,10 @@ "gpuType": "T4", "provenance": [] }, - "kernelspec": { - "display_name": "Python [conda env:sparknlp_dev]", - "language": "python", - "name": "conda-env-sparknlp_dev-py" - }, + "kernelspec": ,{ + "display_name": "Python 3", + "name": "python3" + } "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/examples/python/transformers/onnx/ONNX_Configs_in_Spark_NLP_AlbertForQuestionAnswering.ipynb b/examples/python/transformers/onnx/ONNX_Configs_in_Spark_NLP_AlbertForQuestionAnswering.ipynb index b8134b3d7703b5..d3a6a9e0e8759e 100644 --- a/examples/python/transformers/onnx/ONNX_Configs_in_Spark_NLP_AlbertForQuestionAnswering.ipynb +++ b/examples/python/transformers/onnx/ONNX_Configs_in_Spark_NLP_AlbertForQuestionAnswering.ipynb @@ -93,7 +93,7 @@ "metadata": {}, "source": [ "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", - "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- We lock `transformers` on version `4.34.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", "- Albert uses SentencePiece, so we will have to install that as well" ] }, @@ -162,7 +162,7 @@ } ], "source": [ - "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece tensorflow" + "!pip install -q --upgrade transformers[onnx]==4.34.1 optimum sentencepiece tensorflow" ] }, { diff --git a/python/sparknlp/annotator/embeddings/bge_embeddings.py b/python/sparknlp/annotator/embeddings/bge_embeddings.py index fcb9e7a1b2b9ed..ea2d2a72aedbdf 100644 --- a/python/sparknlp/annotator/embeddings/bge_embeddings.py +++ b/python/sparknlp/annotator/embeddings/bge_embeddings.py @@ -26,6 +26,8 @@ class BGEEmbeddings(AnnotatorModel, BGE, or BAAI General Embeddings, a model that can map any text to a low-dimensional dense vector which can be used for tasks like retrieval, classification, clustering, or semantic search. + + Note that this annotator is only supported for Spark Versions 3.4 and up. Pretrained models can be loaded with `pretrained` of the companion object: diff --git a/python/sparknlp/annotator/embeddings/e5_embeddings.py b/python/sparknlp/annotator/embeddings/e5_embeddings.py index ee372290b1a333..c06a708bb9f12e 100644 --- a/python/sparknlp/annotator/embeddings/e5_embeddings.py +++ b/python/sparknlp/annotator/embeddings/e5_embeddings.py @@ -25,6 +25,8 @@ class E5Embeddings(AnnotatorModel, """Sentence embeddings using E5. E5, a weakly supervised text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) + Note that this annotator is only supported for Spark Versions 3.4 and up. + Pretrained models can be loaded with :meth:`.pretrained` of the companion object: diff --git a/python/sparknlp/annotator/embeddings/mpnet_embeddings.py b/python/sparknlp/annotator/embeddings/mpnet_embeddings.py index a1393462d41b0d..0d72cf3c388319 100644 --- a/python/sparknlp/annotator/embeddings/mpnet_embeddings.py +++ b/python/sparknlp/annotator/embeddings/mpnet_embeddings.py @@ -28,6 +28,8 @@ class MPNetEmbeddings(AnnotatorModel, to inherit the advantages of masked language modeling and permuted language modeling for natural language understanding. + Note that this annotator is only supported for Spark Versions 3.4 and up. + Pretrained models can be loaded with :meth:`.pretrained` of the companion object: diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.scala index 139f9efd2dbf66..3f1428d27a8506 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.scala @@ -42,6 +42,8 @@ import org.slf4j.{Logger, LoggerFactory} * vector which can be used for tasks like retrieval, classification, clustering, or semantic * search. * + * Note that this annotator is only supported for Spark Versions 3.4 and up. + * * Pretrained models can be loaded with `pretrained` of the companion object: * {{{ * val embeddings = BGEEmbeddings.pretrained() @@ -117,7 +119,7 @@ import org.slf4j.{Logger, LoggerFactory} * | result| * +--------------------------------------------------------------------------------+ * |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...| - * [[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...| + * |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...| * +--------------------------------------------------------------------------------+ * }}} * diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala index 38ead9b55ac086..1e516d26199b28 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala @@ -41,6 +41,8 @@ import org.slf4j.{Logger, LoggerFactory} * E5, an instruction-finetuned text embedding model that can generate text embeddings tailored * to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) * + * Note that this annotator is only supported for Spark Versions 3.4 and up. + * * Pretrained models can be loaded with `pretrained` of the companion object: * {{{ * val embeddings = E5Embeddings.pretrained() diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddings.scala index 0f9b0288a14436..710dce6a25db5d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddings.scala @@ -43,6 +43,8 @@ import org.slf4j.{Logger, LoggerFactory} * pre-training method, named masked and permuted language modeling, to inherit the advantages of * masked language modeling and permuted language modeling for natural language understanding. * + * Note that this annotator is only supported for Spark Versions 3.4 and up. + * * Pretrained models can be loaded with `pretrained` of the companion object: * {{{ * val embeddings = MPNetEmbeddings.pretrained() From 66163235b73230b1d0a5da63d0a8e723ece872d7 Mon Sep 17 00:00:00 2001 From: Lev Date: Mon, 27 May 2024 17:06:00 +0300 Subject: [PATCH 2/3] Fixies (#14307) --- docs/en/auxiliary.md | 2 +- docs/en/install.md | 2 +- docs/en/transformers.md | 2 +- docs/index.md | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/auxiliary.md b/docs/en/auxiliary.md index 0ad000ca9e72a1..1c65c3621ca3cb 100644 --- a/docs/en/auxiliary.md +++ b/docs/en/auxiliary.md @@ -66,7 +66,7 @@ import com.johnsnowlabs.nlp.Annotation **Examples:** Complete usage examples can be seen here: -https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/234-release-candidate/jupyter/annotation/english/spark-nlp-basics/spark-nlp-basics-functions.ipynb +[https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/234-release-candidate/jupyter/annotation/english/spark-nlp-basics/spark-nlp-basics-functions.ipynb](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/234-release-candidate/jupyter/annotation/english/spark-nlp-basics/spark-nlp-basics-functions.ipynb)
diff --git a/docs/en/install.md b/docs/en/install.md index f405cab46030d8..dbd4ade00c0e26 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -760,7 +760,7 @@ Finally, use **jupyter_notebook_config.json** for the password: In order to fully take advantage of Spark NLP on Windows (8 or 10), you need to setup/install Apache Spark, Apache Hadoop, Java and a Pyton environment correctly by following the following instructions: [https://github.com/JohnSnowLabs/spark-nlp/discussions/1022](https://github.com/JohnSnowLabs/spark-nlp/discussions/1022) -
\ +
### How to correctly install Spark NLP on Windows diff --git a/docs/en/transformers.md b/docs/en/transformers.md index f73f644d3f3606..e17cf3de529afc 100644 --- a/docs/en/transformers.md +++ b/docs/en/transformers.md @@ -9,7 +9,7 @@ modify_date: "2023-06-18" use_language_switcher: "Python-Scala-Java" show_nav: true sidebar: -nav: sparknlp + nav: sparknlp --- diff --git a/docs/index.md b/docs/index.md index fcce82b00d9d01..154824a38a2398 100644 --- a/docs/index.md +++ b/docs/index.md @@ -314,7 +314,7 @@ data: - title: image: - src: https://upload.wikimedia.org/wikipedia/fr/thumb/8/8e/Centre_national_de_la_recherche_scientifique.svg/2048px-Centre_national_de_la_recherche_scientifique.svg.png + src: https://iscpif.fr/wp-content/uploads/2023/11/Logo-CNRS-ISCPIF.png url: https://iscpif.fr/ style: "padding: 30px;" is_row: true @@ -344,7 +344,7 @@ data: is_row: true - title: image: - src: https://upload.wikimedia.org/wikipedia/commons/thumb/f/f1/Columbia_University_shield.svg/1184px-Columbia_University_shield.svg.png + src: https://miro.medium.com/v2/resize:fit:1024/0*3qIWoFnZgVUtsXB-.png url: https://www.columbia.edu/ style: "padding: 25px;" is_row: true From a5e4a7ab17ca5e1a8f7563dc5d506a7f8fb4fa58 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Tue, 11 Jun 2024 18:33:39 +0500 Subject: [PATCH 3/3] adding fix for broken annotators --- .../scala/com/johnsnowlabs/ml/ai/Bert.scala | 24 ++++++++++----- .../com/johnsnowlabs/ml/ai/XlmRoberta.scala | 29 ++++++++++++------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bert.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bert.scala index 6de0eabd36ce1f..d218426dfaeed8 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bert.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bert.scala @@ -16,12 +16,12 @@ package com.johnsnowlabs.ml.ai -import ai.onnxruntime.OnnxTensor +import ai.onnxruntime.{OnnxTensor, TensorInfo} import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings import com.johnsnowlabs.ml.onnx.{OnnxSession, OnnxWrapper} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} -import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} +import com.johnsnowlabs.ml.util.{LinAlg, ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} import org.slf4j.{Logger, LoggerFactory} @@ -206,10 +206,13 @@ private[johnsnowlabs] class Bert( val tokenTensors = OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val attentionMask = batch + .map(sentence => sentence.map(x => if (x == 0) 0L else 1L)) + .toArray val maskTensors = OnnxTensor.createTensor( env, - batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + attentionMask) val segmentTensors = OnnxTensor.createTensor(env, batch.map(x => Array.fill(maxSentenceLength)(0L)).toArray) @@ -222,8 +225,11 @@ private[johnsnowlabs] class Bert( try { val results = runner.run(inputs) + val lastHiddenState = results.get("last_hidden_state").get() + val info = lastHiddenState.getInfo.asInstanceOf[TensorInfo] + val shape = info.getShape try { - val embeddings = results + val flattenEmbeddings = results .get("last_hidden_state") .get() .asInstanceOf[OnnxTensor] @@ -235,7 +241,9 @@ private[johnsnowlabs] class Bert( // runner.close() // env.close() // - embeddings + val embeddings = LinAlg.avgPooling(flattenEmbeddings, attentionMask, shape) + val normalizedEmbeddings = LinAlg.l2Normalize(embeddings) + LinAlg.denseMatrixToArray(normalizedEmbeddings) } finally if (results != null) results.close() } catch { case e: Exception => @@ -287,12 +295,12 @@ private[johnsnowlabs] class Bert( tensors.clearSession(outs) tensors.clearTensors() - embeddings + val dim = embeddings.length / batchLength + embeddings.grouped(dim).toArray } - val dim = embeddings.length / batchLength - embeddings.grouped(dim).toArray + embeddings } def tagSequenceSBert(batch: Seq[Array[Int]]): Array[Array[Float]] = { diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoberta.scala b/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoberta.scala index df7dbeb4a106a9..ed8055a0b72674 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoberta.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoberta.scala @@ -16,13 +16,13 @@ package com.johnsnowlabs.ml.ai -import ai.onnxruntime.OnnxTensor +import ai.onnxruntime.{OnnxTensor, TensorInfo} import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings import com.johnsnowlabs.ml.onnx.{OnnxSession, OnnxWrapper} import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} -import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} +import com.johnsnowlabs.ml.util.{LinAlg, ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} import org.slf4j.{Logger, LoggerFactory} @@ -213,22 +213,26 @@ private[johnsnowlabs] class XlmRoberta( val embeddings = detectedEngine match { case ONNX.name => val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) - + val attentionMask = batch + .map(sentence => sentence.map(x => if (x == SentencePadTokenId) 0L else 1L)) + .toArray val tokenTensors = OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) val maskTensors = OnnxTensor.createTensor( env, - batch - .map(sentence => sentence.map(x => if (x == SentencePadTokenId) 0L else 1L)) - .toArray) + attentionMask + ) val inputs = Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava val results = runner.run(inputs) + val lastHiddenState = results.get("last_hidden_state").get() + val info = lastHiddenState.getInfo.asInstanceOf[TensorInfo] + val shape = info.getShape try { - val embeddings = results + val flattenEmbeddings = results .get("last_hidden_state") .get() .asInstanceOf[OnnxTensor] @@ -236,8 +240,10 @@ private[johnsnowlabs] class XlmRoberta( .array() tokenTensors.close() maskTensors.close() - embeddings + val embeddings = LinAlg.avgPooling(flattenEmbeddings, attentionMask, shape) + val normalizedEmbeddings = LinAlg.l2Normalize(embeddings) + LinAlg.denseMatrixToArray(normalizedEmbeddings) } finally if (results != null) results.close() case TensorFlow.name => val tensors = new TensorResources() @@ -277,11 +283,12 @@ private[johnsnowlabs] class XlmRoberta( tensors.clearSession(outs) tensors.clearTensors() - embeddings + val dim = embeddings.length / batchLength + embeddings.grouped(dim).toArray } - val dim = embeddings.length / batchLength - embeddings.grouped(dim).toArray + embeddings + } def predict(