Add llama C++ example #926

Open · wants to merge 4 commits into `main`

6 changes: 6 additions & 0 deletions examples/c/CMakeLists.txt
@@ -6,6 +6,7 @@ set(CMAKE_CXX_STANDARD 20)
option(USE_CUDA "Build with CUDA support" OFF)
option(USE_CXX "Invoke the C++ example" ON)
option(PHI3 "Build the Phi example" OFF)
option(LLAMA "Build the Llama example" OFF)
Contributor:

Could we have something like the following?

Suggested change
option(LLAMA "Build the Llama example" OFF)
option(LLM "Build the large-language model example" OFF)
option(VLM "Build the vision-language model example" OFF)
option(ALM "Build the audio-language model example" OFF)

The abbreviations and names can be different, but the idea would be to create examples grouped by input and output modality.

  • The LLM could be for decoder-only models (e.g. LLaMA, Phi)
    • Inputs: text
    • Outputs: text
  • The VLM could be for vision-text models (e.g. Phi-3/Phi-3.5 vision)
    • Inputs: images, text
    • Outputs: text
  • The ALM could be for audio-text models (e.g. Whisper)
    • Inputs: audios, text
    • Outputs: text

Contributor (author):

I think this is a great proposal for a new PR!

option(PHI3V "Build the Phi3v example" OFF)
option(WHISPER "Build the Whisper example" OFF)

@@ -58,6 +59,11 @@ if(PHI3)
prepare_executable(phi3)
endif()

if(LLAMA)
add_executable(llama ${CMAKE_SOURCE_DIR}/src/llama.cpp)
prepare_executable(llama)
endif()

if(PHI3V)
add_executable(phi3v ${CMAKE_SOURCE_DIR}/src/phi3v.cpp)
prepare_executable(phi3v)
131 changes: 120 additions & 11 deletions examples/c/README.md
@@ -221,43 +221,152 @@ Change into the onnxruntime-genai directory.

2. Install onnxruntime-genai

```bash
curl -L https://github.com/microsoft/onnxruntime-genai/releases/download/v0.4.0/onnxruntime-genai-linux-cpu-x64-capi.zip -o onnxruntime-genai-linux-cpu-x64-capi.zip
unzip onnxruntime-genai-linux-cpu-x64-capi.zip
cd onnxruntime-genai-linux-cpu-x64-capi
tar xvf onnxruntime-genai-0.4.0-linux-x64.tar.gz
cp onnxruntime-genai-0.4.0-linux-x64/include/* ../include
cp onnxruntime-genai-0.4.0-linux-x64/lib/* ../lib
cd ..
```

#### Build this sample

Build with CUDA:

```bash
cmake . -B build -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=80 -DUSE_CUDA=ON -DPHI3=ON
cd build
cmake --build . --config Release
```

Build for CPU:

```bash
cmake . -B build -DPHI3=ON
cd build
cmake --build . --config Release
```

#### Run the sample

```bash
./phi3 path_to_model
```

## Llama

### Obtain model

To access the Llama models, you need to sign the license agreement on HuggingFace. Navigate to the model page, e.g. https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct, and accept the terms.

Once you have been granted access, run the following steps to generate the ONNX model for the precision and target that you want to run on. Note: this operation requires 64GB of RAM to complete.

```bash
pip install torch transformers onnx onnxruntime onnxruntime-genai huggingface-hub[cli]
huggingface-cli login
python -m onnxruntime_genai.models.builder -m meta-llama/Llama-3.1-8B-Instruct -e cpu -p int4 -o llama-3.1-8b-instruct-onnx
```

The model and all of the necessary metadata will be available in a folder called `llama-3.1-8b-instruct-onnx`.
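
This folder is what gets passed as the model path when running the samples below. As a quick check that the export is loadable, here is a minimal sketch using the same C++ API as this PR's `llama.cpp` (folder name taken from the builder command above):

```cpp
#include <iostream>
#include "ort_genai.h"

int main() {
  OgaHandle handle;  // cleans up the library during shutdown, as in the sample
  // The builder output folder (model, configuration and tokenizer files) is
  // used directly as the model path.
  auto model = OgaModel::Create("llama-3.1-8b-instruct-onnx");
  auto tokenizer = OgaTokenizer::Create(*model);
  std::cout << "Model and tokenizer loaded" << std::endl;
  return 0;
}
```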

### Windows x64 CPU

#### Install the onnxruntime and onnxruntime-genai binaries

Change into the `onnxruntime-genai\examples\c` folder.

1. Install onnxruntime

```cmd
curl -L https://github.com/microsoft/onnxruntime/releases/download/v1.19.2/onnxruntime-win-x64-1.19.2.zip -o onnxruntime-win-x64-1.19.2.zip
tar xvf onnxruntime-win-x64-1.19.2.zip
copy onnxruntime-win-x64-1.19.2\include\* include
copy onnxruntime-win-x64-1.19.2\lib\* lib
```

2. Install onnxruntime-genai

```cmd
curl -L https://github.com/microsoft/onnxruntime-genai/releases/download/v0.4.0/onnxruntime-genai-win-cpu-x64-capi.zip -o onnxruntime-genai-win-cpu-x64-capi.zip
tar xvf onnxruntime-genai-win-cpu-x64-capi.zip
cd onnxruntime-genai-win-cpu-x64-capi
tar xvf onnxruntime-genai-0.4.0-win-x64.zip
copy onnxruntime-genai-0.4.0-win-x64\include\* ..\include
copy onnxruntime-genai-0.4.0-win-x64\lib\* ..\lib
cd ..
```

#### Build this sample

```cmd
cmake -A x64 -S . -B build -DLLAMA=ON
cd build
cmake --build . --config Release
```

#### Run the sample

```cmd
cd Release
.\llama.exe llama-3.1-8b-instruct-onnx
```

### Linux

#### Install the onnxruntime and onnxruntime-genai binaries

Change into the `onnxruntime-genai/examples/c` folder.

1. Install onnxruntime

```bash
curl -L https://github.com/microsoft/onnxruntime/releases/download/v1.19.2/onnxruntime-linux-x64-1.19.2.tgz -o onnxruntime-linux-x64-1.19.2.tgz
tar xvzf onnxruntime-linux-x64-1.19.2.tgz
cp onnxruntime-linux-x64-1.19.2/include/* include
cp onnxruntime-linux-x64-1.19.2/lib/* lib
```

2. Install onnxruntime-genai

```bash
curl -L https://github.com/microsoft/onnxruntime-genai/releases/download/v0.4.0/onnxruntime-genai-linux-cpu-x64-capi.zip -o onnxruntime-genai-linux-cpu-x64-capi.zip
unzip onnxruntime-genai-linux-cpu-x64-capi.zip
cd onnxruntime-genai-linux-cpu-x64-capi
tar xvf onnxruntime-genai-0.4.0-linux-x64.tar.gz
cp onnxruntime-genai-0.4.0-linux-x64/include/* ../include
cp onnxruntime-genai-0.4.0-linux-x64/lib/* ../lib
cd ..
```

#### Build this sample

Build with CUDA:

```bash
mkdir build
cd build
cmake .. -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=80 -DUSE_CUDA=ON -DLLAMA=ON
cmake --build . --config Release
```

Build for CPU:

```bash
cmake . -B build -DLLAMA=ON
cd build
cmake --build . --config Release
```

#### Run the sample

```bash
./llama path_to_model
```


## Phi-3 vision

### Download model
Expand Down
Empty file removed: examples/c/include/.gitkeep
Empty file removed: examples/c/lib/.gitkeep
86 changes: 86 additions & 0 deletions examples/c/src/llama.cpp
@@ -0,0 +1,86 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <iostream>
#include <string>
#include "ort_genai.h"

// C++ API Example

void CXX_API(const char* model_path) {
  std::cout << "Creating model..." << std::endl;
  auto model = OgaModel::Create(model_path);
  std::cout << "Creating tokenizer..." << std::endl;
  auto tokenizer = OgaTokenizer::Create(*model);
  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);

  while (true) {
    std::string text;
    std::cout << "Prompt: " << std::endl;
    std::getline(std::cin, text);

    const std::string prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful AI assistant. Give a short answer to the following<|eot_id|><|start_header_id|>user<|end_header_id|>" + text + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>";

    auto sequences = OgaSequences::Create();
    tokenizer->Encode(prompt.c_str(), *sequences);

    std::cout << "Generating response..." << std::endl;
    auto params = OgaGeneratorParams::Create(*model);
    params->SetSearchOption("max_length", 1024);
    params->SetSearchOptionBool("do_sample", true);
    params->SetInputSequences(*sequences);

    auto generator = OgaGenerator::Create(*model, *params);

    while (!generator->IsDone()) {
      generator->ComputeLogits();
      generator->GenerateNextToken();

      // Show usage of GetOutput
      std::unique_ptr<OgaTensor> output_logits = generator->GetOutput("logits");
Member:

Is the std::unique_ptr<OgaTensor> for clarity vs auto?

Contributor (author):

So auto would be simpler? I'll update it


      // Assuming output_logits.Type() is float as it's logits
Member:

Just fyi, the raw model output can be float16 if the model runs on cuda. Our internal "processed logits" are always float32

Contributor (author):

So is this correct, or not?
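
For illustration (not part of this diff), the assumption can be checked at run time: only cast to `float*` when the tensor reports float32, since the raw model output may be float16 on CUDA. A minimal sketch, assuming `OgaTensor::Type()` returns the `OgaElementType_*` values declared in `ort_genai_c.h`:

```cpp
#include <iostream>
#include "ort_genai.h"

// Hypothetical helper, not part of the PR: read the logits only when they
// really are float32; otherwise report that a conversion is needed.
static void PrintFirstLogit(OgaGenerator& generator) {
  auto output_logits = generator.GetOutput("logits");
  if (output_logits->Type() == OgaElementType_float32) {
    auto* logits = static_cast<const float*>(output_logits->Data());
    std::cout << "first logit: " << logits[0] << std::endl;
  } else {
    // e.g. OgaElementType_float16 when the model runs on CUDA:
    // convert to float32 before dereferencing.
    std::cout << "logits are not float32; convert before reading" << std::endl;
  }
}
```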

      // Assuming shape is 1 dimensional with shape[0] being the size
      auto logits = reinterpret_cast<float*>(output_logits->Data());

      // Print out the logits using the following snippet, if needed
      //auto shape = output_logits->Shape();
      //for (size_t i=0; i < shape[0]; i++)
      //  std::cout << logits[i] << " ";
      //std::cout << std::endl;

      const auto num_tokens = generator->GetSequenceCount(0);
      const auto new_token = generator->GetSequenceData(0)[num_tokens - 1];
      std::cout << tokenizer_stream->Decode(new_token) << std::flush;
    }

    for (int i = 0; i < 3; ++i)
      std::cout << std::endl;
  }
}


static void print_usage(int /*argc*/, char** argv) {
  std::cerr << "usage: " << argv[0] << " model_path" << std::endl;
}

int main(int argc, char** argv) {
  if (argc != 2) {
    print_usage(argc, argv);
    return -1;
  }

  // Responsible for cleaning up the library during shutdown
  OgaHandle handle;

  std::cout << "-------------" << std::endl;
  std::cout << "Run Llama" << std::endl;
  std::cout << "-------------" << std::endl;

#ifdef USE_CXX
Member:

What is the USE_CXX here for, given the whole file is C++?

std::cout << "C++ API" << std::endl;
CXX_API(argv[1]);
#endif

return 0;
}