Cherry Pick v4 (#801)
Cherry pick PRs and update version to 0.4.0
ajindal1 authored Aug 21, 2024
1 parent a61454c commit b77e768
Showing 26 changed files with 373 additions and 162 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/linux-cpu-arm64-build.yml
@@ -72,7 +72,7 @@ jobs:
--container-registry onnxruntimebuildcache \
--repository ort_genai_linux_arm64_gha
- - name: Doker -- Configure with CMake and GCC
+ - name: Docker -- Configure with CMake and GCC
run: |
docker run --rm \
--volume $GITHUB_WORKSPACE:/onnxruntime_src \
@@ -84,7 +84,7 @@
--volume $GITHUB_WORKSPACE:/onnxruntime_src \
-w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "/usr/bin/cmake --build --preset linux_gcc_cpu_release"
- - name: Dokcer -- check test directory
+ - name: Docker -- Check test directory
run: |
docker run --rm \
--volume $GITHUB_WORKSPACE:/onnxruntime_src \
4 changes: 3 additions & 1 deletion .github/workflows/linux-gpu-x64-build.yml
@@ -129,19 +129,21 @@ jobs:
docker run \
--gpus all \
--rm \
+ --volume /data/ortgenai_pytorch_models:/data/ortgenai_pytorch_models \
--volume $GITHUB_WORKSPACE:/ort_genai_src \
-e HF_TOKEN=$HF_TOKEN \
-w /ort_genai_src onnxruntimecudabuildx64 bash -c " \
${{ env.PYTHON_EXECUTABLE }} -m pip install -r test/python/requirements.txt --user && \
${{ env.PYTHON_EXECUTABLE }} -m pip install -r test/python/requirements-cuda.txt --user && \
${{ env.PYTHON_EXECUTABLE }} -m pip install /ort_genai_src/build/cuda/wheel/onnxruntime_genai*manylinux*.whl --user && \
- ${{ env.PYTHON_EXECUTABLE }} test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models"
+ ${{ env.PYTHON_EXECUTABLE }} test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models --e2e"
- name: Docker -- Run unit tests
run: |
echo "Running docker image onnxruntimecudabuildx64"
docker run \
--gpus all \
--rm \
+ --volume /data/ortgenai_pytorch_models:/data/ortgenai_pytorch_models \
--volume $GITHUB_WORKSPACE:/ort_genai_src \
-w /ort_genai_src onnxruntimecudabuildx64 bash -c "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ort_genai_src/build/cuda/ /ort_genai_src/build/cuda/test/unit_tests"
2 changes: 0 additions & 2 deletions .github/workflows/win-cpu-x64-build.yml
@@ -94,8 +94,6 @@ jobs:
run: |
python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
- name: Verify Build Artifacts
if: always()
continue-on-error: true
3 changes: 1 addition & 2 deletions .github/workflows/win-cuda-x64-build.yml
@@ -93,8 +93,7 @@ jobs:
- name: Run the Python Tests
run: |
- python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
+ python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
- name: Verify Build Artifacts
if: always()
21 changes: 11 additions & 10 deletions .pipelines/stages/jobs/nuget-validation-job.yml
@@ -116,15 +116,16 @@ jobs:
inputs:
version: '8.x'

- - template: steps/utils/download-huggingface-model.yml
- parameters:
- StepName: 'Download Model from HuggingFace'
- HuggingFaceRepo: 'microsoft/Phi-3-mini-4k-instruct-onnx'
- RepoFolder: $(prebuild_phi3_mini_model_folder)
- LocalFolder: 'models'
- WorkingDirectory: '$(Build.Repository.LocalPath)/examples/csharp/HelloPhi'
- HuggingFaceToken: $(HF_TOKEN)
- os: ${{ parameters.os }}
+ - ${{ if ne(parameters.arch, 'arm64') }}:
+ - template: steps/utils/download-huggingface-model.yml
+ parameters:
+ StepName: 'Download Model from HuggingFace'
+ HuggingFaceRepo: 'microsoft/Phi-3-mini-4k-instruct-onnx'
+ RepoFolder: $(prebuild_phi3_mini_model_folder)
+ LocalFolder: 'models'
+ WorkingDirectory: '$(Build.Repository.LocalPath)/examples/csharp/HelloPhi'
+ HuggingFaceToken: $(HF_TOKEN)
+ os: ${{ parameters.os }}

- template: steps/utils//flex-download-pipeline-artifact.yml
parameters:
@@ -134,7 +135,7 @@
SpecificArtifact: ${{ parameters.specificArtifact }}
BuildId: ${{ parameters.BuildId }}

- - ${{ if eq(parameters.os, 'win') }}:
+ - ${{ if and(eq(parameters.os, 'win'), ne(parameters.arch, 'arm64')) }}:
- ${{ if eq(parameters.ep, 'cuda') }}:
- powershell: |
$env:AZCOPY_MSI_CLIENT_ID = "63b63039-6328-442f-954b-5a64d124e5b4";
27 changes: 14 additions & 13 deletions .pipelines/stages/jobs/py-validation-job.yml
@@ -164,15 +164,16 @@ jobs:
SpecificArtifact: ${{ parameters.specificArtifact }}
BuildId: ${{ parameters.BuildId }}

- - template: steps/utils/download-huggingface-model.yml
- parameters:
- StepName: 'Download Model from HuggingFace'
- HuggingFaceRepo: 'microsoft/Phi-3-mini-4k-instruct-onnx'
- RepoFolder: $(prebuild_phi3_mini_model_folder)
- LocalFolder: 'models'
- WorkingDirectory: '$(Build.Repository.LocalPath)/examples/python'
- HuggingFaceToken: $(HF_TOKEN)
- os: ${{ parameters.os }}
+ - ${{ if ne(parameters.arch, 'arm64') }}:
+ - template: steps/utils/download-huggingface-model.yml
+ parameters:
+ StepName: 'Download Model from HuggingFace'
+ HuggingFaceRepo: 'microsoft/Phi-3-mini-4k-instruct-onnx'
+ RepoFolder: $(prebuild_phi3_mini_model_folder)
+ LocalFolder: 'models'
+ WorkingDirectory: '$(Build.Repository.LocalPath)/examples/python'
+ HuggingFaceToken: $(HF_TOKEN)
+ os: ${{ parameters.os }}

- ${{ if eq(parameters.os, 'linux') }}:
- ${{ if eq(parameters.ep, 'cuda') }}:
@@ -195,7 +196,7 @@
$python_exe -m pip install -r /ort_genai_src/test/python/requirements.txt && \
$python_exe -m pip install -r /ort_genai_src/test/python/requirements-cuda.txt && \
cd /ort_genai_src/examples/python && \
- $python_exe -m pip install --no-index --find-links=/ort_genai_binary/wheel $(pip_package_name) && \
+ $python_exe -m pip install --find-links=/ort_genai_binary/wheel $(pip_package_name) && \
$python_exe model-generate.py -m ./models/$(prebuild_phi3_mini_model_folder) --min_length 25 --max_length 50 --verbose"
displayName: 'Run Example With Artifact'
@@ -206,12 +207,12 @@
python -m pip install -r test/python/requirements.txt
python -m pip install -r test/python/requirements-cpu.txt
cd examples/python
- python -m pip install --no-index --find-links=$(Build.BinariesDirectory)/wheel $(pip_package_name)
+ python -m pip install --find-links=$(Build.BinariesDirectory)/wheel $(pip_package_name)
python model-generate.py -m ./models/$(prebuild_phi3_mini_model_folder) --min_length 25 --max_length 50 --verbose
displayName: 'Run Example With Artifact'
workingDirectory: '$(Build.Repository.LocalPath)'
- - ${{ if eq(parameters.os, 'win') }}:
+ - ${{ if and(eq(parameters.os, 'win'), ne(parameters.arch, 'arm64'), ne(parameters.ep, 'directml')) }}:
- ${{ if eq(parameters.ep, 'cuda') }}:
- powershell: |
$env:AZCOPY_MSI_CLIENT_ID = "63b63039-6328-442f-954b-5a64d124e5b4";
@@ -233,7 +234,7 @@
python -m pip install -r test/python/requirements-cpu.txt
}
cd examples\python
- python -m pip install --no-index --find-links=$(Build.BinariesDirectory)/wheel $(pip_package_name)
+ python -m pip install --find-links=$(Build.BinariesDirectory)/wheel $(pip_package_name)
python model-generate.py -m .\models\$(prebuild_phi3_mini_model_folder) --min_length 25 --max_length 50 --verbose
displayName: 'Run Example With Artifact'
2 changes: 1 addition & 1 deletion VERSION_INFO
@@ -1 +1 @@
- 0.4.0-rc1
+ 0.4.0
8 changes: 7 additions & 1 deletion cmake/global_variables.cmake
@@ -13,7 +13,13 @@ set(VERSION_INFO ${ver})
# VERSION_PATCH: 0
string(REPLACE "-" ";" VERSION_LIST ${VERSION_INFO})
list(GET VERSION_LIST 0 VERSION_STR)
- list(GET VERSION_LIST 1 VERSION_SUFFIX)
+ # Check if it is a stable or dev version
+ list(LENGTH VERSION_LIST VERSION_LIST_LENGTH)
+ if(VERSION_LIST_LENGTH GREATER 1)
+ list(GET VERSION_LIST 1 VERSION_SUFFIX)
+ else()
+ set(VERSION_SUFFIX "") # Set VERSION_SUFFIX to empty if stable version
+ endif()
string(REPLACE "." ";" VERSION_LIST ${VERSION_STR})
list(GET VERSION_LIST 0 VERSION_MAJOR)
list(GET VERSION_LIST 1 VERSION_MINOR)
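The guarded split makes the pre-release suffix optional: "0.4.0-rc1" still yields VERSION_STR=0.4.0 and VERSION_SUFFIX=rc1, while the new stable "0.4.0" now parses with an empty suffix instead of failing the list(GET ... 1 ...) call (Microsoft.ML.OnnxRuntimeGenAI.csproj later in this diff gets the same guard). A minimal C++ sketch of the equivalent split logic, for illustration only; it is not part of the commit:

```cpp
#include <iostream>
#include <string>

int main() {
  // Mirrors the CMake logic above: split the version string on '-';
  // the suffix is empty for stable versions such as "0.4.0".
  for (const std::string version : {"0.4.0-rc1", "0.4.0"}) {
    const auto dash = version.find('-');
    const std::string version_str = version.substr(0, dash);
    const std::string version_suffix =
        dash == std::string::npos ? "" : version.substr(dash + 1);
    std::cout << version_str << " suffix='" << version_suffix << "'\n";
  }
  return 0;
}
```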
6 changes: 3 additions & 3 deletions examples/csharp/HelloPhi/HelloPhi.csproj
@@ -9,9 +9,9 @@
</PropertyGroup>

<ItemGroup>
- <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="[0.4.0-rc1]" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
- <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="[0.4.0-rc1]" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
- <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="[0.4.0-rc1]" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+ <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="[0.4.0]" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+ <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="[0.4.0]" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+ <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="[0.4.0]" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
</ItemGroup>

<ItemGroup>
18 changes: 9 additions & 9 deletions examples/csharp/HelloPhi/Program.cs
@@ -5,18 +5,18 @@ void PrintUsage()
{
Console.WriteLine("Usage:");
Console.WriteLine(" -m model_path");
- Console.WriteLine(" -i (optional): Intereactive mode");
+ Console.WriteLine(" -i (optional): Interactive mode");
}

- OgaHandle ogaHandle = new OgaHandle();
+ using OgaHandle ogaHandle = new OgaHandle();

if (args.Length < 1)
{
PrintUsage();
Environment.Exit(-1);
}

- bool intereactive = false;
+ bool interactive = false;
string modelPath = string.Empty;

uint i = 0;
@@ -25,7 +25,7 @@ void PrintUsage()
var arg = args[i];
if (arg == "-i")
{
- intereactive = true;
+ interactive = true;
}
else if (arg == "-m")
{
@@ -47,13 +47,13 @@ void PrintUsage()
Console.WriteLine("-------------");

Console.WriteLine("Model path: " + modelPath);
- Console.WriteLine("Intereactive: " + intereactive);
+ Console.WriteLine("Interactive: " + interactive);

using Model model = new Model(modelPath);
using Tokenizer tokenizer = new Tokenizer(model);

var option = 2;
- if (intereactive)
+ if (interactive)
{
Console.WriteLine("Please enter option number:");
Console.WriteLine("1. Complete Output");
@@ -64,15 +64,15 @@ void PrintUsage()
do
{
string prompt = "def is_prime(num):"; // Example prompt
- if (intereactive)
+ if (interactive)
{
Console.WriteLine("Prompt:");
prompt = Console.ReadLine();
}
if (string.IsNullOrEmpty(prompt))
{
continue;
- }
+ }
var sequences = tokenizer.Encode($"<|user|>{prompt}<|end|><|assistant|>");

using GeneratorParams generatorParams = new GeneratorParams(model);
Expand All @@ -99,4 +99,4 @@ void PrintUsage()
}
Console.WriteLine();
}
- } while (intereactive);
+ } while (interactive);
3 changes: 3 additions & 0 deletions nuget/MANAGED_PACKAGE.md
@@ -0,0 +1,3 @@
+ ## About
+
+ This package is a dependency of [Microsoft.ML.OnnxRuntimeGenAI](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntimeGenAI) and does not need to be installed directly.
2 changes: 1 addition & 1 deletion nuget/Microsoft.ML.OnnxRuntimeGenAI.Managed.nuspec
@@ -19,7 +19,7 @@
</metadata>
<files>
<file src="..\LICENSE" target="LICENSE" />
- <file src="..\src\csharp\README.md" target="README.md" />
+ <file src="MANAGED_PACKAGE.md" target="README.md" />
<file src="..\ThirdPartyNotices.txt" target="ThirdPartyNotices.txt" />

<file src="..\src\csharp\bin\$configuration$\netstandard2.0\Microsoft.ML.OnnxRuntimeGenAI.dll" target="lib\netstandard2.0" />
File renamed without changes.
7 changes: 6 additions & 1 deletion src/csharp/Microsoft.ML.OnnxRuntimeGenAI.csproj
@@ -36,10 +36,15 @@
<Output TaskParameter="Lines" PropertyName="VersionInfoStr" />
</ReadLinesFromFile>

- <PropertyGroup>
+ <PropertyGroup Condition=" '$(VersionInfoStr.Contains(-))' == 'true' ">
<VersionInfo>$(VersionInfoStr.Split(-)[0])</VersionInfo>
<VersionSuffix>$(VersionInfoStr.Split(-)[1])</VersionSuffix>
</PropertyGroup>

+ <PropertyGroup Condition=" '$(VersionInfoStr.Contains(-))' == 'false' ">
+ <VersionInfo>$(VersionInfoStr)</VersionInfo>
+ <VersionSuffix></VersionSuffix>
+ </PropertyGroup>
</Target>

<Target Name="WriteAssemblyInfo" BeforeTargets="CoreCompile" DependsOnTargets="PrepareForBuild;ReadVersionFromFile">
24 changes: 23 additions & 1 deletion src/csharp/Utils.cs
@@ -7,11 +7,33 @@

namespace Microsoft.ML.OnnxRuntimeGenAI
{
- public class OgaHandle
+ public class OgaHandle: IDisposable
{
+ private bool _disposed = false;

public OgaHandle()
{
}

+ ~OgaHandle()
+ {
+ Dispose(false);
+ }
+
+ public void Dispose()
+ {
+ Dispose(true);
+ GC.SuppressFinalize(this);
+ }
+
+ protected virtual void Dispose(bool disposing)
+ {
+ if (_disposed)
+ {
+ return;
+ }
+ NativeMethods.OgaShutdown();
+ _disposed = true;
+ }
}

6 changes: 6 additions & 0 deletions src/ort_genai.h
@@ -232,6 +232,12 @@ struct OgaGenerator : OgaAbstract {
return OgaGenerator_GetSequenceData(this, index);
}

+ std::unique_ptr<OgaTensor> GetOutput(const char* name) {
+ OgaTensor* out;
+ OgaCheckResult(OgaGenerator_GetOutput(this, name, &out));
+ return std::unique_ptr<OgaTensor>(out);
+ }

#if __cplusplus >= 202002L
std::span<const int32_t> GetSequence(size_t index) const {
return {GetSequenceData(index), GetSequenceCount(index)};
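The new wrapper gives C++ callers access to any named model output. A minimal sketch of how it might be used with the 0.4.x generation loop (ComputeLogits/GenerateNextToken); the output name "logits" is model-dependent and the setup mirrors the repository's existing examples, so treat this as an assumption-laden illustration rather than a canonical usage:

```cpp
#include <memory>

#include "ort_genai.h"

// Assumes `model` and `params` were created as in the existing examples
// (OgaModel::Create, OgaGeneratorParams::Create, input ids already set).
void GenerateAndInspect(const OgaModel& model, const OgaGeneratorParams& params) {
  auto generator = OgaGenerator::Create(model, params);
  while (!generator->IsDone()) {
    generator->ComputeLogits();
    generator->GenerateNextToken();
    // New in this commit: fetch a named output. The returned OgaTensor is
    // a CPU-side copy (see the C implementation below), so it remains
    // valid after the next decode step.
    std::unique_ptr<OgaTensor> logits = generator->GetOutput("logits");
  }
}
```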
44 changes: 44 additions & 0 deletions src/ort_genai_c.cpp
@@ -208,6 +208,50 @@ OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator) {
OGA_CATCH
}

+ OgaResult* OGA_API_CALL OgaGenerator_GetOutput(const OgaGenerator* oga_generator, const char* name, OgaTensor** out) {
+ OGA_TRY
+ auto& generator = *reinterpret_cast<const Generators::Generator*>(oga_generator);
+ auto* ortvalue_output = generator.state_->GetOutput(name);
+ auto type_info = ortvalue_output->GetTensorTypeAndShapeInfo();
+ std::unique_ptr<OrtValue> ortvalue_clone = OrtValue::CreateTensor(generator.model_->allocator_cpu_,
+ type_info->GetShape(),
+ type_info->GetElementType());
+ // Copy data to ortvalue_clone
+ auto element_size = Generators::SizeOf(type_info->GetElementType());
+ auto data_size = type_info->GetElementCount() * element_size;
+ if (ortvalue_output->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_GPU && generator.model_->device_type_ == Generators::DeviceType::CUDA) {
+ #if USE_CUDA
+ cudaMemcpy(ortvalue_clone->GetTensorMutableRawData(), ortvalue_output->GetTensorMutableRawData(), data_size, cudaMemcpyDeviceToHost);
+ #endif
+ } else if (ortvalue_output->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_GPU && generator.model_->device_type_ == Generators::DeviceType::DML) {
+ #if USE_DML
+ ComPtr<ID3D12Resource> gpu_resource;
+ Ort::ThrowOnError(generator.model_->GetOrtDmlApi()->GetD3D12ResourceFromAllocation(
+ generator.model_->allocator_device_,
+ ortvalue_output->GetTensorMutableRawData(),
+ &gpu_resource));
+ auto cpu_tensor = ortvalue_clone->GetTensorMutableRawData();
+ generator.model_->GetDmlReadbackHeap()->ReadbackFromGpu(
+ std::span(reinterpret_cast<uint8_t*>(cpu_tensor), data_size),
+ gpu_resource.Get(),
+ 0,
+ D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+ #endif
+ } else if (ortvalue_output->GetTensorMemoryInfo().GetDeviceType() == OrtMemoryInfoDeviceType_CPU) {
+ std::copy(static_cast<uint8_t*>(ortvalue_output->GetTensorMutableRawData()),
+ static_cast<uint8_t*>(ortvalue_output->GetTensorMutableRawData()) + data_size,
+ static_cast<uint8_t*>(ortvalue_clone->GetTensorMutableRawData()));
+ } else {
+ throw std::runtime_error("Unsupported Device type: " + ortvalue_output->GetTensorMemoryInfo().GetDeviceType());
+ }
+
+ auto tensor = std::make_shared<Generators::Tensor>(std::move(ortvalue_clone));
+ tensor->external_owner_ = tensor;
+ *out = reinterpret_cast<OgaTensor*>(tensor.get());
+ return nullptr;
+ OGA_CATCH
+ }

size_t OGA_API_CALL OgaGenerator_GetSequenceCount(const OgaGenerator* oga_generator, size_t index) {
auto& generator = *reinterpret_cast<const Generators::Generator*>(oga_generator);
return generator.GetSequence(static_cast<int>(index)).GetCPU().size();
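Note the design: the output is always cloned into a CPU tensor (cudaMemcpy for CUDA, the DML readback heap for DirectML, std::copy for CPU), so the caller owns a stable snapshot rather than a view into device memory. A hedged sketch of calling the new entry point through the raw C API follows; the error and teardown helpers (OgaResultGetError, OgaDestroyResult, OgaDestroyTensor) are assumed to follow the existing ort_genai_c.h naming conventions and should be verified against the header:

```cpp
#include <cstdio>

#include "ort_genai_c.h"

// Assumes `generator` was created via the C API and has computed logits
// at least once, so the named output exists.
void PrintOutputStatus(OgaGenerator* generator) {
  OgaTensor* out = nullptr;
  OgaResult* result = OgaGenerator_GetOutput(generator, "logits", &out);
  if (result != nullptr) {
    std::printf("GetOutput failed: %s\n", OgaResultGetError(result));
    OgaDestroyResult(result);
    return;
  }
  // ... read the CPU-side copy through the OgaTensor accessors ...
  OgaDestroyTensor(out);
}
```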