Skip to content

Commit

Permalink
Add re-designed support for Phi-3 vision and Phi-3.5 vision (#882)
Browse files Browse the repository at this point in the history
### Description
This PR adds support for the re-designed export of Phi-3 vision and Phi-3.5
vision. The new design natively supports multiple images and moves the `select`
logic inside the ONNX models.

### Motivation and Context
With the re-designed export, some of the logic inside ONNX Runtime GenAI
is no longer needed as it is now inside the ONNX model. This allows
other models to more easily re-use the vision and embedding components
within ONNX Runtime GenAI.
  • Loading branch information
kunal-vaishnavi authored Sep 10, 2024
1 parent ad6a02f commit b49e3b1
Show file tree
Hide file tree
Showing 19 changed files with 861 additions and 709 deletions.
4 changes: 2 additions & 2 deletions examples/python/phi3v.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

import onnxruntime_genai as og


def _complete(text, state):
return (glob.glob(text + "*") + [None])[state]

Expand All @@ -29,9 +28,10 @@ def run(args: argparse.Namespace):
"Image Path (comma separated; leave empty if no image): "
).split(",")
]
image_paths = [image_path for image_path in image_paths if len(image_path)]
print(image_paths)

image = None
images = None
prompt = "<|user|>\n"
if len(image_paths) == 0:
print("No image provided")
Expand Down
10 changes: 4 additions & 6 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,6 @@ struct Inputs_Element : JSON::Element {
v_.position_ids = value;
} else if (name == "attention_mask") {
v_.attention_mask = value;
} else if (name == "seqlens_k") {
v_.seqlens_k = value;
} else if (name == "total_seq_len") {
v_.total_sequence_length = value;
} else if (name == "past_key_names") {
v_.past_key_names = value;
} else if (name == "past_value_names") {
Expand Down Expand Up @@ -248,8 +244,8 @@ struct VisionOutputs_Element : JSON::Element {
explicit VisionOutputs_Element(Config::Model::Vision::Outputs& v) : v_{v} {}

void OnString(std::string_view name, std::string_view value) override {
if (name == "visual_features") {
v_.visual_features = value;
if (name == "image_features") {
v_.image_features = value;
} else
throw JSON::unknown_value_error{};
}
Expand Down Expand Up @@ -312,6 +308,8 @@ struct EmbeddingInputs_Element : JSON::Element {
// Stores a string-valued JSON entry into the matching field of v_.
// Recognised keys are "input_ids" and "image_features"; any other key is
// rejected with JSON::unknown_value_error.
void OnString(std::string_view name, std::string_view value) override {
  if (name == "input_ids") {
    v_.input_ids = value;
    return;
  }

  if (name == "image_features") {
    v_.image_features = value;
    return;
  }

  throw JSON::unknown_value_error{};
}
Expand Down
6 changes: 3 additions & 3 deletions src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ struct Config {
static constexpr std::string_view InputIdsName = "input_ids";
static constexpr std::string_view PixelValuesName = "pixel_values";
static constexpr std::string_view ImageSizesName = "image_sizes";
static constexpr std::string_view ImageFeaturesName = "image_features";
};

fs::path config_path; // Path of the config directory
Expand Down Expand Up @@ -62,6 +63,7 @@ struct Config {

// Configurable input names. Each field starts out as the corresponding
// Defaults constant.
struct Inputs {
// Initialised from Defaults::InputIdsName.
std::string input_ids{Defaults::InputIdsName};
// Initialised from Defaults::ImageFeaturesName.
std::string image_features{Defaults::ImageFeaturesName};
} inputs;

struct Outputs {
Expand All @@ -78,7 +80,7 @@ struct Config {
} inputs;

struct Outputs {
std::string visual_features{"visual_features"};
std::string image_features{Defaults::ImageFeaturesName};
} outputs;
} vision;

Expand All @@ -97,8 +99,6 @@ struct Config {
std::string embeddings{"inputs_embeds"};
std::string position_ids{"position_ids"};
std::string attention_mask{"attention_mask"};
std::string seqlens_k{"seqlens_k"};
std::string total_sequence_length{"total_seq_len"};
std::string past_key_names{"past_key_values.%d.key"}, past_value_names{"past_key_values.%d.value"};
std::string past_names; // When key/value pairs are combined
std::string cross_past_key_names, cross_past_value_names;
Expand Down
69 changes: 69 additions & 0 deletions src/models/image_features.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "../generators.h"
#include "model.h"
#include "image_features.h"

namespace Generators {

// Sets up the image features tensor description for one side of the
// vision-model -> embedding-model hand-off.
//
//   model             Provides session_info_ (element type lookup by name) and
//                     allocator_device_ (used to allocate the tensor).
//   state             State whose inputs_/outputs_ lists Add() registers into;
//                     its params_->hidden_size is the second tensor dimension.
//   mode              Whether `name` refers to a model input or a model output.
//   name              Name of the image features input/output.
//   num_image_tokens  First tensor dimension; 0 when no image is provided.
//
// Allocation policy:
//   * Output mode (vision model): the tensor is allocated up front, for both
//     num_image_tokens > 0 and num_image_tokens == 0 (an empty tensor), so the
//     model has somewhere to write. It is later handed to the embedding model.
//   * Input mode (embedding model): nothing is allocated here, because the
//     buffer created by the output-mode instance is reused as the input.
ImageFeatures::ImageFeatures(const Model& model, State& state, ImageFeatures::Mode mode, const std::string& name, int64_t num_image_tokens)
    : model_{model},
      state_{state},
      shape_{num_image_tokens, state_.params_->hidden_size},
      type_{mode != ImageFeatures::Mode::Input
                ? model_.session_info_->GetOutputDataType(name)
                : model_.session_info_->GetInputDataType(name)},
      mode_{mode},
      name_{name} {
  if (mode_ != ImageFeatures::Mode::Output) {
    // Input side: the tensor arrives later via ReuseImageFeaturesBuffer.
    return;
  }

  // Output side: pre-allocate storage for the model to fill in.
  image_features_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
}

// Registers this tensor's name and value with the owning state and records the
// slot it was given in index_.
void ImageFeatures::Add() {
  if (mode_ != ImageFeatures::Mode::Input) {
    // Output side: expose the pre-allocated tensor as a state output.
    index_ = state_.outputs_.size();
    state_.outputs_.push_back(image_features_.get());
    state_.output_names_.push_back(name_.c_str());
    return;
  }

  // Input side: there is no tensor yet, so a nullptr is pushed purely to
  // reserve the slot. ReuseImageFeaturesBuffer overwrites this entry with the
  // real tensor.
  index_ = state_.inputs_.size();
  state_.inputs_.push_back(nullptr);
  state_.input_names_.push_back(name_.c_str());
}

void ImageFeatures::Update() {
// Initialize empty image_features tensor for after-prompt input scenarios
// num_image_tokens will be 0 when no image is provided
if (shape_[0] > 0) { // if num_image_tokens > 0
shape_[0] = 0;
image_features_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
state_.inputs_[index_] = image_features_.get();
}
}

// Hands the tensor produced on the output side (`other`) to this input-side
// instance.
//
// Ownership of other's OrtValue moves into this object, and this state's
// reserved input slot is set to the pointer stored in other's output slot.
//
// Throws std::runtime_error if this instance is in Output mode or `other` is
// in Input mode.
void ImageFeatures::ReuseImageFeaturesBuffer(ImageFeatures& other) {
  const bool receiver_ok = mode_ != ImageFeatures::Mode::Output;
  const bool donor_ok = other.mode_ != ImageFeatures::Mode::Input;
  if (!(receiver_ok && donor_ok)) {
    throw std::runtime_error("Incorrect usage of the ImageFeatures inputs and outputs.");
  }

  // Take over the tensor, then wire the raw pointer still recorded in the
  // donor's output list into our input list.
  image_features_ = std::move(other.image_features_);
  state_.inputs_[index_] = other.state_.outputs_[other.index_];
}

} // namespace Generators
39 changes: 39 additions & 0 deletions src/models/image_features.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

namespace Generators {

// Manages the image features tensor exchanged between two models: it is
// created as an output on one state and reused as an input on another.
// Non-copyable.
struct ImageFeatures {
  // Whether the tensor is bound as a model input or a model output.
  enum class Mode {
    Input,
    Output,
  };

  // `name` is the input/output name; `num_image_tokens` is the first dimension
  // of the tensor shape.
  ImageFeatures(const Model& model, State& state, Mode mode, const std::string& name, int64_t num_image_tokens);

  ImageFeatures(const ImageFeatures&) = delete;
  ImageFeatures& operator=(const ImageFeatures&) = delete;

  // Registers the tensor and its name with the state's inputs or outputs.
  void Add();

  // Swaps in an empty tensor when the current one has num_image_tokens > 0.
  void Update();

  // Takes over the tensor held by `other` (an output-mode instance) as this
  // instance's input.
  void ReuseImageFeaturesBuffer(ImageFeatures& other);

  // Current tensor shape: [num_image_tokens, hidden_size].
  const std::array<int64_t, 2>& GetShape() const { return shape_; }

  // Raw pointer to the held tensor; may be null when no tensor is held.
  OrtValue* Get() { return image_features_.get(); }

 private:
  // NOTE: the constructor's initializer list relies on this declaration order
  // (state_ is read while initializing shape_).
  const Model& model_;
  State& state_;

  std::array<int64_t, 2> shape_{};  // [num_image_tokens, hidden_size]
  ONNXTensorElementDataType type_;  // element type looked up from the session by name

  const Mode mode_{};
  const std::string name_;

  std::unique_ptr<OrtValue> image_features_;
  size_t index_{~0U};  // slot in the state's inputs_/outputs_; ~0U until Add() assigns it
};

} // namespace Generators
Loading

0 comments on commit b49e3b1

Please sign in to comment.