Merge pull request #2 from uhh-cms/onnx_implementation
Onnx implementation
riga authored Mar 27, 2024
2 parents 8bf0846 + 947d623 commit 54ba404
Showing 15 changed files with 785 additions and 495 deletions.
323 changes: 132 additions & 191 deletions README.md

Large diffs are not rendered by default.

25 changes: 20 additions & 5 deletions cmssw/MLProf/RuntimeMeasurement/plugins/BuildFile.xml
@@ -1,6 +1,21 @@
<use name="FWCore/Framework" />
<use name="FWCore/PluginManager" />
<use name="FWCore/ParameterSet" />
<use name="PhysicsTools/TensorFlow" />
<library name="MLProfRuntimeMeasurementTFInference" file="TFInference.cc">
<use name="FWCore/Framework"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ParameterSet"/>

<flags EDM_PLUGIN="1" />
<use name="PhysicsTools/TensorFlow"/>
<use name="MLProf/Utils"/>

<flags EDM_PLUGIN="1"/>
</library>

<library name="MLProfRuntimeMeasurementONNXInference" file="ONNXInference.cc">
<use name="FWCore/Framework"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ParameterSet"/>

<use name="PhysicsTools/ONNXRuntime"/>
<use name="MLProf/Utils"/>

<flags EDM_PLUGIN="1"/>
</library>
196 changes: 196 additions & 0 deletions cmssw/MLProf/RuntimeMeasurement/plugins/ONNXInference.cc
@@ -0,0 +1,196 @@
/*
* Plugin to measure the inference runtime of an ONNX model.
*/

#include <chrono>
#include <fstream>
#include <iostream>
#include <list>
#include <memory>
#include <numeric>
#include <random>
#include <stdexcept>

#include "FWCore/Framework/interface/Event.h"
#include "FWCore/Framework/interface/Frameworkfwd.h"
#include "FWCore/Framework/interface/MakerMacros.h"
#include "FWCore/Framework/interface/stream/EDAnalyzer.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "PhysicsTools/ONNXRuntime/interface/ONNXRuntime.h"

#include "MLProf/Utils/interface/utils.h"

using namespace cms::Ort;

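// stream-based analyzer: one ONNXRuntime session is shared across all streams
// through edm::GlobalCache, while each stream fills its own input buffers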
class ONNXInference : public edm::stream::EDAnalyzer<edm::GlobalCache<ONNXRuntime>> {
public:
explicit ONNXInference(const edm::ParameterSet&, const ONNXRuntime*);
~ONNXInference(){};

static void fillDescriptions(edm::ConfigurationDescriptions&);
static std::unique_ptr<ONNXRuntime> initializeGlobalCache(const edm::ParameterSet&);
static void globalEndJob(const ONNXRuntime*);

private:
void beginJob();
void analyze(const edm::Event&, const edm::EventSetup&);
void endJob();

inline float drawNormal() { return normalPdf_(rndGen_); }

// parameters
std::vector<std::string> inputTensorNames_;
std::vector<std::string> outputTensorNames_;
std::string outputFile_;
std::string inputTypeStr_;
std::vector<int> inputRanks_;
std::vector<int> flatInputSizes_;
int batchSize_;
int nCalls_;

// other members
int nInputs_;
int nPreCalls_;
mlprof::InputType inputType_;
std::random_device rnd_;
std::default_random_engine rndGen_;
std::normal_distribution<float> normalPdf_;

std::vector<std::vector<int64_t>> input_shapes_;
FloatArrays inputArrays_; // each stream hosts its own data
};

void ONNXInference::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
// defining this function will lead to a *_cfi file being generated when
// compiling
edm::ParameterSetDescription desc;
// the path to the file containing the graph
desc.add<std::string>("graphPath");
// the names of the input tensors
desc.add<std::vector<std::string>>("inputTensorNames");
// the names of the output tensors
desc.add<std::vector<std::string>>("outputTensorNames");
// the name of the output csv file
desc.add<std::string>("outputFile");
// the type of input values, either "incremental", "zeros" or "random"
desc.add<std::string>("inputType", "random");
// the rank (number of dimensions) of each input tensor
desc.add<std::vector<int>>("inputRanks");
// flat list of sizes of each dimension of each input tensor
// (for a graph with a 1D and a 2D input tensor, this would be a vector of
// three values)
desc.add<std::vector<int>>("flatInputSizes");
// the batch size to test
desc.add<int>("batchSize");
// the number of calls to the graph to measure the runtime
desc.add<int>("nCalls");

descriptions.addWithDefaultLabel(desc);
}

ONNXInference::ONNXInference(const edm::ParameterSet& iConfig, const ONNXRuntime* cache)
: inputTensorNames_(iConfig.getParameter<std::vector<std::string>>("inputTensorNames")),
outputTensorNames_(iConfig.getParameter<std::vector<std::string>>("outputTensorNames")),
outputFile_(iConfig.getParameter<std::string>("outputFile")),
inputTypeStr_(iConfig.getParameter<std::string>("inputType")),
inputRanks_(iConfig.getParameter<std::vector<int>>("inputRanks")),
flatInputSizes_(iConfig.getParameter<std::vector<int>>("flatInputSizes")),
batchSize_(iConfig.getParameter<int>("batchSize")),
nCalls_(iConfig.getParameter<int>("nCalls")),
nInputs_(inputTensorNames_.size()),
nPreCalls_(10),
rndGen_(rnd_()),
normalPdf_(0.0, 1.0) {
// the number of input ranks must match the number of input tensors
if ((int)inputRanks_.size() != nInputs_) {
throw cms::Exception("InvalidInputRanks") << "number of input ranks must match number of input tensors";
}
// the input must be at least 1 dimensional
for (auto rank : inputRanks_) {
if (rank < 1) {
throw cms::Exception("InvalidRank") << "only ranks above 0 are supported, got " << rank;
}
}
// the sum of ranks must match the number of flat input sizes
if (std::accumulate(inputRanks_.begin(), inputRanks_.end(), 0) != (int)flatInputSizes_.size()) {
throw cms::Exception("InvalidFlatInputSizes")
<< "sum of input ranks must match number of flat input sizes, got " << flatInputSizes_.size();
}
// batch size must be positive
if (batchSize_ < 1) {
throw cms::Exception("InvalidBatchSize") << "batch size must be positive, got " << batchSize_;
}

// input sizes must be positive
for (auto size : flatInputSizes_) {
if (size < 1) {
throw cms::Exception("InvalidInputSize") << "input sizes must be positive, got " << size;
}
}
// check the input type
if (inputTypeStr_ == "incremental") {
inputType_ = mlprof::InputType::Incremental;
} else if (inputTypeStr_ == "random") {
inputType_ = mlprof::InputType::Random;
} else if (inputTypeStr_ == "zeros") {
inputType_ = mlprof::InputType::Zeros;
} else {
throw cms::Exception("InvalidInputType")
<< "input type must be either 'incremental', 'zeros' or 'random', got " << inputTypeStr_;
}

// initialize the input_shapes array with inputRanks_ and flatInputSizes_
int i = 0;
for (auto rank : inputRanks_) {
std::vector<int64_t> input_shape(flatInputSizes_.begin() + i, flatInputSizes_.begin() + i + rank);
input_shape.insert(input_shape.begin(), batchSize_);
input_shapes_.push_back(input_shape);
i += rank;
}
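// e.g. (hypothetical values) inputRanks = {1, 2}, flatInputSizes = {10, 4, 5}
// and batchSize = 8 yield input_shapes_ = {{8, 10}, {8, 4, 5}}
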
// initialize the input data arrays, one flat buffer per input tensor
// (FloatArrays is a vector<vector<float>>); each buffer holds the product
// of all dimension sizes of the input, including the batch dimension
for (const auto& shape : input_shapes_) {
int64_t size = 1;
for (int64_t dim : shape) size *= dim;
inputArrays_.emplace_back(size, 0);
}
}

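// create the ONNXRuntime session once per job from the graphPath parameter,
// resolved through edm::FileInPath; the session is shared by all streams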
std::unique_ptr<ONNXRuntime> ONNXInference::initializeGlobalCache(const edm::ParameterSet& iConfig) {
return std::make_unique<ONNXRuntime>(edm::FileInPath(iConfig.getParameter<std::string>("graphPath")).fullPath());
}

void ONNXInference::globalEndJob(const ONNXRuntime* cache) {}

void ONNXInference::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
for (int i = 0; i < nInputs_; i++) {
std::vector<float>& group_data = inputArrays_[i];
// fill this input's buffer according to the configured input type
for (int j = 0; j < (int)group_data.size(); j++) {
group_data[j] = inputType_ == mlprof::InputType::Incremental
? float(j)
: (inputType_ == mlprof::InputType::Zeros ? float(0) : drawNormal());
}
}

// run prediction and get outputs
std::vector<std::vector<float>> outputs;

// pre calls to "warm up"
for (int r = 0; r < nPreCalls_; r++) {
outputs = globalCache()->run(inputTensorNames_, inputArrays_, input_shapes_, outputTensorNames_, batchSize_);
}

// actual calls to measure runtimes
std::vector<float> runtimes;
for (int r = 0; r < nCalls_; r++) {
auto start = std::chrono::high_resolution_clock::now();
outputs = globalCache()->run(inputTensorNames_, inputArrays_, input_shapes_, outputTensorNames_, batchSize_);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<float> runtime_in_seconds = (end - start);
runtimes.push_back(runtime_in_seconds.count() * 1000);
}

// save them
mlprof::writeRuntimes(outputFile_, batchSize_, runtimes);
}

DEFINE_FWK_MODULE(ONNXInference);