Skip to content

Commit

Permalink
Add performance metrics into phi3 C example. (#928)
Browse files Browse the repository at this point in the history
  • Loading branch information
daijh authored Sep 26, 2024
1 parent 708e677 commit 7013224
Showing 1 changed file with 96 additions and 2 deletions.
98 changes: 96 additions & 2 deletions examples/c/src/phi3.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,68 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <cassert>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <string>
#include "ort_genai.h"

using Clock = std::chrono::high_resolution_clock;
using TimePoint = std::chrono::time_point<Clock>;
using Duration = std::chrono::duration<double>;

// `Timing` is a utility class for measuring performance metrics:
// time-to-first-token, prompt tokens/sec, and generated tokens/sec.
// Usage: RecordStartTimestamp() before generation, RecordFirstTokenTimestamp()
// when the first token arrives, RecordEndTimestamp() when done, then Log().
class Timing {
 public:
  Timing(const Timing&) = delete;
  Timing& operator=(const Timing&) = delete;

  Timing() = default;

  ~Timing() = default;

  // Must be called exactly once, before generation begins.
  void RecordStartTimestamp() {
    assert(start_timestamp_.time_since_epoch().count() == 0);
    start_timestamp_ = Clock::now();
  }

  // Must be called exactly once, when the first token is produced.
  void RecordFirstTokenTimestamp() {
    assert(first_token_timestamp_.time_since_epoch().count() == 0);
    first_token_timestamp_ = Clock::now();
  }

  // Must be called exactly once, after generation completes.
  void RecordEndTimestamp() {
    assert(end_timestamp_.time_since_epoch().count() == 0);
    end_timestamp_ = Clock::now();
  }

  // Prints the collected metrics to stdout.
  // `prompt_tokens_length`: number of tokens in the prompt.
  // `new_tokens_length`: number of tokens generated after the prompt.
  // All three timestamps must have been recorded first (asserted below).
  void Log(const int prompt_tokens_length, const int new_tokens_length) const {
    assert(start_timestamp_.time_since_epoch().count() != 0);
    assert(first_token_timestamp_.time_since_epoch().count() != 0);
    assert(end_timestamp_.time_since_epoch().count() != 0);

    const Duration prompt_time = first_token_timestamp_ - start_timestamp_;
    const Duration run_time = end_timestamp_ - first_token_timestamp_;

    // Save the complete stream state (format flags AND precision) so that
    // `std::fixed`/`std::showpoint` set below do not leak into all later
    // std::cout output. (Restoring only the precision, as the original code
    // did, leaves the stream permanently in fixed/showpoint mode.)
    const std::ios_base::fmtflags saved_flags{std::cout.flags()};
    const std::streamsize saved_precision{std::cout.precision()};

    std::cout << std::endl;
    std::cout << "-------------" << std::endl;
    std::cout << std::fixed << std::showpoint << std::setprecision(2)
              << "Prompt length: " << prompt_tokens_length << ", New tokens: " << new_tokens_length
              << ", Time to first: " << prompt_time.count() << "s"
              << ", Prompt tokens per second: " << prompt_tokens_length / prompt_time.count() << " tps"
              << ", New tokens per second: " << new_tokens_length / run_time.count() << " tps"
              << std::endl;
    std::cout << "-------------" << std::endl;

    // Restore the stream exactly as we found it.
    std::cout.flags(saved_flags);
    std::cout.precision(saved_precision);
  }

 private:
  // Default-constructed TimePoints sit at the epoch (count()==0), which the
  // asserts above use as the "not yet recorded" sentinel.
  TimePoint start_timestamp_;
  TimePoint first_token_timestamp_;
  TimePoint end_timestamp_;
};

// C++ API Example

void CXX_API(const char* model_path) {
Expand All @@ -16,11 +74,19 @@ void CXX_API(const char* model_path) {

while (true) {
std::string text;
std::cout << "Prompt: " << std::endl;
std::cout << "Prompt: (Use quit() to exit)" << std::endl;
std::getline(std::cin, text);

if (text == "quit()") {
break; // Exit the loop
}

const std::string prompt = "<|user|>\n" + text + "<|end|>\n<|assistant|>";

bool is_first_token = true;
Timing timing;
timing.RecordStartTimestamp();

auto sequences = OgaSequences::Create();
tokenizer->Encode(prompt.c_str(), *sequences);

Expand All @@ -35,6 +101,11 @@ void CXX_API(const char* model_path) {
generator->ComputeLogits();
generator->GenerateNextToken();

if (is_first_token) {
timing.RecordFirstTokenTimestamp();
is_first_token = false;
}

// Show usage of GetOutput
std::unique_ptr<OgaTensor> output_logits = generator->GetOutput("logits");

Expand All @@ -53,6 +124,11 @@ void CXX_API(const char* model_path) {
std::cout << tokenizer_stream->Decode(new_token) << std::flush;
}

timing.RecordEndTimestamp();
const int prompt_tokens_length = sequences->SequenceCount(0);
const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length;
timing.Log(prompt_tokens_length, new_tokens_length);

for (int i = 0; i < 3; ++i)
std::cout << std::endl;
}
Expand Down Expand Up @@ -82,11 +158,19 @@ void C_API(const char* model_path) {

while (true) {
std::string text;
std::cout << "Prompt: " << std::endl;
std::cout << "Prompt: (Use quit() to exit)" << std::endl;
std::getline(std::cin, text);

if (text == "quit()") {
break; // Exit the loop
}

const std::string prompt = "<|user|>\n" + text + "<|end|>\n<|assistant|>";

bool is_first_token = true;
Timing timing;
timing.RecordStartTimestamp();

OgaSequences* sequences;
CheckResult(OgaCreateSequences(&sequences));
CheckResult(OgaTokenizerEncode(tokenizer, prompt.c_str(), sequences));
Expand All @@ -104,13 +188,23 @@ void C_API(const char* model_path) {
CheckResult(OgaGenerator_ComputeLogits(generator));
CheckResult(OgaGenerator_GenerateNextToken(generator));

if (is_first_token) {
timing.RecordFirstTokenTimestamp();
is_first_token = false;
}

const int32_t num_tokens = OgaGenerator_GetSequenceCount(generator, 0);
int32_t new_token = OgaGenerator_GetSequenceData(generator, 0)[num_tokens - 1];
const char* new_token_string;
CheckResult(OgaTokenizerStreamDecode(tokenizer_stream, new_token, &new_token_string));
std::cout << new_token_string << std::flush;
}

timing.RecordEndTimestamp();
const int prompt_tokens_length = OgaSequencesGetSequenceCount(sequences, 0);
const int new_tokens_length = OgaGenerator_GetSequenceCount(generator, 0) - prompt_tokens_length;
timing.Log(prompt_tokens_length, new_tokens_length);

for (int i = 0; i < 3; ++i)
std::cout << std::endl;

Expand Down

0 comments on commit 7013224

Please sign in to comment.