Skip to content

Commit

Permalink
Added some documentation + comments to the code
Browse files Browse the repository at this point in the history
  • Loading branch information
Maximilian committed Sep 6, 2024
1 parent 8d73404 commit 9174c7b
Show file tree
Hide file tree
Showing 10 changed files with 474 additions and 135 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,15 @@ $ mkdir build_hw && cd build_hw
$ cmake <path_to_cmake_config> -DFDEV_NAME=<target_device> -DEXAMPLE=<target_example>
~~~~

It's a good practice to generate the hardware-build in a subfolder of the `examples_hw`, since this already contains the cmake that needs to be referenced. In this case, the procedure would look like this:

~~~~
$ mkdir examples_hw/build_hw && cd examples_hw/build_hw
$ cmake ../ -DFDEV_NAME=<target_device> -DEXAMPLE=<target_example>
~~~~

Already implemented target-examples are specified in `examples_hw/CMakeLists.txt` and allow to build a variety of interesting design constellations, i.e. `rdma_perf` will create a RDMA-capable Coyote-NIC.

Generate all projects and compile all bitstreams:

~~~~
Expand Down
51 changes: 47 additions & 4 deletions examples_sw/apps/rdma_service/client/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ int main(int argc, char *argv[])
// ARGS
// -----------------------------------------------------------------------------------------------------------------------

// Generates the command-line printout and deals with reading in the user-defined arguments for running the experiments
boost::program_options::options_description programDescription("Options:");
programDescription.add_options()
("bitstream,b", boost::program_options::value<string>(), "Shell bitstream")
Expand All @@ -101,6 +102,7 @@ int main(int argc, char *argv[])
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, programDescription), commandLineArgs);
boost::program_options::notify(commandLineArgs);

// Set the default values to variables for further usage
string bstream_path = "";
uint32_t cs_dev = defDevice;
uint32_t vfid = defTargetVfid;
Expand All @@ -111,6 +113,7 @@ int main(int argc, char *argv[])
uint32_t n_reps_thr = defNRepsThr;
uint32_t n_reps_lat = defNRepsLat;

// Read the actual arguments from the command line and parse them to variables for further usage, for setting the experiment correctly
if(commandLineArgs.count("bitstream") > 0) {
bstream_path = commandLineArgs["bitstream"].as<string>();

Expand All @@ -136,68 +139,108 @@ int main(int argc, char *argv[])
// RDMA client side
// -----------------------------------------------------------------------------------------------------------------------

// Get a thread ...
// Get a thread for execution: Has the vFPGA-ID, host-process-ID of this calling process, and device number
cThread<int> cthread(defTargetVfid, getpid(), cs_dev);

// Get memory in the max size of the experiment. Argument is a cs_alloc-struct: Huge Page, max size, is remote
// This operation attaches the buffer to the Thread, which is required for the cLib constructor for RDMA-capabilities
cthread.getMem({CoyoteAlloc::HPF, max_size, true});

// Connect to the RDMA server and run the task

// This instantiates the communication library cLib with the name of the socket, function-ID (?), the executing cthread, the target IP-address and the target port
// The constructor of the communication library also automatically does the meta-exchange of information in the beginning to connect the queue pairs from local and remote
cLib<int, bool, uint32_t, uint32_t, uint32_t, uint32_t> clib_rdma("/tmp/coyote-daemon-vfid-0-rdma",
fidRDMA, &cthread, tcp_ip.c_str(), defPort);

// Execute the iTask -> That goes to cLib and from there probably to cFunc for scheduling of the execution of the cThread
clib_rdma.iTask(opPriority, oper, min_size, max_size, n_reps_thr, n_reps_lat);

// Benchmark the RDMA

// SG entries

// Create a Scatter-Gather-Entry, save it in memory - size of the rdmaSg
// How is this sg-element connected to the thread-attached buffer? Should be the vaddr, shouldn't it?
// There has to be a connection, since sg is handed over to the invoke-function, where the local_dest and offset is accessed
sgEntry sg;
memset(&sg, 0, sizeof(rdmaSg));
sg.rdma.len = min_size; sg.rdma.local_stream = strmHost;

// Set properties of the Scatter-Gather-Entry: Min-Size (size to start the experiment with), Stream Host as origin of data to be used for the RDMA-experiment
sg.rdma.len = min_size;
sg.rdma.local_stream = strmHost;

// Set the Coyote Operation, which can either be a REMOTE_WRITE or a REMOTE_READ, depending on the settings for the experiment
CoyoteOper coper = oper ? CoyoteOper::REMOTE_RDMA_WRITE : CoyoteOper::REMOTE_RDMA_READ;;

PR_HEADER("RDMA BENCHMARK");

// Iterate over the experiment size (for incrementing size up to defined maximum)
while(sg.rdma.len <= max_size) {

// Sync
// Clear the registers that hold information about completed functions
cthread.clearCompleted();
// Initiate a sync between the remote nodes with handshaking via exchanged ACKs
cthread.connSync(true);
// Initialize a benchmark-object to precisely benchmark the RDMA-execution. Number of executions is set to 1 (no further repetitions on this level), no calibration required, no distribution required.
cBench bench(1);

// Lambda-function for throughput-benchmarking
auto benchmark_thr = [&]() {
// For the desired number of repetitions per size, invoke the cThread-Function with the coyote-Operation
for(int i = 0; i < n_reps_thr; i++)
cthread.invoke(coper, &sg);

// Check the number of completed RDMA-transactions, wait until all operations have been completed. Check for stalling in-between.
while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < n_reps_thr) {
// stalled is an atomic boolean used for event-handling (?) that would indicate a stalled operation
if( stalled.load() ) throw std::runtime_error("Stalled, SIGINT caught");
}

};

// Execution of the throughput-lambda-function through the benchmarking-function to get timing
bench.runtime(benchmark_thr);

// Generate the required output based on the statistical data from the benchmarking tool
std::cout << std::fixed << std::setprecision(2);
std::cout << std::setw(8) << sg.rdma.len << " [bytes], thoughput: "
<< std::setw(8) << ((1 + oper) * ((1000 * sg.rdma.len ))) / ((bench.getAvg()) / n_reps_thr) << " [MB/s], latency: ";

// Sync
// Sync - reset the completion counter from the thread, sync-up via ACK-handshakes
cthread.clearCompleted();
cthread.connSync(true);

// Lambda-function for latency-benchmarking
auto benchmark_lat = [&]() {
// Different than before: Issue one single command via invoke, then wait for its completion (ping-pong-scheme)
// Repeated for the number of desired repetitions
for(int i = 0; i < n_reps_lat; i++) {
cthread.invoke(coper, &sg);
while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) {
// As long as the completion is not yet received, check for a possible stall-event
if( stalled.load() ) throw std::runtime_error("Stalled, SIGINT caught");
}
}
};

// Execution of the latency-lambda-function through the benchmarking-function to get the timing right
bench.runtime(benchmark_lat);

// Generate the average time for the latency-test execution
std::cout << (bench.getAvg()) / (n_reps_lat * (1 + oper)) << " [ns]" << std::endl;

// Scale up the Scatter-Gather-length to get to the next step of the experiment
sg.rdma.len *= 2;
}

// End the printout
std::cout << std::endl;

// Final connection sync via the thread-provided function
cthread.connSync(true);

// Try to obtain the completion event at the end - probably has to do with the iTask at the beginning?
int ret_val = clib_rdma.iCmpl();

return (ret_val);
Expand Down
16 changes: 14 additions & 2 deletions sw/include/bFunc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,26 @@ namespace fpga {
/**
* @brief User functions, base
*
* To be inherited by the cFunc-class
*
*/
class bFunc {
public:
//
// General notion: virtual functions are expected to be overwritten in derived classes

// Function to register a client thread (also returns such a thread). Takes the following parameters as input:
// connfd: connection file descriptor
// vfid: vFPGA ID
// rpid: remote process ID
// dev: Device ID
// cSched: Pointer to a scheduler
// Pointer to a User Interrupt Service Routine
virtual bThread* registerClientThread(int connfd, int32_t vfid, pid_t rpid, uint32_t dev, cSched *csched, void (*uisr)(int) = nullptr) = 0;
//

// Virtual function to start
virtual void start() = 0;

// Virtual destructor of the bFunc
virtual ~bFunc() {}
};

Expand Down
31 changes: 22 additions & 9 deletions sw/include/bThread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,19 @@ constexpr auto cmd_fifo_thr = cmdFifoThr;
class bThread {
protected:
/* Fpga device */
// Relevant IDs for describing the vFPGA / the interaction with it
int32_t fd = { 0 };
int32_t vfid = { -1 };
int32_t ctid = { -1 };
int32_t vfid = { -1 }; // vFPGA ID, part of the QPN later on
int32_t ctid = { -1 }; // Not sure where this ID comes from
pid_t hpid = { 0 };
fCnfg fcnfg;
fCnfg fcnfg; // vFPGA Configuration

/* Thread */
thread c_thread;
thread c_thread; // Instance of the thread and the run-variable to check if this thread is running or not
bool run = { false };

/* Remote */
std::unique_ptr<ibvQp> qpair;
std::unique_ptr<ibvQp> qpair; // Qpair for RDMA-operations based on this Thread
bool is_buff_attached;

/* Connection */
Expand All @@ -73,12 +74,13 @@ class bThread {
named_mutex plock; // User vFPGA lock

/* Scheduler */
cSched *csched = { nullptr };
cSched *csched = { nullptr }; // Scheduler for the thread

/* Used markers */
uint32_t cmd_cnt = { 0 };
uint32_t cmd_cnt = { 0 }; // Counter for issued commands via this thread

/* eventfd */
// Description of an Event with File Descriptor, terminator and its own thread (not sure what for)
int32_t efd = { -1 };
int32_t terminate_efd = { -1 };
std::thread event_thread;
Expand All @@ -87,20 +89,25 @@ class bThread {
#ifdef EN_AVX
volatile __m256i *cnfg_reg_avx = { 0 };
#endif
// Memory-mappings for configuration- and control-registers of Coyote
volatile uint64_t *cnfg_reg = { 0 };
volatile uint64_t *ctrl_reg = { 0 };

/* Writeback */
// Not sure what's going on here with these
volatile uint32_t *wback = 0;

/* Mapped pages */
// All mapped pages
std::unordered_map<void*, csAlloc> mapped_pages;

/* Utility */
// Functions for creating and ending the memory mapping of the FPGA
void mmapFpga();
void munmapFpga();

/* Connection */
// Networking functions for syncing up
void sendAck(uint32_t ack);
uint32_t readAck();
void closeAck();
Expand All @@ -114,7 +121,11 @@ class bThread {
* @brief Ctor, Dtor
*
*/

// Constructor-Call
bThread(int32_t vfid, pid_t hpid, uint32_t dev, cSched *csched = nullptr, void (*uisr)(int) = nullptr);

// Destructor-Call
~bThread();

/**
Expand All @@ -127,7 +138,7 @@ class bThread {
void pUnlock();

/**
*
* Virtual function for starting the thread for runtime-polymorphism, function is re-implemented in cThread.hpp
*/
virtual void start() = 0;

Expand All @@ -154,7 +165,9 @@ class bThread {
inline auto getCSR(uint32_t offs) { return ctrl_reg[offs]; }

/**
* @brief Invoke a transfer
* @brief Invoke a transfer of data
* coper - Coyote Operation (i.e. a LOCAL_WRITE or a REMOTE_RDMA_WRITE)
* sgEntry -
*
* @param cs_invoke : Coyote invoke struct
*/
Expand Down
36 changes: 24 additions & 12 deletions sw/include/cBench.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,25 @@ using namespace std::chrono;

/**
* Exec times [ns]
*
* Utility tool that allows to benchmark function execution with lots of added statistics:
*/
class cBench {
double avg_time = { 0.0 };
int num_runs = { 0 };
int num_runs_def = { 0 };
bool calibrate = { false };
bool distribute = { false };
// Variables for later usage
double avg_time = { 0.0 }; // average time
int num_runs = { 0 }; // number of (done) runs
int num_runs_def = { 0 }; // number of predefined runs
bool calibrate = { false }; // Bool: Should we habe calibration runs?
bool distribute = { false }; // Should we get a timing distribution from our measurement?

// Accummulated
std::vector<double> times;
std::vector<double> times; // Vector holds all times that are measured

void sortBench() { std::sort(times.begin(), times.end()); }
void sortBench() { std::sort(times.begin(), times.end()); } // Function to sort time-values in the vector

public:

// Constructor: Define how many function runs the benchmarking suite should do, whether it should do a warm-up run for calibration first and whether it should display a distribution of results
cBench(int num_runs = kNumRunsDef, bool calibrate = kCalibrate, bool distribute = kDistribute) {
this->num_runs_def = num_runs;
this->calibrate = calibrate;
Expand All @@ -50,13 +55,18 @@ class cBench {

/**
* Measure the function execution
*
* Functional programming + variadic function: Function takes another function as argument +
* arbitrary number of other arguments.
*
*
*/
template <class Func, typename... Args>
void runtime(Func const &func, Args... args) {
times.clear();

#ifdef EN_AVX
// Warm-up
// Warm-up: Do some calibration runs in the first place
if (calibrate) {
num_runs = 1;
while (num_runs < (1 << 14)) {
Expand All @@ -80,19 +90,21 @@ class cBench {

//DBG2("Number of bench runs: " << num_runs);

// Average time
// Average time - start timer, execute the function (which is given as argument) for the required number of times and stop timer afterwards
auto begin_time = std::chrono::high_resolution_clock::now();
for (int i = 0; i < num_runs; ++i) {
func(args...);
}
auto end_time = std::chrono::high_resolution_clock::now();

// Calculate time from start and end, divide by number of executions for average time
double time = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - begin_time).count();
avg_time = time / num_runs;

// Latency distribution
// Latency distribution - get the time for every single repetition of the function call, store that intermediate timing result in the time vector and sort it
if(distribute) {
for (int i = 0; i < kNumRunsDist; ++i) {
// Take time and start the function in between
begin_time = std::chrono::high_resolution_clock::now();
func(args...);
end_time = std::chrono::high_resolution_clock::now();
Expand All @@ -113,7 +125,7 @@ class cBench {
// Average run time
inline auto getAvg() { return avg_time; }

// Statistics
// Statistics - get percentile timings etc.
inline auto getMin() { if(!times.empty()) return times[0]; else return 0.0; }
inline auto getMax() { if(!times.empty()) return times[times.size()-1]; else return 0.0; }
inline auto getP25() { if(!times.empty()) return times[(times.size()/4)-1]; else return 0.0; }
Expand All @@ -122,7 +134,7 @@ class cBench {
inline auto getP95() { if(!times.empty()) return times[((times.size()*95)/100)-1]; else return 0.0; }
inline auto getP99() { if(!times.empty()) return times[((times.size()*99)/100)-1]; else return 0.0; }

// Print results
// Print results - advanced statistics are printed, including avg and percentiles
void printOut() {
std::ios_base::fmtflags f(std::cout.flags());

Expand Down
Loading

0 comments on commit 9174c7b

Please sign in to comment.