Added some documentation + comments to the code

fpgasystems · Sep 6, 2024 · 9174c7b · 9174c7b
1 parent 8d73404
commit 9174c7b
Show file tree

Hide file tree

Showing 10 changed files with 474 additions and 135 deletions.
diff --git a/README.md b/README.md
@@ -62,6 +62,15 @@ $ mkdir build_hw && cd build_hw
 $ cmake <path_to_cmake_config> -DFDEV_NAME=<target_device>  -DEXAMPLE=<target_example>
 ~~~~
 
+It's a good practice to generate the hardware-build in a subfolder of the `examples_hw`, since this already contains the cmake that needs to be referenced. In this case, the procedure would look like this: 
+
+~~~~
+$ mkdir examples_hw/build_hw && cd examples_hw/build_hw 
+$ cmake ../ -DFDEV_NAME=<target_device>  -DEXAMPLE=<target_example>
+~~~~
+
+Already implemented target-examples are specified in `examples_hw/CMakeLists.txt` and allow to build a variety of interesting design constellations, i.e. `rdma_perf` will create a RDMA-capable Coyote-NIC. 
+
 Generate all projects and compile all bitstreams:
 
 ~~~~

diff --git a/examples_sw/apps/rdma_service/client/main.cpp b/examples_sw/apps/rdma_service/client/main.cpp
@@ -85,6 +85,7 @@ int main(int argc, char *argv[])
     // ARGS
     // -----------------------------------------------------------------------------------------------------------------------
 
+    // Generates the command-line printout and deals with reading in the user-defined arguments for running the experiments 
     boost::program_options::options_description programDescription("Options:");
     programDescription.add_options()
         ("bitstream,b", boost::program_options::value<string>(), "Shell bitstream")
@@ -101,6 +102,7 @@ int main(int argc, char *argv[])
     boost::program_options::store(boost::program_options::parse_command_line(argc, argv, programDescription), commandLineArgs);
     boost::program_options::notify(commandLineArgs);
 
+    // Set the default values to variables for further usage 
     string bstream_path = "";
     uint32_t cs_dev = defDevice; 
     uint32_t vfid = defTargetVfid;
@@ -111,6 +113,7 @@ int main(int argc, char *argv[])
     uint32_t n_reps_thr = defNRepsThr;
     uint32_t n_reps_lat = defNRepsLat;
 
+    // Read the actual arguments from the command line and parse them to variables for further usage, for setting the experiment correctly 
     if(commandLineArgs.count("bitstream") > 0) { 
         bstream_path = commandLineArgs["bitstream"].as<string>();
 
@@ -136,68 +139,108 @@ int main(int argc, char *argv[])
     // RDMA client side
     // -----------------------------------------------------------------------------------------------------------------------
 
-    // Get a thread ...
+    // Get a thread for execution: Has the vFPGA-ID, host-process-ID of this calling process, and device number
     cThread<int> cthread(defTargetVfid, getpid(), cs_dev);
+
+    // Get memory in the max size of the experiment. Argument is a cs_alloc-struct: Huge Page, max size, is remote 
+    // This operation attaches the buffer to the Thread, which is required for the cLib constructor for RDMA-capabilities
     cthread.getMem({CoyoteAlloc::HPF, max_size, true});
 
     // Connect to the RDMA server and run the task
+
+    // This instantiates the communication library cLib with the name of the socket, function-ID (?), the executing cthread, the target IP-address and the target port
+    // The constructor of the communication library also automatically does the meta-exchange of information in the beginning to connect the queue pairs from local and remote 
     cLib<int, bool, uint32_t, uint32_t, uint32_t, uint32_t> clib_rdma("/tmp/coyote-daemon-vfid-0-rdma", 
         fidRDMA, &cthread, tcp_ip.c_str(), defPort); 
 
+    // Execute the iTask -> That goes to cLib and from there probably to cFunc for scheduling of the execution of the cThread
     clib_rdma.iTask(opPriority, oper, min_size, max_size, n_reps_thr, n_reps_lat);
 
     // Benchmark the RDMA
 
     // SG entries
+
+    // Create a Scatter-Gather-Entry, save it in memory - size of the rdmaSg
+    // How is this sg-element connected to the thread-attached buffer? Should be the vaddr, shouldn't it? 
+    // There has to be a connection, since sg is handed over to the invoke-function, where the local_dest and offset is accessed 
     sgEntry sg;
     memset(&sg, 0, sizeof(rdmaSg));
-    sg.rdma.len = min_size; sg.rdma.local_stream = strmHost;
+
+    // Set properties of the Scatter-Gather-Entry: Min-Size (size to start the experiment with), Stream Host as origin of data to be used for the RDMA-experiment
+    sg.rdma.len = min_size; 
+    sg.rdma.local_stream = strmHost;
+
+    // Set the Coyote Operation, which can either be a REMOTE_WRITE or a REMOTE_READ, depending on the settings for the experiment 
     CoyoteOper coper = oper ? CoyoteOper::REMOTE_RDMA_WRITE : CoyoteOper::REMOTE_RDMA_READ;;
 
     PR_HEADER("RDMA BENCHMARK");
 
+    // Iterate over the experiment size (for incrementing size up to defined maximum)
     while(sg.rdma.len <= max_size) {
+
         // Sync
+        // Clear the registers that hold information about completed functions 
         cthread.clearCompleted();
+        // Initiate a sync between the remote nodes with handshaking via exchanged ACKs 
         cthread.connSync(true);
+        // Initialize a benchmark-object to precisely benchmark the RDMA-execution. Number of executions is set to 1 (no further repetitions on this level), no calibration required, no distribution required. 
         cBench bench(1);
 
+        // Lambda-function for throughput-benchmarking
         auto benchmark_thr = [&]() {
+            // For the desired number of repetitions per size, invoke the cThread-Function with the coyote-Operation 
             for(int i = 0; i < n_reps_thr; i++)
                 cthread.invoke(coper, &sg);
 
+            // Check the number of completed RDMA-transactions, wait until all operations have been completed. Check for stalling in-between. 
             while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < n_reps_thr) { 
+                // stalled is an atomic boolean used for event-handling (?) that would indicate a stalled operation
                 if( stalled.load() ) throw std::runtime_error("Stalled, SIGINT caught");
             }
-
         };  
+
+        // Execution of the throughput-lambda-function through the benchmarking-function to get timing
         bench.runtime(benchmark_thr);
+
+        // Generate the required output based on the statistical data from the benchmarking tool 
         std::cout << std::fixed << std::setprecision(2);
         std::cout << std::setw(8) << sg.rdma.len << " [bytes], thoughput: " 
                     << std::setw(8) << ((1 + oper) * ((1000 * sg.rdma.len ))) / ((bench.getAvg()) / n_reps_thr) << " [MB/s], latency: ";
 
-        // Sync
+        // Sync - reset the completion counter from the thread, sync-up via ACK-handshakes 
         cthread.clearCompleted();
         cthread.connSync(true); 
 
+        // Lambda-function for latency-benchmarking 
         auto benchmark_lat = [&]() {
+            // Different than before: Issue one single command via invoke, then wait for its completion (ping-pong-scheme)
+            // Repeated for the number of desired repetitions 
             for(int i = 0; i < n_reps_lat; i++) {
                 cthread.invoke(coper, &sg);
                 while(cthread.checkCompleted(CoyoteOper::LOCAL_WRITE) < i+1) { 
+                    // As long as the completion is not yet received, check for a possible stall-event 
                     if( stalled.load() ) throw std::runtime_error("Stalled, SIGINT caught");
                 }
             }
         };
+
+        // Execution of the latency-lambda-function through the benchmarking-function to get the timing right 
         bench.runtime(benchmark_lat);
+
+        // Generate the average time for the latency-test execution 
 	    std::cout << (bench.getAvg()) / (n_reps_lat * (1 + oper)) << " [ns]" << std::endl;
 
+        // Scale up the Scatter-Gather-length to get to the next step of the experiment 
         sg.rdma.len *= 2;
     }
 
+    // End the printout 
     std::cout << std::endl;
 
+    // Final connection sync via the thread-provided function
     cthread.connSync(true);
 
+    // Try to obtain the completion event at the end - probably has to do with the iTask at the beginning? 
     int ret_val = clib_rdma.iCmpl();
 
     return (ret_val);

diff --git a/sw/include/bFunc.hpp b/sw/include/bFunc.hpp
@@ -16,14 +16,26 @@ namespace fpga {
 /**
  * @brief User functions, base
  * 
+ * To be inherited by the cFunc-class 
+ * 
  */
 class bFunc {
 public:
-    //
+    // General notion: virtual functions are expected to be overwritten in derived classes 
+
+    // Function to register a client thread (also returns such a thread). Takes the following parameters as input: 
+    // connfd: connection file descriptor 
+    // vfid: vFPGA ID 
+    // rpid: remote process ID 
+    // dev: Device ID 
+    // cSched: Pointer to a scheduler 
+    // Pointer to a User Interrupt Service Routine 
     virtual bThread* registerClientThread(int connfd, int32_t vfid, pid_t rpid, uint32_t dev, cSched *csched, void (*uisr)(int) = nullptr) = 0;
-    //
+
+    // Virtual function to start 
     virtual void start() = 0;
 
+    // Virtual destructor of the bFunc 
     virtual ~bFunc() {}
 };
 

diff --git a/sw/include/bThread.hpp b/sw/include/bThread.hpp
@@ -51,18 +51,19 @@ constexpr auto cmd_fifo_thr = cmdFifoThr;
 class bThread {
 protected: 
 	/* Fpga device */
+	// Relevant IDs for describing the vFPGA / the interaction with it 
 	int32_t fd = { 0 };
-	int32_t vfid = { -1 };
-	int32_t ctid = { -1 };
+	int32_t vfid = { -1 }; // vFPGA ID, part of the QPN later on 
+	int32_t ctid = { -1 }; // Not sure where this ID comes from 
 	pid_t hpid = { 0 };
-	fCnfg fcnfg;
+	fCnfg fcnfg; // vFPGA Configuration 
 
     /* Thread */
-    thread c_thread;
+    thread c_thread; // Instance of the thread and the run-variable to check if this thread is running or not 
     bool run = { false };
 
     /* Remote */
-    std::unique_ptr<ibvQp> qpair;
+    std::unique_ptr<ibvQp> qpair; // Qpair for RDMA-operations based on this Thread 
     bool is_buff_attached;
 
     /* Connection */
@@ -73,12 +74,13 @@ class bThread {
     named_mutex plock; // User vFPGA lock
 
     /* Scheduler */
-    cSched *csched = { nullptr };
+    cSched *csched = { nullptr }; // Scheduler for the thread 
 
 	/* Used markers */
-	uint32_t cmd_cnt = { 0 };
+	uint32_t cmd_cnt = { 0 }; // Counter for issued commands via this thread 
 
     /* eventfd */
+	// Description of an Event with File Descriptor, terminator and its own thread (not sure what for)
 	int32_t efd = { -1 };
 	int32_t terminate_efd = { -1 };
 	std::thread event_thread;
@@ -87,20 +89,25 @@ class bThread {
 #ifdef EN_AVX
 	volatile __m256i *cnfg_reg_avx = { 0 };
 #endif
+	// Memory-mappings for configuration- and control-registers of Coyote 
 	volatile uint64_t *cnfg_reg = { 0 };
 	volatile uint64_t *ctrl_reg = { 0 };
 
 	/* Writeback */
+	// Not sure what's going on here with these
 	volatile uint32_t *wback = 0;
 
 	/* Mapped pages */
+	// All mapped pages 
 	std::unordered_map<void*, csAlloc> mapped_pages;
 
 	/* Utility */
+	// Functions for creating and ending the memory mapping of the FPGA
 	void mmapFpga();
 	void munmapFpga();
 
     /* Connection */
+	// Networking functions for syncing up 
     void sendAck(uint32_t ack);
     uint32_t readAck();
     void closeAck();
@@ -114,7 +121,11 @@ class bThread {
 	 * @brief Ctor, Dtor
 	 * 
 	 */
+
+	// Constructor-Call 
 	bThread(int32_t vfid, pid_t hpid, uint32_t dev, cSched *csched = nullptr, void (*uisr)(int) = nullptr);
+
+	// Destructor-Call 
 	~bThread();
 
     /**
@@ -127,7 +138,7 @@ class bThread {
 	void pUnlock();
 
     /**
-     * 
+     * Virtual function for starting the thread for runtime-polymorphism, function is re-implemented in cThread.hpp
     */
     virtual void start() = 0;
 
@@ -154,7 +165,9 @@ class bThread {
 	inline auto getCSR(uint32_t offs) { return ctrl_reg[offs]; }
 
 	/**
-	 * @brief Invoke a transfer
+	 * @brief Invoke a transfer of data 
+	 * coper - Coyote Operation (i.e. a LOCAL_WRITE or a REMOTE_RDMA_WRITE)
+	 * sgEntry - 
 	 * 
 	 * @param cs_invoke : Coyote invoke struct
 	 */

diff --git a/sw/include/cBench.hpp b/sw/include/cBench.hpp
@@ -28,20 +28,25 @@ using namespace std::chrono;
 
 /**
  * Exec times [ns]
+ * 
+ * Utility tool that allows to benchmark function execution with lots of added statistics: 
  */
 class cBench {
-    double avg_time = { 0.0 };
-    int num_runs = { 0 };
-    int num_runs_def = { 0 };
-    bool calibrate = { false };
-    bool distribute = { false };
+    // Variables for later usage 
+    double avg_time = { 0.0 }; // average time 
+    int num_runs = { 0 }; // number of (done) runs
+    int num_runs_def = { 0 }; // number of predefined runs 
+    bool calibrate = { false }; // Bool: Should we habe calibration runs? 
+    bool distribute = { false }; // Should we get a timing distribution from our measurement? 
 
     // Accummulated
-    std::vector<double> times;
+    std::vector<double> times; // Vector holds all times that are measured
 
-    void sortBench() { std::sort(times.begin(), times.end()); } 
+    void sortBench() { std::sort(times.begin(), times.end()); } // Function to sort time-values in the vector 
 
 public:
+
+    // Constructor: Define how many function runs the benchmarking suite should do, whether it should do a warm-up run for calibration first and whether it should display a distribution of results 
     cBench(int num_runs = kNumRunsDef, bool calibrate = kCalibrate, bool distribute = kDistribute) { 
         this->num_runs_def = num_runs; 
         this->calibrate = calibrate; 
@@ -50,13 +55,18 @@ class cBench {
 
     /**
      * Measure the function execution
+     * 
+     * Functional programming + variadic function: Function takes another function as argument +
+     * arbitrary number of other arguments. 
+     * 
+     * 
      */
     template <class Func, typename... Args>
     void runtime(Func const &func, Args... args) {
         times.clear();
 
 #ifdef EN_AVX    
-        // Warm-up
+        // Warm-up: Do some calibration runs in the first place 
         if (calibrate) {
             num_runs = 1;
             while (num_runs < (1 << 14)) {
@@ -80,19 +90,21 @@ class cBench {
 
         //DBG2("Number of bench runs: " << num_runs);
 
-        // Average time
+        // Average time - start timer, execute the function (which is given as argument) for the required number of times and stop timer afterwards 
         auto begin_time = std::chrono::high_resolution_clock::now();
         for (int i = 0; i < num_runs; ++i) {
             func(args...);
         }
         auto end_time = std::chrono::high_resolution_clock::now();
 
+        // Calculate time from start and end, divide by number of executions for average time 
         double time = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - begin_time).count();
         avg_time = time / num_runs;
 
-        // Latency distribution
+        // Latency distribution - get the time for every single repetition of the function call, store that intermediate timing result in the time vector and sort it 
         if(distribute) {       
             for (int i = 0; i < kNumRunsDist; ++i) {
+                // Take time and start the function in between 
                 begin_time = std::chrono::high_resolution_clock::now();
                     func(args...);
                 end_time = std::chrono::high_resolution_clock::now();
@@ -113,7 +125,7 @@ class cBench {
     // Average run time
     inline auto getAvg() { return avg_time; }
 
-    // Statistics
+    // Statistics - get percentile timings etc. 
     inline auto getMin() { if(!times.empty()) return times[0]; else return 0.0; }
     inline auto getMax() { if(!times.empty()) return times[times.size()-1]; else return 0.0; }
     inline auto getP25() { if(!times.empty()) return times[(times.size()/4)-1]; else return 0.0; }
@@ -122,7 +134,7 @@ class cBench {
     inline auto getP95() { if(!times.empty()) return times[((times.size()*95)/100)-1]; else return 0.0; }
     inline auto getP99() { if(!times.empty()) return times[((times.size()*99)/100)-1]; else return 0.0; }
 
-    // Print results
+    // Print results - advanced statistics are printed, including avg and percentiles 
     void printOut() {
         std::ios_base::fmtflags f(std::cout.flags());